Basic tools to interact with solr

This commit is contained in:
Frédéric Péters 2012-09-20 09:22:23 +02:00
commit eced00aaf2
2 changed files with 60 additions and 0 deletions

42
feed-pdfs.py Normal file
View File

@ -0,0 +1,42 @@
#! /usr/bin/env python
import sys
import os
import subprocess
from pysolr import Solr
# Cached directory listing.  Keeping the listing in a file gives every
# filename the same index from one run to the next, which is what makes
# resuming an interrupted import from a given index meaningful.
# NOTE(review): the cache path is fixed and not keyed by directory, so a stale
# listing is reused if the script is later pointed at another directory;
# remove the file by hand in that case.
LISTING_CACHE = '/tmp/.importged-filenames'

# Index a directory import resumes from when no start index is given on the
# command line.  This is the value that used to be hard-coded in the loop.
DEFAULT_START_INDEX = 877375

# Number of successfully added documents between two Solr commits.
COMMIT_EVERY = 100

SOLR_URL = 'http://127.0.0.1:8080/solr/'


def iter_directory(directory):
    """Yield the stripped entries of the cached listing of *directory*.

    Changes the current working directory to *directory* (so the yielded
    names can be opened as-is) and creates the listing cache with ``ls -1``
    if it does not exist yet.  The listing file is closed once the generator
    is exhausted.
    """
    os.chdir(directory)
    if not os.path.exists(LISTING_CACHE):
        os.system('ls -1 > /tmp/.importged-filenames')
    with open(LISTING_CACHE) as listing:
        for line in listing:
            yield line.strip()


def extract_text(filename):
    """Return the text of the PDF *filename*, or None on failure.

    Runs ``pdftotext <filename> -`` (the trailing ``-`` makes pdftotext write
    to standard output) and decodes the captured output as UTF-8.  Invalid
    byte sequences are replaced instead of raising, so that a single badly
    encoded document cannot abort a bulk import.  Failures are reported on
    standard output and signalled to the caller by returning None.
    """
    cmd = ['pdftotext', filename, '-']
    try:
        p = subprocess.Popen(cmd,
                             close_fds=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    except OSError:
        print('E: error running pdftotext')
        return None
    stdout, _ = p.communicate()
    if p.returncode != 0:
        print('E: error running pdftotext (rc:%d)' % p.returncode)
        return None
    return stdout.decode('utf-8', 'replace')


def main(argv):
    """Feed the text of PDF files to Solr; return the process exit status.

    Usage: ``feed-pdfs.py <directory-or-pdf> [start-index]``

    With a directory, every ``*.pdf`` entry of its (cached) listing whose
    index is at least *start-index* is imported; *start-index* defaults to
    DEFAULT_START_INDEX.  With a single file, that file is imported (the
    default start index is 0, otherwise the file would always be skipped).
    """
    if len(argv) < 2:
        sys.stderr.write('usage: %s <directory-or-pdf> [start-index]\n'
                         % argv[0])
        return 2
    target = argv[1]

    # Decide this before iter_directory() changes the working directory,
    # which would invalidate a relative *target*.
    is_directory = os.path.isdir(target)

    if len(argv) > 2:
        try:
            start = int(argv[2])
        except ValueError:
            sys.stderr.write('E: start index must be an integer\n')
            return 2
    elif is_directory:
        start = DEFAULT_START_INDEX
    else:
        start = 0

    if is_directory:
        filenames = iter_directory(target)
    else:
        filenames = [target]

    conn = Solr(SOLR_URL)
    pending = 0  # documents added since the last commit
    for i, filename in enumerate(filenames):
        if i < start:
            continue
        if not filename.endswith('.pdf'):
            continue
        print('[%d] importing filename %s' % (i, filename))
        text = extract_text(filename)
        if text is None:
            continue
        conn.add([{'id': filename, 'text': text}])
        pending += 1
        # Count added documents rather than testing the listing index, so a
        # skipped or failed file can never cause a batch commit to be missed.
        if pending >= COMMIT_EVERY:
            conn.commit()
            pending = 0
    # Commit the last, partial batch.
    if pending:
        conn.commit()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

18
search.py Normal file
View File

@ -0,0 +1,18 @@
#! /usr/bin/env python
import sys
import os
import subprocess
from pysolr import Solr
SOLR_URL = 'http://127.0.0.1:8080/solr/'


def main(argv):
    """Run a Solr query given on the command line and print the matches.

    Usage: ``search.py <query>``

    Prints the number of hits and the query time reported by Solr, then one
    ``score - id`` line per returned document (at most 20 rows are
    requested).  Returns the process exit status: 0 on success, 2 when the
    query argument is missing.
    """
    if len(argv) < 2:
        sys.stderr.write('usage: %s <query>\n' % argv[0])
        return 2

    query = argv[1]
    # Command-line arguments are byte strings on Python 2 and already text on
    # Python 3; only decode in the former case.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    conn = Solr(SOLR_URL)
    # 'fl' asks for every stored field plus the relevance score.
    results = conn.search(query, rows=20, fl='* score')
    print('number of results: %s' % (results.hits,))
    print('qtime: %s' % (results.qtime,))
    for result in results:
        print('%.2f - %s' % (result['score'], result['id']))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))