Basic tools to interact with Solr
This commit is contained in:
commit
eced00aaf2
|
@ -0,0 +1,42 @@
|
||||||
|
#! /usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
from pysolr import Solr
|
||||||
|
|
||||||
|
|
||||||
|
# Listing of the directory being imported, one name per line.  The path is
# fixed, so an existing file is reused whatever directory is given.
# NOTE(review): a listing left over from a different directory will be reused
# silently -- remove the file before importing another directory.
LISTING_CACHE = '/tmp/.importged-filenames'

# Number of successfully added documents between two Solr commits.
COMMIT_EVERY = 100


def _ensure_listing_cache():
    """Write the listing of the current directory to LISTING_CACHE if absent."""
    if os.path.exists(LISTING_CACHE):
        return
    try:
        with open(LISTING_CACHE, 'w') as cache:
            # List form: no shell is involved, the redirection is done here.
            subprocess.check_call(['ls', '-1'], stdout=cache)
    except (OSError, subprocess.CalledProcessError):
        # A truncated listing would be picked up by the next run; drop it.
        if os.path.exists(LISTING_CACHE):
            os.remove(LISTING_CACHE)
        raise


def iter_filenames(target):
    """Yield the candidate file names for *target*.

    If *target* is a directory, the process changes into it and the names
    come from LISTING_CACHE (created on first use), so relative names resolve
    against that directory.  Otherwise *target* itself is the only candidate.
    """
    if not os.path.isdir(target):
        yield target
        return
    os.chdir(target)
    _ensure_listing_cache()
    with open(LISTING_CACHE) as cache:
        for line in cache:
            yield line


def pdf_to_text(filename):
    """Run ``pdftotext`` on *filename* and return its standard output as text.

    Returns None (after printing an error line) when the program cannot be
    started or exits with a non-zero status.
    """
    cmd = ['pdftotext', filename, '-']
    try:
        proc = subprocess.Popen(cmd,
                                close_fds=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except OSError:
        print('E: error running pdftotext')
        return None
    stdout, _stderr = proc.communicate()
    if proc.returncode != 0:
        print('E: error running pdftotext (rc:%d)' % proc.returncode)
        return None
    # 'replace' keeps one badly encoded document from aborting the whole run.
    return stdout.decode('utf-8', 'replace')


def main(argv):
    """Index PDF files into Solr.

    argv[1] is a PDF file or a directory of PDF files.  argv[2] is an
    optional index of the first listing entry to process (default 0), which
    allows an interrupted directory import to be resumed.

    Returns the process exit status: 0 on completion, 2 on a usage error.
    """
    usage = 'usage: %s <pdf-file-or-directory> [start-index]\n' % argv[0]
    if len(argv) < 2:
        sys.stderr.write(usage)
        return 2
    try:
        start = int(argv[2]) if len(argv) > 2 else 0
    except ValueError:
        sys.stderr.write(usage)
        return 2

    conn = Solr('http://127.0.0.1:8080/solr/')
    pending = 0  # documents added since the last commit
    for i, filename in enumerate(iter_filenames(argv[1])):
        if i < start:
            continue
        filename = filename.strip()
        if not filename.endswith('.pdf'):
            continue
        print('[%d] importing filename %s' % (i, filename))
        text = pdf_to_text(filename)
        if text is None:
            continue
        conn.add([{'id': filename, 'text': text}])
        pending += 1
        # Count successful adds rather than listing positions, so skipped
        # entries cannot make a batch miss its commit.
        if pending >= COMMIT_EVERY:
            conn.commit()
            pending = 0
    if pending:
        # Flush the final, partial batch.
        conn.commit()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
|
@ -0,0 +1,18 @@
|
||||||
|
#! /usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
from pysolr import Solr
|
||||||
|
|
||||||
|
def main(argv):
    """Run a Solr query and print the hit count, query time and top results.

    argv[1] is the query string.  Up to 20 results are requested and each is
    printed as ``<score> - <id>``.

    Returns the process exit status: 0 on completion, 2 when the query
    argument is missing.
    """
    if len(argv) < 2:
        sys.stderr.write('usage: %s <query>\n' % argv[0])
        return 2

    query = argv[1]
    if isinstance(query, bytes):
        # Python 2 passes arguments as byte strings; decode them as UTF-8.
        query = query.decode('utf-8')

    conn = Solr('http://127.0.0.1:8080/solr/')
    # 'fl' asks for every stored field plus the relevance score.
    results = conn.search(query, rows=20, fl='* score')

    print('number of results: %s' % results.hits)
    print('qtime: %s' % results.qtime)

    for result in results:
        print('%.2f - %s' % (result['score'], result['id']))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
||||||
|
|
Reference in New Issue