commit eced00aaf2f5a66af29a463a204c8b199ad12d51
Author: Frédéric Péters
Date:   Thu Sep 20 09:22:23 2012 +0200

    Basic tools to interact with solr

diff --git a/feed-pdfs.py b/feed-pdfs.py
new file mode 100644
index 0000000..d5bd31f
--- /dev/null
+++ b/feed-pdfs.py
@@ -0,0 +1,87 @@
+#! /usr/bin/env python
+"""Feed the text of PDF files into the local Solr index.
+
+Usage: feed-pdfs.py PATH [START]
+
+PATH is either a single PDF file or a directory.  Every entry ending in
+'.pdf' is converted with pdftotext and added to Solr as a document whose
+'id' is the file name and whose 'text' is the extracted text.
+
+START only applies to a directory: entries whose position in the listing
+is lower than START are skipped.  It defaults to DEFAULT_START.
+"""
+
+import sys
+import os
+import subprocess
+from pysolr import Solr
+
+
+# Cached directory listing.  It is only written when it does not exist yet,
+# so it has to be removed by hand before importing another directory.
+LISTING_CACHE = '/tmp/.importged-filenames'
+
+# Position of the first directory entry to import when START is not given.
+# NOTE(review): presumably the resume point of an interrupted bulk import;
+# pass START=0 to import a whole directory.
+DEFAULT_START = 877375
+
+# Listing positions are grouped by this many entries; an explicit commit is
+# sent after importing the last entry of a group.
+COMMIT_EVERY = 100
+
+if len(sys.argv) < 2:
+    sys.exit('usage: %s PATH [START]' % sys.argv[0])
+
+listing = None
+if os.path.isdir(sys.argv[1]):
+    os.chdir(sys.argv[1])
+    if not os.path.exists(LISTING_CACHE):
+        os.system('ls -1 > %s' % LISTING_CACHE)
+    listing = open(LISTING_CACHE)
+    filenames = listing
+    start = int(sys.argv[2]) if len(sys.argv) > 2 else DEFAULT_START
+else:
+    # A single file must never be skipped: its position is always 0, so the
+    # resume offset used to turn this mode into a no-op.
+    filenames = [sys.argv[1]]
+    start = 0
+
+conn = Solr('http://127.0.0.1:8080/solr/')
+
+uncommitted = False
+try:
+    for i, filename in enumerate(filenames):
+        if i < start:
+            continue
+        filename = filename.strip()
+        if not filename.endswith('.pdf'):
+            continue
+        print('[%d] importing filename %s' % (i, filename))
+        cmd = ['pdftotext', filename, '-']  # '-' sends the text to stdout
+        try:
+            p = subprocess.Popen(cmd,
+                    close_fds=True,
+                    stdin=subprocess.PIPE,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE)
+        except OSError:
+            # pdftotext could not be started at all
+            print('E: error running pdftotext')
+            continue
+        stdout, stderr = p.communicate()
+        if p.returncode != 0:
+            print('E: error running pdftotext (rc:%d)' % p.returncode)
+            continue
+        conn.add([{'id': filename, 'text': stdout.decode('utf-8')}])
+        uncommitted = True
+        if i % COMMIT_EVERY == COMMIT_EVERY - 1:
+            conn.commit()
+            uncommitted = False
+    if uncommitted:
+        # Documents added since the last periodic commit (or all of them
+        # for a short run) would otherwise never be committed explicitly.
+        conn.commit()
+finally:
+    if listing is not None:
+        listing.close()
diff --git a/search.py b/search.py
new file mode 100644
index 0000000..b1b263b
--- /dev/null
+++ b/search.py
@@ -0,0 +1,28 @@
+#! /usr/bin/env python
+"""Query the local Solr index and print the best matching documents.
+
+Usage: search.py QUERY
+"""
+
+import sys
+import os
+import subprocess
+from pysolr import Solr
+
+if len(sys.argv) < 2:
+    sys.exit('usage: %s QUERY' % sys.argv[0])
+
+conn = Solr('http://127.0.0.1:8080/solr/')
+query = sys.argv[1]
+if isinstance(query, bytes):
+    # Python 2 hands over arguments as byte strings; decode them as UTF-8.
+    query = query.decode('utf-8')
+
+# Ask for at most 20 documents, with their fields and relevance score.
+results = conn.search(query, rows=20, fl='* score')
+
+print('number of results: %s' % results.hits)
+print('qtime: %s' % results.qtime)
+
+for result in results:
+    print('%.2f - %s' % (result['score'], result['id']))