#! /usr/bin/env python import sys import os import subprocess from pysolr import Solr if os.path.isdir(sys.argv[1]): os.chdir(sys.argv[1]) if not os.path.exists('/tmp/.importged-filenames'): os.system('ls -1 > /tmp/.importged-filenames') filenames = open('/tmp/.importged-filenames') else: filenames = [sys.argv[1]] conn = Solr('http://127.0.0.1:8080/solr/') for i, filename in enumerate(filenames): if i < 877375: continue filename = filename.strip() if not filename.endswith('.pdf'): continue print '[%d] importing filename %s' % (i, filename) cmd = ['pdftotext', filename, '-'] try: p = subprocess.Popen(cmd, close_fds=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except OSError, e: print 'E: error running pdftotext' continue stdout, stderr = p.communicate() if p.returncode != 0: print 'E: error running pdftotext (rc:%d)' % p.returncode continue conn.add([{'id': filename, 'text': unicode(stdout, 'utf-8')}]) if i%100 == 99: conn.commit() #if i == 500000: break