43 lines
1.2 KiB
Python
43 lines
1.2 KiB
Python
#! /usr/bin/env python
|
|
|
|
import sys
|
|
import os
|
|
import subprocess
|
|
from pysolr import Solr
|
|
|
|
|
|
# Decide which files to import, based on the first command-line argument:
# a directory means "every entry listed for that directory", anything else
# is treated as the path of a single file.
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit('E: missing argument: path to a PDF file or a directory of PDFs')

if os.path.isdir(sys.argv[1]):
    # Work from inside the directory so the bare names produced by `ls`
    # can be used as paths directly.
    os.chdir(sys.argv[1])
    # The listing is written once and reused on later runs (presumably so
    # that positions in the listing stay the same between runs).
    # NOTE(review): the cache path does not depend on the directory, so a
    # listing left over from a different directory will be reused as-is;
    # delete /tmp/.importged-filenames by hand when switching directories.
    if not os.path.exists('/tmp/.importged-filenames'):
        os.system('ls -1 > /tmp/.importged-filenames')
    # readlines() splits exactly like iterating the file does, so entry
    # positions are unchanged; the handle is closed as soon as it is read.
    with open('/tmp/.importged-filenames') as listing:
        filenames = listing.readlines()
else:
    filenames = [sys.argv[1]]
|
|
|
|
# Extract the text of every PDF named in `filenames` with pdftotext and add
# it to the local Solr index, one document per file (id = file name).
conn = Solr('http://127.0.0.1:8080/solr/')

# Position in `filenames` at which to start importing; earlier entries are
# skipped. An optional second command-line argument overrides it (pass 0 to
# import everything, which is also required for single-file mode to import
# anything at all).
# NOTE(review): the default 877375 is the value that was hard-coded in the
# loop; it looks like a leftover resume point from an interrupted run --
# confirm whether it should become 0.
if len(sys.argv) > 2:
    try:
        start_index = int(sys.argv[2])
    except ValueError:
        sys.exit('E: start index must be an integer, got %r' % sys.argv[2])
else:
    start_index = 877375

# Number of documents added since the last explicit commit.
pending = 0

for i, filename in enumerate(filenames):
    if i < start_index:
        continue
    # Entries read from the listing file still carry their newline.
    filename = filename.strip()
    if not filename.endswith('.pdf'):
        continue
    print('[%d] importing filename %s' % (i, filename))
    # '-' as the output file makes pdftotext write the text to stdout.
    cmd = ['pdftotext', filename, '-']
    try:
        p = subprocess.Popen(cmd,
                             close_fds=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    except OSError:
        # The pdftotext executable could not be started.
        print('E: error running pdftotext')
        continue
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        print('E: error running pdftotext (rc:%d)' % p.returncode)
        continue
    # Decode leniently: one PDF producing invalid UTF-8 must not abort a
    # long batch run with a UnicodeDecodeError.
    text = stdout.decode('utf-8', 'replace')
    conn.add([{'id': filename, 'text': text}])
    # Count successful adds rather than testing the listing position, so
    # skipped or failed files can no longer cause a commit to be missed.
    pending += 1
    if pending >= 100:
        conn.commit()
        pending = 0

# Commit whatever was added after the last full batch of 100.
if pending:
    conn.commit()
|