This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
pfwb-ged-proto/feed-pdfs.py

43 lines
1.2 KiB
Python

#! /usr/bin/env python
import sys
import os
import subprocess
from pysolr import Solr
# Cached directory listing.  It is only generated when missing, so later runs
# reuse it -- presumably so that entry indexes stay stable and an interrupted
# import can be resumed by index.
# NOTE(review): fixed, predictable path in /tmp; delete it to force a re-scan.
LISTING_CACHE = '/tmp/.importged-filenames'

# Default index at which a directory import starts; earlier entries of the
# listing are skipped.  NOTE(review): presumably left over from resuming an
# interrupted run -- pass a start index of 0 for a fresh import.
DEFAULT_START_INDEX = 877375

# Number of successfully added documents between two Solr commits.
COMMIT_EVERY = 100


def extract_text(filename):
    """Return the text of the PDF *filename* as unicode, or None on failure.

    The conversion is delegated to the external ``pdftotext`` program, asked
    to write to its standard output.  A failure to start the program or a
    non-zero exit status is reported on stdout and signalled by None.
    """
    cmd = ['pdftotext', filename, '-']
    try:
        proc = subprocess.Popen(cmd,
                                close_fds=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except OSError:
        print('E: error running pdftotext')
        return None
    # communicate() drains both pipes, so a chatty pdftotext cannot block.
    stdout, _ = proc.communicate()
    if proc.returncode != 0:
        print('E: error running pdftotext (rc:%d)' % proc.returncode)
        return None
    return stdout.decode('utf-8')


def feed(conn, filenames, start=0):
    """Add every ``.pdf`` entry of *filenames* to the Solr connection *conn*.

    *filenames* is any iterable of names (surrounding whitespace, such as the
    newline of a listing file, is stripped).  Entries whose position is lower
    than *start* are skipped, as are entries not ending in ``.pdf`` and PDFs
    that could not be converted.  Each document uses the file name as ``id``
    and the extracted text as ``text``.
    """
    pending = 0
    for i, filename in enumerate(filenames):
        if i < start:
            continue
        filename = filename.strip()
        if not filename.endswith('.pdf'):
            continue
        print('[%d] importing filename %s' % (i, filename))
        text = extract_text(filename)
        if text is None:
            continue
        conn.add([{'id': filename, 'text': text}])
        # Count added documents rather than testing the listing index: the
        # index also advances over skipped entries, which made the periodic
        # commit fire irregularly.
        pending += 1
        if pending >= COMMIT_EVERY:
            conn.commit()
            pending = 0
    if pending:
        # Commit the documents added since the last periodic commit.
        conn.commit()


def main(argv):
    """Import the PDF file or directory of PDFs named by ``argv[1]``.

    For a directory, the process changes into it and walks the cached
    listing, starting at ``argv[2]`` when given and at DEFAULT_START_INDEX
    otherwise.  For anything else, the argument is imported as a single file
    and the start index is ignored.
    """
    if len(argv) < 2:
        sys.exit('usage: %s DIRECTORY-OR-PDF [START-INDEX]' % argv[0])
    target = argv[1]
    start = DEFAULT_START_INDEX
    if len(argv) > 2:
        try:
            start = int(argv[2])
        except ValueError:
            sys.exit('E: start index must be an integer: %r' % argv[2])

    conn = Solr('http://127.0.0.1:8080/solr/')
    if os.path.isdir(target):
        os.chdir(target)
        if not os.path.exists(LISTING_CACHE):
            os.system('ls -1 > ' + LISTING_CACHE)
        # Stream the listing instead of loading it, and close it when done.
        with open(LISTING_CACHE) as listing:
            feed(conn, listing, start)
    else:
        # A lone file sits at index 0; applying the resume offset here would
        # skip it unconditionally.
        feed(conn, [target], 0)


if __name__ == '__main__':
    main(sys.argv)