summaryrefslogtreecommitdiffstats
path: root/feed-pdfs.py
blob: d5bd31f533c1adca856e2f710cea3be7c2f5eba2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#! /usr/bin/env python

import sys
import os
import subprocess
from pysolr import Solr


# Feed the text of PDF files into a Solr index.
#
# Usage: feed-pdfs.py <pdf-file-or-directory> [start-index]
#
# * Given a directory, the script changes into it, lists it once into
#   LISTING_CACHE (reusing that file if it already exists) and imports every
#   entry ending in '.pdf', starting at listing index `start-index`.
# * Given a single file, that file is imported directly.
#
# Each document is stored with the file name as 'id' and the pdftotext output
# as 'text'.

# Listing index at which a directory import starts when no start-index is
# given.  The historical hard-coded value is kept as the default so existing
# invocations behave as before; pass 0 to import a directory from the top.
DEFAULT_START_INDEX = 877375

# Number of successfully added documents between two Solr commits.
COMMIT_BATCH_SIZE = 100

# Cached directory listing, one file name per line.
# NOTE(review): the cache path does not depend on the directory, so a stale
# cache left over from another directory would be reused -- confirm whether
# that is intended.
LISTING_CACHE = '/tmp/.importged-filenames'

USAGE = 'usage: feed-pdfs.py <pdf-file-or-directory> [start-index]'


def _iter_listing(path):
    """Yield the raw lines of the listing at *path*, closing the file after."""
    with open(path) as listing:
        for line in listing:
            yield line


def _pdf_to_text(filename):
    """Return the text pdftotext extracts from *filename*, or None on failure.

    Failures (pdftotext cannot be started, or exits with a non-zero status)
    are reported on stdout and signalled by returning None so the caller can
    move on to the next file.
    """
    cmd = ['pdftotext', filename, '-']
    try:
        p = subprocess.Popen(cmd,
                             close_fds=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    except OSError:
        print('E: error running pdftotext')
        return None
    stdout, _stderr = p.communicate()
    if p.returncode != 0:
        print('E: error running pdftotext (rc:%d)' % p.returncode)
        return None
    # Replace undecodable bytes instead of raising, so that one malformed PDF
    # cannot abort the whole import run.
    return stdout.decode('utf-8', 'replace')


if len(sys.argv) < 2:
    sys.exit(USAGE)
target = sys.argv[1]

if os.path.isdir(target):
    os.chdir(target)
    if not os.path.exists(LISTING_CACHE):
        os.system('ls -1 > %s' % LISTING_CACHE)
    filenames = _iter_listing(LISTING_CACHE)
    try:
        start_index = (int(sys.argv[2]) if len(sys.argv) > 2
                       else DEFAULT_START_INDEX)
    except ValueError:
        sys.exit(USAGE)
else:
    filenames = [target]
    # The resume offset only makes sense for a directory listing; applying it
    # here would skip the single file (index 0) every time.
    start_index = 0

conn = Solr('http://127.0.0.1:8080/solr/')

# Documents added since the last commit.  Counting additions (rather than
# testing the listing index) guarantees a commit per batch even when the
# entry at a batch boundary is skipped or fails to convert.
pending = 0
for i, filename in enumerate(filenames):
    if i < start_index:
        continue
    filename = filename.strip()
    if not filename.endswith('.pdf'):
        continue
    print('[%d] importing filename %s' % (i, filename))
    text = _pdf_to_text(filename)
    if text is None:
        continue
    conn.add([{'id': filename, 'text': text}])
    pending += 1
    if pending >= COMMIT_BATCH_SIZE:
        conn.commit()
        pending = 0

# Flush the final, partially filled batch.
if pending:
    conn.commit()