summaryrefslogtreecommitdiffstats
path: root/ocrloader.py
blob: 2120e94660e02dd680c221493bad15e2a485a500 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#! /usr/bin/env python

import ConfigParser
import email
import email.parser
import imaplib
import os
import os.path
import requests
import sys
import tempfile
import time

import argparse
import logging

# Command line handling: the only option is --config, the path of the
# configuration file; it defaults to 'ocrloader.ini'.
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='ocrloader.ini')
args = parser.parse_args()
config_filepath = args.config

# Let the root logger emit everything from DEBUG level upwards.
logging.basicConfig(level=logging.DEBUG)


def process(cfg, filename, payload, enable_ocr=True):
    """Store a PDF attachment (running OCR on it if requested) and upload it.

    The PDF ends up in ``cfg.get('ocrized_directory')`` under the base name of
    ``filename``: either as a verbatim copy of ``payload`` (``enable_ocr``
    false) or as the output of the ``abbyyocr9`` command line tool.  If a file
    with that name already exists there it is reused as is.  The file is then
    posted with ``curl`` to ``<ged_base_url>/@@fileimport``.

    Parameters:
        cfg: object with a dict-like ``get(key)`` giving ``ocrized_directory``,
            ``default_type``, ``default_directory``, ``user`` and
            ``ged_base_url``.
        filename: attachment name as found in the mail.  It is untrusted, so
            only its last path component is used.
        payload: raw content of the PDF (byte string).
        enable_ocr: when false, skip OCR and store ``payload`` unchanged.

    Returns:
        True when ``curl`` exits with status 0.  False when the attachment has
        no usable name, when OCR produced no output file, or when the upload
        command failed or could not be started.
    """
    # Local import: subprocess is not part of the file-level imports.
    import subprocess

    # Keep only the last path component so that a crafted attachment name
    # ("../../x", "/etc/x") cannot make us write outside the target directory.
    filename = os.path.basename(filename or '')
    if filename in ('', '.', '..'):
        sys.stderr.write('  attachment has no usable file name\n')
        return False

    ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename)

    if not os.path.exists(ocr_filename):
        if not enable_ocr:
            sys.stderr.write('  skipping OCR phase\n')
            with open(ocr_filename, 'wb') as out:
                out.write(payload)
        else:
            fd, tmpfilename = tempfile.mkstemp(suffix='.pdf',
                                               prefix='ocrloader-')
            try:
                with os.fdopen(fd, 'wb') as tmp:
                    tmp.write(payload)
                sys.stderr.write('  running OCR on file\n')
                try:
                    # Argument list, no shell: file names are passed verbatim.
                    subprocess.call([
                        'abbyyocr9', '-rl', 'French', '-if', tmpfilename,
                        '-f', 'PDF', '-pem', 'ImageOnText',
                        '-pfpr', '150', '-pfq', '100', '-of', ocr_filename])
                except OSError:
                    # The OCR tool could not be started; this is reported
                    # below because no output file will exist.
                    pass
            finally:
                # The temporary input is no longer needed once OCR has run.
                os.unlink(tmpfilename)

            if not os.path.exists(ocr_filename):
                sys.stderr.write('failed to OCR %s\n' % filename)
                # keep it for inspection
                with open('/tmp/' + filename, 'wb') as out:
                    out.write(payload)
                return False

    print('  uploading file')
    # NOTE(review): the upload credentials are hard-coded (admin:admin) and
    # certificate checking is disabled (--insecure), as in the original
    # command line.
    command = [
        'curl', '-v', '--insecure', '-X', 'POST',
        '--form', 'form.widgets.file=@%s;filename=%s;type=application/pdf' % (
            ocr_filename, filename),
        '-F', 'form.buttons.import=Import',
        '-F', 'form.widgets.portal_type=%s' % cfg.get('default_type'),
        '-F', 'form.widgets.location=%s' % cfg.get('default_directory'),
        '-F', 'form.widgets.owner=%s' % cfg.get('user'),
        '-u', 'admin:admin',
        '%s/@@fileimport' % cfg.get('ged_base_url'),
    ]
    try:
        return subprocess.call(command) == 0
    except OSError:
        # curl itself could not be started.
        sys.stderr.write('  failed to run curl\n')
        return False


#    try:
#        r = requests.post(cfg.get('ged_base_url') + '/@@fileimport',
#            auth=(cfg.get('ged_username'), cfg.get('ged_password')),
#            verify=False,
#            proxies={'https': 'http://172.23.3.30:3128'},
#            files={'form.widgets.file': (filename, file(ocr_filename))},
#            data={'form.buttons.import': 'Import',
#                  'form.widgets.portal_type': cfg.get('default_type'),
#                  'form.widgets.location': cfg.get('default_directory'),
#                  'form.widgets.owner': cfg.get('user'),
#                 })
#    except Exception, e:
#        print e
#        return False
#
#    if r.status_code != requests.codes.ok:
#        file('/tmp/error.html', 'w').write(r.text)
#    return (r.status_code == requests.codes.ok)


def _import_unseen(M, cfg, section):
    """Hand the first PDF attachment of each unseen message to process().

    ``M`` is a logged-in imaplib connection.  The default mailbox is
    selected, every message matching ``NOT SEEN`` is fetched, and the first
    ``application/pdf`` part found in it is passed to ``process`` together
    with the options of ``section``.  When ``process`` reports a failure the
    ``\\Seen`` flag is removed from the message again.
    """
    M.select()
    found = M.search(None, '(NOT SEEN)')[1]
    # Guard against a missing/empty search result.
    for num in (found[0] or '').split():
        fetched = M.fetch(num, '(RFC822)')[1]
        msg = email.parser.Parser().parsestr(fetched[0][1])
        # msg['Subject'] is None when the mail has no Subject header.
        enable_ocr = 'disable_ocr' not in (msg['Subject'] or '')
        for part in msg.walk():
            if part.get_content_type() != 'application/pdf':
                continue
            filename = part.get_filename()
            print('  handling %s' % (filename,))
            payload = part.get_payload(decode=True)
            if not process(dict(cfg.items(section)), filename, payload,
                           enable_ocr):
                print('  error -> marking as unseen')
                M.store(num, '-FLAGS', r'\Seen')
            # Only the first PDF part of a message is handled.
            break
    M.close()


# Poll every configured mailbox forever, pausing 30 seconds between passes.
while True:
    # The configuration file is read again on every pass.
    cfg = ConfigParser.ConfigParser()
    cfg.read(config_filepath)
    for section in cfg.sections():
        # The section name doubles as the IMAP login name.
        print('processing %s' % (section,))
        imap_server = cfg.get(section, 'imap_server')
        use_ssl = cfg.getboolean(section, 'ssl')
        # Value not used here; reading it makes a section that lacks the
        # option fail before any mail is fetched.
        cfg.get(section, 'ged_base_url')

        try:
            if use_ssl:
                M = imaplib.IMAP4_SSL(host=imap_server)
            else:
                M = imaplib.IMAP4(host=imap_server)
        except Exception:
            # Exception rather than a bare except, so that Ctrl-C and
            # SystemExit are not swallowed here.
            print('failed to connect to imap server')
            time.sleep(30)
            continue

        try:
            M.login(section, cfg.get(section, 'password'))
        except imaplib.IMAP4.error:
            print('failed to log in as %s' % (section,))
            M.logout()  # do not leave the connection open
            continue

        try:
            _import_unseen(M, cfg, section)
        finally:
            # Always release the connection, even if processing raised.
            M.logout()

    print('waiting a bit')
    time.sleep(30)