summary | refs | log | tree | commit | diff | stats
path: root/ocrloader.py
diff options
context:
space:
mode:
author: Frédéric Péters <fpeters@entrouvert.com> 2013-08-14 11:21:32 (GMT)
committer: Frédéric Péters <fpeters@entrouvert.com> 2013-08-14 11:21:32 (GMT)
commit: 35bb78fbab82e9b85c4644196de5b85fd17be4e0 (patch)
tree: fb39ac5684db472e008de65092f77139aa502e90 /ocrloader.py
download: pfwbged.ocrloader-35bb78fbab82e9b85c4644196de5b85fd17be4e0.zip
pfwbged.ocrloader-35bb78fbab82e9b85c4644196de5b85fd17be4e0.tar.gz
pfwbged.ocrloader-35bb78fbab82e9b85c4644196de5b85fd17be4e0.tar.bz2
add script and config file as they were developed on the server
Diffstat (limited to 'ocrloader.py')
-rw-r--r-- ocrloader.py 117
1 files changed, 117 insertions, 0 deletions
diff --git a/ocrloader.py b/ocrloader.py
new file mode 100644
index 0000000..96dea41
--- /dev/null
+++ b/ocrloader.py
@@ -0,0 +1,117 @@
+#! /usr/bin/env python
+
+import ConfigParser
+import email
+import email.parser
+import imaplib
+import logging
+import os
+import os.path
+import subprocess
+import sys
+import tempfile
+import time
+
+import requests
+
+logging.basicConfig(level=logging.DEBUG)
+def process(cfg, filename, payload, enable_ocr=True):
+ ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename)
+ if not enable_ocr:
+ if not os.path.exists(ocr_filename):
+ print >> sys.stderr, ' skipping OCR phase'
+ fd = file(ocr_filename, 'w')
+ fd.write(payload)
+ fd.close()
+
+ if not os.path.exists(ocr_filename) and not enable_ocr:
+ fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-')
+ os.write(fd, payload)
+ os.close(fd)
+ print >> sys.stderr, ' running OCR on file'
+ os.system('abbyyocr9 -rl French -if %s -f PDF -pem ImageOnText -pfpr 150 -pfq 100 -of %s' % \
+ (tmpfilename, ocr_filename))
+ if not os.path.exists(ocr_filename):
+ print >> sys.stderr, 'failed to OCR %s' % filename
+ file('/tmp/' + filename, 'w').write(payload) # keep it for inspection
+ return False
+
+ print ' uploading file'
+ t = os.system('curl -v --insecure -X POST '\
+ '--form "form.widgets.file=@%s;filename=%s;type=application/pdf" '\
+ '-F "form.buttons.import=Import" '\
+ '-F "form.widgets.portal_type=%s" '\
+ '-F "form.widgets.location=%s" '\
+ '-F "form.widgets.owner=%s" '\
+ '-u admin:admin '\
+ '%s/@@fileimport' % (
+ ocr_filename, filename,
+ cfg.get('default_type'),
+ cfg.get('default_directory'),
+ cfg.get('user'),
+ cfg.get('ged_base_url')))
+ return (t == 0)
+
+
+# try:
+# r = requests.post(cfg.get('ged_base_url') + '/@@fileimport',
+# auth=(cfg.get('ged_username'), cfg.get('ged_password')),
+# verify=False,
+# proxies={'https': 'http://172.23.3.30:3128'},
+# files={'form.widgets.file': (filename, file(ocr_filename))},
+# data={'form.buttons.import': 'Import',
+# 'form.widgets.portal_type': cfg.get('default_type'),
+# 'form.widgets.location': cfg.get('default_directory'),
+# 'form.widgets.owner': cfg.get('user'),
+# })
+# except Exception, e:
+# print e
+# return False
+#
+# if r.status_code != requests.codes.ok:
+# file('/tmp/error.html', 'w').write(r.text)
+# return (r.status_code == requests.codes.ok)
+
+
+while True:
+ cfg = ConfigParser.ConfigParser()
+ cfg.read('ocrloader.ini')
+ for section in cfg.sections():
+ print 'processing', section
+ imap_server = cfg.get(section, 'imap_server')
+ ssl = cfg.getboolean(section, 'ssl')
+ ged_base_url = cfg.get(section, 'ged_base_url')
+
+ try:
+ if ssl:
+ M = imaplib.IMAP4_SSL(host=imap_server)
+ else:
+ M = imaplib.IMAP4(host_imap_server)
+ except:
+ print 'failed to connect to imap server'
+ time.sleep(30)
+ continue
+ try:
+ M.login(section, cfg.get(section, 'password'))
+ except imaplib.IMAP4.error:
+ continue
+ M.select()
+ typ, data = M.search(None, '(NOT SEEN)')
+ for num in data[0].split():
+ typ, data = M.fetch(num, '(RFC822)')
+ msg = email.parser.Parser().parsestr(data[0][1])
+ enable_ocr = True
+ if 'disable_ocr' in msg['Subject']:
+ enable_ocr = False
+ for part in msg.walk():
+ if part.get_content_type() == 'application/pdf':
+ filename = part.get_filename()
+ print ' handling', filename
+ payload = part.get_payload(decode=True)
+ if not process(dict(cfg.items(section)), filename, payload, enable_ocr):
+ print ' error -> marking as unseen'
+ M.store(num, '-FLAGS', r'\Seen')
+ break
+ M.close()
+ M.logout()
+
+ print 'waiting a bit'
+ time.sleep(30)