From 35bb78fbab82e9b85c4644196de5b85fd17be4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20P=C3=A9ters?= Date: Wed, 14 Aug 2013 13:21:32 +0200 Subject: [PATCH] add script and config file as they were developed on the server --- ocrloader.ini | 19 ++++++++ ocrloader.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 ocrloader.ini create mode 100644 ocrloader.py diff --git a/ocrloader.ini b/ocrloader.ini new file mode 100644 index 0000000..4f865c4 --- /dev/null +++ b/ocrloader.ini @@ -0,0 +1,19 @@ +[DEFAULT] +imap_server = imap.pfwb.be +ssl = yes +ged_base_url = http://test.ged.pfwb.be +ged_username = admin +ged_password = xxxxx +ocrized_directory = /root/tests + +[xxxx-test-greffe@pfwb.be] +password = xxxxxx +default_type = dmsincomingmail +default_directory = documents +user = secretariat-greffe + +[xxxx-test-gaetandeberdt@pfwb.be] +password = xxxxxx +default_type = dmsdocument +default_directory = Members/gaetan +user = gaetan diff --git a/ocrloader.py b/ocrloader.py new file mode 100644 index 0000000..96dea41 --- /dev/null +++ b/ocrloader.py @@ -0,0 +1,117 @@ +#! 
def _run_command(argv):
    """Run *argv* without a shell and return its exit status.

    A command that cannot be started at all (for instance because the
    binary is not installed) is reported as status 127, so callers can
    treat it like any other non-zero exit.
    """
    import subprocess
    try:
        return subprocess.call(argv)
    except OSError as exc:
        sys.stderr.write('cannot run %s: %s\n' % (argv[0], exc))
        return 127


def process(cfg, filename, payload, enable_ocr=True):
    """Store (optionally OCR) a PDF attachment and upload it with curl.

    Parameters:
        cfg: mapping of settings; the keys read here are
            ``ocrized_directory``, ``default_type``, ``default_directory``,
            ``user``, ``ged_base_url`` and, optionally, ``ged_username`` /
            ``ged_password`` (both fall back to ``admin``).
        filename: attachment name as found in the e-mail.  Only its final
            path component is used, so a crafted name cannot escape the
            target directories.
        payload: raw content of the PDF.
        enable_ocr: when false, the payload is stored as-is instead of
            being run through ``abbyyocr9``.

    Returns:
        True when curl exits with status 0; False when the attachment has
        no usable name, when no output file could be produced, or when the
        upload command fails.
    """
    # The name comes from an e-mail and is therefore untrusted: keep only
    # the last path component and refuse empty/missing names.
    safe_name = os.path.basename(filename or '')
    if not safe_name:
        sys.stderr.write('attachment has no filename, skipping\n')
        return False

    ocr_filename = os.path.join(cfg.get('ocrized_directory'), safe_name)

    # An already existing output file is reused as-is.
    if not os.path.exists(ocr_filename):
        if enable_ocr:
            fd, tmpfilename = tempfile.mkstemp(suffix='.pdf',
                                               prefix='ocrloader-')
            try:
                with os.fdopen(fd, 'wb') as tmp:
                    tmp.write(payload)
                sys.stderr.write(' running OCR on file\n')
                # Argument list (no shell), so file names are never
                # interpreted by a shell.
                _run_command(['abbyyocr9', '-rl', 'French',
                              '-if', tmpfilename,
                              '-f', 'PDF', '-pem', 'ImageOnText',
                              '-pfpr', '150', '-pfq', '100',
                              '-of', ocr_filename])
            finally:
                # The OCR input is only needed while the command runs.
                os.remove(tmpfilename)
        else:
            sys.stderr.write(' skipping OCR phase\n')
            with open(ocr_filename, 'wb') as out:
                out.write(payload)

    if not os.path.exists(ocr_filename):
        sys.stderr.write('failed to OCR %s\n' % filename)
        # keep it for inspection
        with open(os.path.join('/tmp', safe_name), 'wb') as kept:
            kept.write(payload)
        return False

    print(' uploading file')
    credentials = '%s:%s' % (cfg.get('ged_username', 'admin'),
                             cfg.get('ged_password', 'admin'))
    status = _run_command([
        'curl', '-v', '--insecure', '-X', 'POST',
        '--form', 'form.widgets.file=@%s;filename=%s;type=application/pdf' % (
            ocr_filename, safe_name),
        '-F', 'form.buttons.import=Import',
        '-F', 'form.widgets.portal_type=%s' % cfg.get('default_type'),
        '-F', 'form.widgets.location=%s' % cfg.get('default_directory'),
        '-F', 'form.widgets.owner=%s' % cfg.get('user'),
        '-u', credentials,
        '%s/@@fileimport' % cfg.get('ged_base_url'),
    ])
    return status == 0
def connect(imap_server, use_ssl):
    """Open an IMAP connection to *imap_server*, over SSL if *use_ssl*."""
    if use_ssl:
        return imaplib.IMAP4_SSL(host=imap_server)
    return imaplib.IMAP4(host=imap_server)


def ocr_enabled_for(msg):
    """Return False when the subject of *msg* contains ``disable_ocr``.

    A message without a Subject header is treated as having an empty
    subject, so OCR stays enabled for it.
    """
    subject = msg['Subject'] or ''
    return 'disable_ocr' not in subject


def handle_message(mailbox, num, settings):
    """Fetch message *num* and pass each PDF attachment to ``process``.

    *settings* is the plain dict of options handed to ``process``.  When
    ``process`` reports a failure the ``\\Seen`` flag is removed from the
    message and its remaining parts are skipped.
    """
    typ, data = mailbox.fetch(num, '(RFC822)')
    if not data or not isinstance(data[0], tuple):
        # Nothing usable was returned for this message number.
        return
    msg = email.parser.Parser().parsestr(data[0][1])
    enable_ocr = ocr_enabled_for(msg)
    for part in msg.walk():
        if part.get_content_type() != 'application/pdf':
            continue
        filename = part.get_filename()
        print(' handling %s' % filename)
        payload = part.get_payload(decode=True)
        if not process(settings, filename, payload, enable_ocr):
            print(' error -> marking as unseen')
            mailbox.store(num, '-FLAGS', r'\Seen')
            break


def poll_mailbox(cfg, section):
    """Handle every unseen message of the mailbox described by *section*.

    The section name is used as the IMAP login; ``imap_server``, ``ssl``
    and ``password`` are read from the section.
    """
    print('processing %s' % section)
    imap_server = cfg.get(section, 'imap_server')
    use_ssl = cfg.getboolean(section, 'ssl')

    try:
        mailbox = connect(imap_server, use_ssl)
    except (EnvironmentError, imaplib.IMAP4.error) as exc:
        print('failed to connect to imap server')
        print(' %s' % exc)
        time.sleep(30)
        return

    try:
        try:
            mailbox.login(section, cfg.get(section, 'password'))
        except imaplib.IMAP4.error as exc:
            print('failed to log in as %s: %s' % (section, exc))
            return
        mailbox.select()
        typ, data = mailbox.search(None, '(NOT SEEN)')
        settings = dict(cfg.items(section))
        for num in (data[0] or '').split():
            handle_message(mailbox, num, settings)
        mailbox.close()
    finally:
        # Always release the connection, including after a failed login
        # or an error while handling a message.
        try:
            mailbox.logout()
        except (EnvironmentError, imaplib.IMAP4.error):
            pass


def main():
    """Poll every configured mailbox forever, pausing 30s between rounds.

    ``ocrloader.ini`` is re-read at the start of each round.
    """
    while True:
        cfg = ConfigParser.ConfigParser()
        cfg.read('ocrloader.ini')
        for section in cfg.sections():
            poll_mailbox(cfg, section)

        print('waiting a bit')
        time.sleep(30)


if __name__ == '__main__':
    main()