#! /usr/bin/env python
"""Poll IMAP mailboxes for PDF attachments, OCR them and upload them.

Every section of the configuration file describes one mailbox: the section
name is used as the IMAP login, and its options (imap_server, ssl, password,
ged_base_url, ocrized_directory, default_type, default_directory, user)
drive the processing of that mailbox.
"""

import argparse
import ConfigParser
import email
import email.parser
import hashlib
import imaplib
import logging
import os
import os.path
import subprocess
import sys
import tempfile
import time

import requests  # only referenced by the disabled upload path kept below

logging.basicConfig(level=logging.DEBUG)

# Seconds to wait between polling rounds and after a failed connection.
POLL_INTERVAL = 30


def _say(message, stream=None):
    """Write one line of progress output (to stdout unless a stream is given)."""
    (stream or sys.stdout).write('%s\n' % message)


def _safe_filename(filename, payload):
    """Reduce an attachment name to a bare file name.

    The name comes from the e-mail and is untrusted: directory components are
    dropped so it cannot escape the target directory.  Attachments without a
    usable name get one derived from their content.
    """
    name = os.path.basename((filename or '').replace('\\', '/'))
    name = name.replace('\0', '')
    if name in ('', '.', '..'):
        name = 'attachment-%s.pdf' % hashlib.sha1(payload).hexdigest()
    return name


def _curl_quote(value):
    """Double-quote a value for use inside a curl -F field specification."""
    return '"%s"' % value.replace('\\', '\\\\').replace('"', '\\"')


def _run_ocr(payload, ocr_filename):
    """Run the OCR engine on payload, asking it to write ocr_filename.

    The exit status is not inspected; the caller checks whether the output
    file exists.  The temporary input file is always removed.
    """
    fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-')
    try:
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(payload)
        _say(' running OCR on file', sys.stderr)
        subprocess.call(['abbyyocr9', '-rl', 'French', '-if', tmpfilename,
                         '-f', 'PDF', '-pem', 'ImageOnText',
                         '-pfpr', '150', '-pfq', '100', '-of', ocr_filename])
    except (OSError, IOError) as e:
        # e.g. the OCR binary is missing or the temporary file is unwritable
        _say(' could not run OCR: %s' % e, sys.stderr)
    finally:
        os.remove(tmpfilename)


def process(cfg, filename, payload, enable_ocr=True):
    """Store (and optionally OCR) one PDF attachment, then upload it.

    cfg        -- dict of the mailbox options; reads ocrized_directory,
                  default_type, default_directory, user and ged_base_url.
    filename   -- attachment name taken from the mail (untrusted, may be None).
    payload    -- raw content of the PDF.
    enable_ocr -- when False the payload is stored as is, without OCR.

    A file of the same name already present in ocrized_directory is reused
    instead of being regenerated.

    Returns True when the upload command exits with status 0; False when no
    output file could be produced or the upload failed.
    """
    filename = _safe_filename(filename, payload)
    ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename)

    if not os.path.exists(ocr_filename):
        if enable_ocr:
            _run_ocr(payload, ocr_filename)
        else:
            _say(' skipping OCR phase', sys.stderr)
            with open(ocr_filename, 'wb') as fd:
                fd.write(payload)

    if not os.path.exists(ocr_filename):
        _say('failed to OCR %s' % filename, sys.stderr)
        # keep it for inspection
        with open(os.path.join('/tmp', filename), 'wb') as fd:
            fd.write(payload)
        return False

    _say(' uploading file')
    # Arguments are passed as a list (no shell) and the file names are quoted
    # for curl, so characters in the attachment name cannot alter the command.
    file_field = 'form.widgets.file=@%s;filename=%s;type=application/pdf' % (
        _curl_quote(ocr_filename), _curl_quote(filename))
    command = [
        'curl', '-v', '--insecure', '-X', 'POST',
        '--form', file_field,
        '-F', 'form.buttons.import=Import',
        '-F', 'form.widgets.portal_type=%s' % cfg.get('default_type'),
        '-F', 'form.widgets.location=%s' % cfg.get('default_directory'),
        '-F', 'form.widgets.owner=%s' % cfg.get('user'),
        # NOTE(review): credentials are hard-coded here.
        '-u', 'admin:admin',
        '%s/@@fileimport' % cfg.get('ged_base_url'),
    ]
    try:
        return subprocess.call(command) == 0
    except OSError as e:
        _say(' could not run curl: %s' % e, sys.stderr)
        return False


# Disabled alternative upload path (HTTP POST through `requests` instead of
# curl), kept for reference:
#
#    try:
#        r = requests.post(cfg.get('ged_base_url') + '/@@fileimport',
#                          auth=(cfg.get('ged_username'), cfg.get('ged_password')),
#                          verify=False,
#                          proxies={'https': 'http://172.23.3.30:3128'},
#                          files={'form.widgets.file': (filename, file(ocr_filename))},
#                          data={'form.buttons.import': 'Import',
#                                'form.widgets.portal_type': cfg.get('default_type'),
#                                'form.widgets.location': cfg.get('default_directory'),
#                                'form.widgets.owner': cfg.get('user'),
#                                })
#    except Exception, e:
#        print e
#        return False
#
#    if r.status_code != requests.codes.ok:
#        file('/tmp/error.html', 'w').write(r.text)
#    return (r.status_code == requests.codes.ok)


def _connect(imap_server, use_ssl):
    """Open a connection to imap_server, over SSL when use_ssl is true."""
    if use_ssl:
        return imaplib.IMAP4_SSL(host=imap_server)
    return imaplib.IMAP4(host=imap_server)


def _handle_message(M, num, section_cfg):
    """Fetch message num and process each of its PDF attachments.

    When an attachment fails, the message is flagged unseen again and its
    remaining attachments are skipped.
    """
    typ, fetched = M.fetch(num, '(RFC822)')
    if typ != 'OK' or not fetched or not isinstance(fetched[0], tuple):
        _say(' could not fetch message %s' % num, sys.stderr)
        return

    msg = email.parser.Parser().parsestr(fetched[0][1])
    # The Subject header may be absent, in which case msg['Subject'] is None.
    enable_ocr = 'disable_ocr' not in (msg['Subject'] or '')

    for part in msg.walk():
        if part.get_content_type() != 'application/pdf':
            continue
        filename = part.get_filename()
        _say(' handling %s' % filename)
        payload = part.get_payload(decode=True)
        if not process(section_cfg, filename, payload, enable_ocr):
            _say(' error -> marking as unseen')
            M.store(num, '-FLAGS', r'\Seen')
            break


def _poll_mailbox(cfg, section):
    """Process the unseen messages of the mailbox described by section."""
    _say('processing %s' % section)
    imap_server = cfg.get(section, 'imap_server')
    use_ssl = cfg.getboolean(section, 'ssl')
    # Value unused here; the lookup raises early when the option is missing.
    cfg.get(section, 'ged_base_url')

    try:
        M = _connect(imap_server, use_ssl)
    except Exception:
        _say('failed to connect to imap server')
        time.sleep(POLL_INTERVAL)
        return

    try:
        try:
            M.login(section, cfg.get(section, 'password'))
        except imaplib.IMAP4.error:
            _say('failed to log in as %s' % section, sys.stderr)
            return

        M.select()
        typ, found = M.search(None, '(NOT SEEN)')
        if typ == 'OK' and found and found[0]:
            section_cfg = dict(cfg.items(section))
            for num in found[0].split():
                _handle_message(M, num, section_cfg)
        M.close()
    finally:
        # Always release the connection, including on errors.
        try:
            M.logout()
        except Exception:
            pass  # connection already unusable; nothing left to release


def main():
    """Parse the command line, then poll every configured mailbox forever."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='ocrloader.ini')
    args = parser.parse_args()
    config_filepath = args.config

    while True:
        # Re-read the configuration on every round.
        cfg = ConfigParser.ConfigParser()
        cfg.read(config_filepath)
        for section in cfg.sections():
            _poll_mailbox(cfg, section)
        _say('waiting a bit')
        time.sleep(POLL_INTERVAL)


if __name__ == '__main__':
    main()