#! /usr/bin/env python import ConfigParser import urllib import datetime import email import email.parser import imaplib import os import os.path import requests import sys import tempfile import time import shutil import optparse parser = optparse.OptionParser() parser.add_option('--config', dest='config', default='ocrloader.ini') (options, args) = parser.parse_args() config_filepath = options.config import logging logging.basicConfig(level=logging.DEBUG) def process(cfg, filename, payload, enable_ocr=True): ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename) if not enable_ocr: if not os.path.exists(ocr_filename): print >> sys.stderr, ' skipping OCR phase' fd = file(ocr_filename, 'w') fd.write(payload) fd.close() if not os.path.exists(ocr_filename) and enable_ocr: fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-') os.write(fd, payload) os.close(fd) print >> sys.stderr, ' running OCR on file' os.system('abbyyocr9 -rl French -if %s -f PDF -pem ImageOnText -pfpr 150 -pfq 100 -of %s' % \ (tmpfilename, ocr_filename)) if not os.path.exists(ocr_filename): print >> sys.stderr, 'failed to OCR %s' % filename file('/tmp/' + filename, 'w').write(payload) # keep it for inspection return False if cfg.get('store_path'): print ' storing file locally' shutil.copy(ocr_filename, os.path.join(cfg.get('store_path'), filename)) return True else: print ' uploading file' now = datetime.datetime.now() title = cfg.get('title') % {'date': now.strftime('%d/%m/%Y'), 'time': now.strftime('%H:%M')} t = os.system('curl -v --insecure -X POST '\ '--form "form.widgets.file=@%s;filename=%s;type=application/pdf" '\ '-F "form.buttons.import=Import" '\ '-F "form.widgets.portal_type=%s" '\ '-F "form.widgets.location=%s" '\ '-F "form.widgets.owner=%s" '\ '-F "form.widgets.treating_group=%s" '\ '-F "form.widgets.title=%s" '\ '-u admin:admin '\ '%s/@@fileimport' % ( ocr_filename, filename, cfg.get('default_type'), cfg.get('default_directory'), cfg.get('user'), cfg.get('treating_group'), urllib.quote(title), cfg.get('ged_base_url'))) return (t == 0) # try: # r = requests.post(cfg.get('ged_base_url') + '/@@fileimport', # auth=(cfg.get('ged_username'), cfg.get('ged_password')), # verify=False, # proxies={'https': 'http://172.23.3.30:3128'}, # files={'form.widgets.file': (filename, file(ocr_filename))}, # data={'form.buttons.import': 'Import', # 'form.widgets.portal_type': cfg.get('default_type'), # 'form.widgets.location': cfg.get('default_directory'), # 'form.widgets.owner': cfg.get('user'), # }) # except Exception, e: # print e # return False # # if r.status_code != requests.codes.ok: # file('/tmp/error.html', 'w').write(r.text) # return (r.status_code == requests.codes.ok) while True: cfg = ConfigParser.ConfigParser() cfg.read(config_filepath) for section in cfg.sections(): print 'processing', section imap_server = cfg.get(section, 'imap_server') ssl = cfg.getboolean(section, 'ssl') ged_base_url = cfg.get(section, 'ged_base_url') try: if ssl: M = imaplib.IMAP4_SSL(host=imap_server) else: M = imaplib.IMAP4(host_imap_server) except: print 'failed to connect to imap server' time.sleep(30) continue try: M.login(section, cfg.get(section, 'password')) except imaplib.IMAP4.error: continue M.select() typ, data = M.search(None, '(NOT SEEN)') for num in data[0].split(): typ, data = M.fetch(num, '(RFC822)') msg = email.parser.Parser().parsestr(data[0][1]) enable_ocr = True if 'disable_ocr' in msg['Subject']: enable_ocr = False for part in msg.walk(): if part.get_content_type() == 'application/pdf': filename = part.get_filename() print ' handling', filename payload = part.get_payload(decode=True) if not process(dict(cfg.items(section)), filename, payload, enable_ocr): print ' error -> marking as unseen' M.store(num, '-FLAGS', r'\Seen') break M.close() M.logout() print 'waiting a bit', time.strftime('%Y-%m-%d %H:%M:%S') time.sleep(30)