#! /usr/bin/env python
"""Poll drop directories and IMAP mailboxes for PDF files and import them.

The configuration file (``--config``, default ``ocrloader.ini``) is re-read
on every pass.  Each section is one source:

* a section whose name starts with ``/`` is a directory that is walked for
  dropped files;
* any other section name is used as the login of an IMAP mailbox whose
  unseen messages are scanned for ``application/pdf`` attachments.

Every file goes through :func:`process`, which optionally runs the ABBYY
command line OCR on it and then either copies the result to ``store_path``
or posts it with ``curl`` to ``<ged_base_url>/@@fileimport``.
"""

import ConfigParser
import datetime
import email
import email.parser
import imaplib
import logging
import logging.handlers
import optparse
import os
import os.path
import shutil
import socket
import string
import subprocess
import sys
import tempfile
import time
import urllib

import requests

parser = optparse.OptionParser()
parser.add_option('--config', dest='config', default='ocrloader.ini')
(options, args) = parser.parse_args()
config_filepath = options.config

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
# NOTE(review): basicConfig() above already attaches a stream handler to the
# root logger, so this second one emits every record a second time (without
# the timestamp format) -- confirm this is wanted.
logging.getLogger().addHandler(logging.StreamHandler())
# NOTE(review): the logging.handlers documentation states that rollover never
# occurs when backupCount is 0 (the default), so maxBytes has no effect as
# written -- confirm whether a backupCount was intended.
logging.getLogger().addHandler(logging.handlers.RotatingFileHandler(
    filename='/var/log/pfwbged.ocrloader.log', maxBytes=1000*1000
))

# Seconds to wait between two passes, and after a failed IMAP connection.
POLL_INTERVAL = 30


def _remove_if_empty(path):
    """Delete *path* when it exists and is zero bytes long."""
    if os.path.exists(path) and os.path.getsize(path) == 0:
        os.unlink(path)


def _run_ocr(payload, ocr_filename):
    """Run the ABBYY command line OCR on *payload*, writing *ocr_filename*.

    The payload is handed to the OCR binary through a temporary file that is
    removed afterwards.  Nothing is returned: the caller decides on success
    by looking at whether *ocr_filename* exists and is not empty.
    """
    fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-')
    try:
        try:
            os.write(fd, payload)
        finally:
            os.close(fd)
        logging.debug(' running OCR on file (%s / %s)',
                      tmpfilename, ocr_filename)
        cmd = ['/opt/ABBYYOCR9/abbyyocr9',
               '-pi', '-rl', 'French',
               '-if', tmpfilename,
               '-f', 'PDF', '-pem', 'ImageOnText',
               '-pfpr', '150', '-pfq', '100',
               '-of', ocr_filename]
        logging.debug(' %s', ' '.join(cmd))
        try:
            subprocess.call(cmd)
        except OSError:
            # e.g. the OCR binary is not installed; the caller then reports
            # the missing output file as a failed OCR.
            logging.exception('could not run the OCR command')
    finally:
        # the temporary input is of no use once the OCR command has returned
        os.unlink(tmpfilename)


def _upload(cfg, filename, ocr_filename):
    """POST *ocr_filename* to ``<ged_base_url>/@@fileimport`` using curl.

    Returns True when curl exits with status 0.
    """
    now = datetime.datetime.now()
    title = string.Template(cfg.get('title') or '').substitute(
        {'date': now.strftime('%d/%m/%Y'),
         'time': now.strftime('%H:%M:%S')})
    fields = [
        ('form.widgets.file',
         '@%s;filename=%s;type=application/pdf' % (ocr_filename, filename)),
        ('form.buttons.import', 'Import'),
        ('form.widgets.portal_type', cfg.get('default_type')),
        ('form.widgets.location', cfg.get('default_directory')),
        ('form.widgets.owner', cfg.get('user')),
        ('form.widgets.treating_groups',
         cfg.get('treating_groups') or cfg.get('treating_group') or ''),
        ('form.widgets.recipient_groups', cfg.get('recipient_groups') or ''),
        ('form.widgets.title', title),
        ('form.widgets.notification_recipients',
         cfg.get('notification_recipients', '')),
        ('form.widgets.keywords', cfg.get('keywords', '')),
        ('form.widgets.transitions_to_apply',
         cfg.get('transitions_to_apply', '')),
    ]
    # NOTE(review): --insecure disables certificate verification and -u puts
    # the credentials on the command line; left as found.
    cmd = ['curl', '-v', '--insecure', '-X', 'POST']
    for name, value in fields:
        cmd.extend(['-F', '%s=%s' % (name, value)])
    cmd.extend(['-u', '%s:%s' % (cfg.get('ged_username'),
                                 cfg.get('ged_password')),
                '%s/@@fileimport' % cfg.get('ged_base_url')])
    return subprocess.call(cmd) == 0

# Disabled alternative to the curl call above, based on requests:
#    try:
#        r = requests.post(cfg.get('ged_base_url') + '/@@fileimport',
#                auth=(cfg.get('ged_username'), cfg.get('ged_password')),
#                verify=False,
#                proxies={'https': 'http://172.23.3.30:3128'},
#                files={'form.widgets.file': (filename, file(ocr_filename))},
#                data={'form.buttons.import': 'Import',
#                      'form.widgets.portal_type': cfg.get('default_type'),
#                      'form.widgets.location': cfg.get('default_directory'),
#                      'form.widgets.owner': cfg.get('user'),
#                      })
#    except Exception, e:
#        print e
#        return False
#
#    if r.status_code != requests.codes.ok:
#        file('/tmp/error.html', 'w').write(r.text)
#    return (r.status_code == requests.codes.ok)


def process(cfg, filename, payload, enable_ocr=True):
    """Produce the final PDF for *payload* and store or upload it.

    :param cfg: dict of the options of one configuration section.
    :param filename: name given to the file in ``ocrized_directory``, in
        ``store_path`` and in the upload form.
    :param payload: raw content of the file.
    :param enable_ocr: when false the payload is written as is instead of
        being passed through the OCR command.
    :returns: True when the file was stored or uploaded, False otherwise.

    A file already present in ``ocrized_directory`` under the same name is
    reused rather than regenerated.  When no usable (non-empty) file could be
    produced, the payload is saved in ``/tmp`` for inspection.
    """
    ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename)

    if not os.path.exists(ocr_filename):
        if enable_ocr:
            _run_ocr(payload, ocr_filename)
        else:
            logging.debug(' skipping OCR phase')
            with open(ocr_filename, 'wb') as fd:
                fd.write(payload)

    # An empty result is as useless as a missing one: drop it so that it is
    # reported below and regenerated on the next attempt.
    _remove_if_empty(ocr_filename)
    if not os.path.exists(ocr_filename):
        logging.error('failed to OCR %s', filename)
        # keep it for inspection
        inspection_path = os.path.join('/tmp', os.path.basename(filename))
        with open(inspection_path, 'wb') as fd:
            fd.write(payload)
        return False

    if cfg.get('store_path'):
        logging.debug(' storing file locally')
        try:
            shutil.copy(ocr_filename,
                        os.path.join(cfg.get('store_path'), filename))
        except EnvironmentError:
            logging.exception('failed to store %s locally', filename)
            return False
        return True

    logging.debug(' uploading file')
    return _upload(cfg, filename, ocr_filename)


def _handle_directory(section, section_cfg):
    """Process every file found under the directory named *section*.

    Files ending in ``.uploaded`` are skipped.  After a successful
    :func:`process` the file is deleted when ``file_success_action`` is
    ``delete`` and renamed with an ``.uploaded`` suffix otherwise.
    """
    for basedir, dirnames, filenames in os.walk(section):
        # resolved directory with a trailing separator, so that a sibling
        # such as "/in-other" cannot pass for being inside "/in"
        real_basedir = os.path.join(os.path.realpath(basedir), '')
        for filename in filenames:
            if filename.endswith('.uploaded'):
                continue
            filepath = os.path.realpath(os.path.join(basedir, filename))
            if not filepath.startswith(real_basedir):
                # check the real path as an attacker could create a
                # symlink to whatever directory and cause total
                # destruction of it. (as well as the upload of its
                # contents to the GED...).
                logging.warning('wrong base dir for %s', filepath)
                continue
            try:
                with open(filepath, 'rb') as fd:
                    payload = fd.read()
            except EnvironmentError:
                logging.exception(' cannot read %s', filepath)
                continue
            logging.debug(' uploading file %s', filepath)
            enable_ocr = (not filename.startswith('no-ocr-') and
                          filename.endswith('.pdf'))
            if not process(section_cfg, filename, payload, enable_ocr):
                logging.error(' error processing %s', filepath)
                continue
            if section_cfg.get('file_success_action') == 'delete':
                os.unlink(filepath)
            else:
                os.rename(filepath, filepath + '.uploaded')


def _handle_message(conn, num, section_cfg):
    """Fetch message *num* and process its PDF attachments.

    Returns False when the connection failed while fetching, in which case
    the caller stops going through the mailbox; True otherwise.
    """
    try:
        typ, msg_data = conn.fetch(num, '(RFC822)')
    except socket.error:
        logging.error('failure talking to imap server')
        return False
    if typ != 'OK' or not msg_data or not isinstance(msg_data[0], tuple):
        logging.error('unexpected answer fetching message %s', num)
        return True

    msg = email.parser.Parser().parsestr(msg_data[0][1])
    # a message may come without any Subject header
    enable_ocr = 'disable_ocr' not in (msg['Subject'] or '')

    for part in msg.walk():
        if part.get_content_type() != 'application/pdf':
            continue
        # The name is chosen by the sender: keep only its last component so
        # it cannot point outside of the configured directories.
        filename = os.path.basename(part.get_filename() or '')
        if not filename:
            logging.error(' skipping PDF attachment without a file name')
            continue
        logging.info(' handling %s', filename)
        payload = part.get_payload(decode=True)
        if process(section_cfg, filename, payload, enable_ocr):
            continue
        logging.error(' error -> marking as unseen')
        try:
            conn.store(num, '-FLAGS', r'\Seen')
        except socket.error:
            logging.error('failure talking to imap server')
        # the message will be picked up again; ignore its remaining parts
        break
    return True


def _handle_mailbox(cfg, section, section_cfg):
    """Process the PDF attachments of the unseen messages of one mailbox.

    *section* is used as the IMAP login; ``imap_server``, ``ssl`` and
    ``password`` are read from that section of *cfg*.
    """
    imap_server = cfg.get(section, 'imap_server')
    use_ssl = cfg.getboolean(section, 'ssl')
    try:
        if use_ssl:
            conn = imaplib.IMAP4_SSL(host=imap_server)
        else:
            conn = imaplib.IMAP4(host=imap_server)
    except Exception:
        logging.error('failed to connect to imap server')
        time.sleep(POLL_INTERVAL)
        return

    mailbox_selected = False
    try:
        try:
            conn.login(section, cfg.get(section, 'password'))
        except imaplib.IMAP4.error:
            logging.error('failed to log in to imap server as %s', section)
            return
        try:
            conn.select()
            mailbox_selected = True
            typ, found = conn.search(None, '(NOT SEEN)')
        except (socket.error, imaplib.IMAP4.error):
            logging.error('failure talking to imap server')
            return
        if typ != 'OK' or not found or not found[0]:
            return
        for num in found[0].split():
            if not _handle_message(conn, num, section_cfg):
                break
    finally:
        # always release the connection, whichever way the pass ended
        try:
            if mailbox_selected:
                conn.close()
            conn.logout()
        except (socket.error, imaplib.IMAP4.error):
            logging.error('failure talking to imap server')


def main():
    """Process every configured source, sleep, and start over, forever."""
    while True:
        cfg = ConfigParser.ConfigParser()
        cfg.read(config_filepath)
        for section in cfg.sections():
            logging.debug('processing %s', section)
            # The value is not used here; the lookup is kept because it makes
            # a section lacking the option fail before any work is done.
            cfg.get(section, 'ged_base_url')
            section_cfg = dict(cfg.items(section))
            if section.startswith('/'):
                # handle dropped files
                _handle_directory(section, section_cfg)
            else:
                # handle imap mailboxes
                _handle_mailbox(cfg, section, section_cfg)

        logging.debug('waiting a bit %s', time.strftime('%Y-%m-%d %H:%M:%S'))
        time.sleep(POLL_INTERVAL)


if __name__ == '__main__':
    main()