This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
pfwbged.ocrloader/ocrloader.py

205 lines
8.2 KiB
Python
Executable File

#! /usr/bin/env python
import ConfigParser
import urllib
import datetime
import email
import email.parser
import imaplib
import os
import os.path
import requests
import string
import subprocess
import sys
import tempfile
import time
import shutil
import socket
import optparse
# Command line handling: the only recognised option is --config, the path of
# the ini file to load; it defaults to 'ocrloader.ini' (relative to the
# current working directory).
parser = optparse.OptionParser()
parser.add_option(
    '--config',
    default='ocrloader.ini',
    dest='config',
)
options, args = parser.parse_args()
# Path of the ini file, read by the code further down in this script.
config_filepath = options.config
import logging
import logging.handlers
# Logging goes to two places: the console and a size-capped log file, both
# at DEBUG level and both prefixed with a timestamp.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s')
# basicConfig() has already attached a console StreamHandler to the root
# logger; adding a second one made every message appear twice on the
# console, so only the file handler is added here.
_file_handler = logging.handlers.RotatingFileHandler(
    filename='/var/log/pfwbged.ocrloader.log',
    maxBytes=1000*1000,
    # Rollover never happens while backupCount is 0 (the default), which
    # let the file grow without bound despite maxBytes being set.
    backupCount=5,
)
# Without an explicit formatter the file entries carried no timestamp.
_file_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
logging.getLogger().addHandler(_file_handler)
def _run_ocr(payload, ocr_filename):
    """Run the ABBYY OCR engine on *payload*, writing the result to *ocr_filename*.

    The payload is first saved to a temporary PDF which is handed to the
    engine; the temporary file is always removed afterwards.  The exit status
    of the engine is not inspected: the caller decides on success by looking
    at the output file.
    """
    fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-')
    try:
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(payload)
        logging.debug('  running OCR on file (%s / %s)' % (tmpfilename, ocr_filename))
        cmd = ['/opt/ABBYYOCR9/abbyyocr9', '-pi',
               '-rl', 'French', '-if', tmpfilename,
               '-f', 'PDF', '-pem', 'ImageOnText', '-pfpr', '150',
               '-pfq', '100', '-of', ocr_filename]
        logging.debug('  %s' % ' '.join(cmd))
        subprocess.call(cmd)
    finally:
        # the temporary input used to be left behind in the temp directory
        os.unlink(tmpfilename)


def _upload(cfg, filename, ocr_filename):
    """POST *ocr_filename* to the GED ``@@fileimport`` view using curl.

    Returns True when curl exits with status 0.
    """
    logging.debug('  uploading file')
    now = datetime.datetime.now()
    title = string.Template(cfg.get('title')).substitute(
        {'date': now.strftime('%d/%m/%Y'),
         'time': now.strftime('%H:%M:%S')})
    # NOTE(review): --insecure disables certificate verification and the
    # password is passed on the command line; kept as-is on purpose here.
    cmd = ['curl', '-v', '--insecure', '-X', 'POST',
           '-F', 'form.widgets.file=@%s;filename=%s;type=application/pdf' % (ocr_filename, filename),
           '-F', 'form.buttons.import=Import',
           '-F', 'form.widgets.portal_type=%s' % cfg.get('default_type'),
           '-F', 'form.widgets.location=%s' % cfg.get('default_directory'),
           '-F', 'form.widgets.owner=%s' % cfg.get('user'),
           '-F', 'form.widgets.treating_groups=%s' % (
               cfg.get('treating_groups') or cfg.get('treating_group') or ''),
           '-F', 'form.widgets.recipient_groups=%s' % (cfg.get('recipient_groups') or ''),
           '-F', 'form.widgets.title=%s' % title,
           '-F', 'form.widgets.notification_recipients=%s' % cfg.get('notification_recipients', ''),
           '-F', 'form.widgets.keywords=%s' % cfg.get('keywords', ''),
           '-F', 'form.widgets.transitions_to_apply=%s' % cfg.get('transitions_to_apply', ''),
           '-u', '%s:%s' % (cfg.get('ged_username'), cfg.get('ged_password')),
           '%s/@@fileimport' % cfg.get('ged_base_url')]
    return subprocess.call(cmd) == 0


def process(cfg, filename, payload, enable_ocr=True):
    """OCR (optionally) one PDF document, then store or upload the result.

    Parameters:
      cfg        -- dict of the options of one configuration section
                    ('ocrized_directory', 'store_path', 'title',
                    'ged_base_url', ...).
      filename   -- name of the document; only its base name is used.
      payload    -- raw content of the document.
      enable_ocr -- when false the payload is used as is, without OCR.

    The result is kept in cfg['ocrized_directory'] under *filename*; when a
    file of that name already exists there it is reused instead of being
    produced again.  It is then copied to cfg['store_path'] if that option is
    set, and uploaded to the GED otherwise.

    Returns True on success, False when no usable (non-empty) document could
    be produced or when the upload failed.
    """
    # The name may come straight from an e-mail attachment: drop any
    # directory part so it cannot escape the target directories, and refuse
    # documents without a name instead of crashing on them.
    filename = os.path.basename(filename or '')
    if not filename:
        logging.error('cannot process a document without a filename')
        return False

    ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename)

    if not os.path.exists(ocr_filename):
        if enable_ocr:
            _run_ocr(payload, ocr_filename)
        else:
            logging.debug('  skipping OCR phase')
            with open(ocr_filename, 'wb') as fd:
                fd.write(payload)

    # A missing or empty result is a failure.  Testing for existence first
    # avoids the OSError that os.stat() raised when the OCR engine produced
    # nothing at all, and returning here avoids copying/uploading a file
    # that has just been removed.
    if not os.path.exists(ocr_filename) or os.path.getsize(ocr_filename) == 0:
        if os.path.exists(ocr_filename):
            # remove it so that the next attempt starts from scratch
            os.unlink(ocr_filename)
        if enable_ocr:
            logging.error('failed to OCR %s', filename)
            # keep it for inspection
            with open(os.path.join('/tmp', filename), 'wb') as fd:
                fd.write(payload)
        else:
            logging.error('empty document %s', filename)
        return False

    if cfg.get('store_path'):
        logging.debug('  storing file locally')
        shutil.copy(ocr_filename, os.path.join(cfg.get('store_path'), filename))
        return True

    return _upload(cfg, filename, ocr_filename)
# try:
# r = requests.post(cfg.get('ged_base_url') + '/@@fileimport',
# auth=(cfg.get('ged_username'), cfg.get('ged_password')),
# verify=False,
# proxies={'https': 'http://172.23.3.30:3128'},
# files={'form.widgets.file': (filename, file(ocr_filename))},
# data={'form.buttons.import': 'Import',
# 'form.widgets.portal_type': cfg.get('default_type'),
# 'form.widgets.location': cfg.get('default_directory'),
# 'form.widgets.owner': cfg.get('user'),
# })
# except Exception, e:
# print e
# return False
#
# if r.status_code != requests.codes.ok:
# file('/tmp/error.html', 'w').write(r.text)
# return (r.status_code == requests.codes.ok)
# Main polling loop and its two per-section handlers.  Every section of the
# ini file is either a drop directory (section name starting with '/') or an
# IMAP mailbox (section name used as the login).


def _handle_drop_directory(cfg, section):
    """Process every pending file found under the drop directory *section*.

    Files ending in '.uploaded' are skipped.  A file is run through OCR only
    if it ends in '.pdf' and does not start with 'no-ocr-'.  After a
    successful processing the file is deleted when the section sets
    file_success_action = delete, and renamed with a '.uploaded' suffix
    otherwise.
    """
    for basedir, dirnames, filenames in os.walk(section):
        # Resolved directory with a trailing separator, so that the
        # containment test below is a real path-prefix test ('/drop' must
        # not match '/drop-other/x').
        real_basedir = os.path.join(os.path.realpath(basedir), '')
        for filename in filenames:
            if filename.endswith('.uploaded'):
                continue
            filepath = os.path.realpath(os.path.join(basedir, filename))
            if not filepath.startswith(real_basedir):
                # check the real path as an attacker could create a
                # symlink to whatever directory and cause total
                # destruction of it. (as well as the upload of its
                # contents to the GED...).
                logging.warning('wrong base dir for %s', filepath)
                continue
            try:
                with open(filepath, 'rb') as fd:
                    payload = fd.read()
            except (IOError, OSError):
                # e.g. removed since the directory listing, or unreadable
                logging.error('  error reading %s', filepath)
                continue
            logging.debug('  uploading file %s', filepath)
            enable_ocr = (not filename.startswith('no-ocr-') and filename.endswith('.pdf'))
            if not process(dict(cfg.items(section)), filename, payload, enable_ocr):
                logging.error('  error processing %s', filepath)
                continue
            # A missing option used to raise after the upload had already
            # been done; treat it like any value other than 'delete'.
            if (cfg.has_option(section, 'file_success_action') and
                    cfg.get(section, 'file_success_action') == 'delete'):
                os.unlink(filepath)
            else:
                os.rename(filepath, filepath + '.uploaded')


def _handle_mailbox(cfg, section):
    """Process the PDF attachments of the unseen messages of one mailbox.

    *section* is used as the IMAP login; the server, the ssl flag and the
    password are read from the section.  A subject containing 'disable_ocr'
    turns OCR off for the message.  When an attachment cannot be processed
    the message is flagged as unseen again and its remaining parts are
    skipped.
    """
    imap_server = cfg.get(section, 'imap_server')
    use_ssl = cfg.getboolean(section, 'ssl')
    try:
        if use_ssl:
            mailbox = imaplib.IMAP4_SSL(host=imap_server)
        else:
            # was IMAP4(host_imap_server): a NameError hidden by the bare
            # except, so plain IMAP connections could never succeed
            mailbox = imaplib.IMAP4(host=imap_server)
    except Exception:
        # deliberately broad (any connection problem), but no longer
        # swallows KeyboardInterrupt / SystemExit
        logging.error('failed to connect to imap server')
        time.sleep(30)
        return

    try:
        mailbox.login(section, cfg.get(section, 'password'))
    except imaplib.IMAP4.error:
        logging.error('failed to log in to imap server as %s', section)
        return

    try:
        mailbox.select()
    except socket.error:
        logging.error('failure talking to imap server')
        return

    typ, data = mailbox.search(None, '(NOT SEEN)')
    for num in data[0].split():
        try:
            typ, fetched = mailbox.fetch(num, '(RFC822)')
        except socket.error:
            logging.error('failure talking to imap server')
            break
        msg = email.parser.Parser().parsestr(fetched[0][1])
        # msg['Subject'] is None when the header is absent
        enable_ocr = 'disable_ocr' not in (msg['Subject'] or '')
        for part in msg.walk():
            if part.get_content_type() != 'application/pdf':
                continue
            filename = part.get_filename()
            logging.info('  handling %s', filename)
            payload = part.get_payload(decode=True)
            if not process(dict(cfg.items(section)), filename, payload, enable_ocr):
                logging.error('  error -> marking as unseen')
                try:
                    mailbox.store(num, '-FLAGS', r'\Seen')
                except socket.error:
                    logging.error('failure talking to imap server')
                break

    try:
        mailbox.close()
        mailbox.logout()
    except socket.error:
        logging.error('failure talking to imap server')


while True:
    # The configuration is read again on every pass, so changes are picked
    # up without a restart.
    cfg = ConfigParser.ConfigParser()
    cfg.read(config_filepath)
    for section in cfg.sections():
        logging.debug('processing %s', section)
        # The value itself is not used here; the lookup is kept so that a
        # section lacking ged_base_url still fails at this point.
        cfg.get(section, 'ged_base_url')
        if section.startswith('/'):
            # handle dropped files
            _handle_drop_directory(cfg, section)
        else:
            # handle imap mailboxes
            _handle_mailbox(cfg, section)
    logging.debug('waiting a bit %s', time.strftime('%Y-%m-%d %H:%M:%S'))
    time.sleep(30)