2013-08-14 13:21:32 +02:00
|
|
|
#! /usr/bin/env python
|
|
|
|
|
|
|
|
import ConfigParser
|
2014-01-02 13:47:34 +01:00
|
|
|
import urllib
|
2013-11-05 13:06:33 +01:00
|
|
|
import datetime
|
2013-08-14 13:21:32 +02:00
|
|
|
import email
|
|
|
|
import email.parser
|
|
|
|
import imaplib
|
|
|
|
import os
|
|
|
|
import os.path
|
|
|
|
import requests
|
2014-02-24 14:44:41 +01:00
|
|
|
import string
|
2014-03-10 10:01:38 +01:00
|
|
|
import subprocess
|
2013-08-14 13:21:32 +02:00
|
|
|
import sys
|
|
|
|
import tempfile
|
|
|
|
import time
|
2013-10-10 13:41:22 +02:00
|
|
|
import shutil
|
2013-08-14 13:21:32 +02:00
|
|
|
|
2013-10-10 13:44:46 +02:00
|
|
|
import optparse
|
2013-10-10 13:36:45 +02:00
|
|
|
|
2013-10-10 13:44:46 +02:00
|
|
|
# Command-line handling: the only supported option is --config, the path of
# the ini file that is re-read on every polling cycle (default: ocrloader.ini
# in the current working directory).
parser = optparse.OptionParser()
parser.add_option(
    '--config',
    default='ocrloader.ini',
    dest='config',
)
options, args = parser.parse_args()

# Path handed to ConfigParser.read() by the main loop.
config_filepath = options.config
|
2013-10-10 13:36:45 +02:00
|
|
|
|
|
|
|
|
2013-08-14 13:21:32 +02:00
|
|
|
import logging
|
2014-03-27 11:57:55 +01:00
|
|
|
# Log everything (DEBUG and above) to the daemon's log file, one timestamped
# message per line...
logging.basicConfig(
    filename='/var/log/pfwbged.ocrloader.log',
    format='%(asctime)s %(message)s',
    level=logging.DEBUG,
)
# ...and additionally echo every record through a StreamHandler attached to
# the root logger.
logging.getLogger().addHandler(logging.StreamHandler())
|
2014-03-27 11:57:55 +01:00
|
|
|
|
2013-08-14 13:21:32 +02:00
|
|
|
def _is_empty_file(path):
    """Return True when *path* exists and is zero bytes long."""
    return os.path.exists(path) and os.path.getsize(path) == 0


def _run_ocr(payload, ocr_filename):
    """Run the ABBYY command-line OCR on *payload*, writing *ocr_filename*.

    The payload is first written to a temporary PDF that serves as the OCR
    input; that temporary file is always removed afterwards.  Nothing is
    returned: the caller decides on success by looking at *ocr_filename*.
    """
    fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-')
    try:
        try:
            os.write(fd, payload)
        finally:
            os.close(fd)
        logging.debug(' running OCR on file (%s / %s)', tmpfilename, ocr_filename)
        cmd = ['/opt/ABBYYOCR9/abbyyocr9', '-pi',
               '-rl', 'French', '-if', tmpfilename,
               '-f', 'PDF', '-pem', 'ImageOnText', '-pfpr', '150',
               '-pfq', '100', '-of', ocr_filename]
        logging.debug(' %s', ' '.join(cmd))
        try:
            subprocess.call(cmd)
        except OSError:
            # e.g. the OCR binary is not installed; the caller will notice
            # that no output file was produced.
            logging.exception(' could not run the OCR command')
    finally:
        # the temporary input copy used to be leaked on every run
        os.unlink(tmpfilename)


def _upload(cfg, ocr_filename, filename):
    """POST *ocr_filename* to the GED ``@@fileimport`` view using curl.

    Returns True when curl exits with status 0, False otherwise (including
    when curl itself cannot be started).
    """
    logging.debug(' uploading file')
    now = datetime.datetime.now()
    # safe_substitute: a title containing an unknown placeholder or a lone
    # '$' is sent as-is instead of raising.  Falls back to the file name when
    # no title is configured.
    title = string.Template(cfg.get('title') or filename).safe_substitute(
        {'date': now.strftime('%d/%m/%Y'),
         'time': now.strftime('%H:%M')})
    # NOTE(review): --insecure disables TLS certificate verification and the
    # credentials given with -u are visible in the process list; kept as-is
    # to preserve the existing deployment behaviour.
    cmd = ['curl', '-v', '--insecure', '-X', 'POST',
           '-F', 'form.widgets.file=@%s;filename=%s;type=application/pdf' % (ocr_filename, filename),
           '-F', 'form.buttons.import=Import',
           '-F', 'form.widgets.portal_type=%s' % cfg.get('default_type'),
           '-F', 'form.widgets.location=%s' % cfg.get('default_directory'),
           '-F', 'form.widgets.owner=%s' % cfg.get('user'),
           '-F', 'form.widgets.treating_group=%s' % cfg.get('treating_group'),
           '-F', 'form.widgets.title=%s' % title,
           '-u', '%s:%s' % (cfg.get('ged_username'), cfg.get('ged_password')),
           '%s/@@fileimport' % cfg.get('ged_base_url')]
    try:
        status = subprocess.call(cmd)
    except OSError:
        logging.exception(' could not run curl')
        return False
    return status == 0


def process(cfg, filename, payload, enable_ocr=True):
    """OCR (optionally) a PDF document, then store it locally or upload it.

    Parameters:
      cfg        -- dict of the options of one configuration section; the
                    keys read here are ``ocrized_directory``, ``store_path``
                    and, for uploads, ``title``, ``default_type``,
                    ``default_directory``, ``user``, ``treating_group``,
                    ``ged_username``, ``ged_password`` and ``ged_base_url``.
      filename   -- name of the document; only its base name is used, so a
                    name coming from an e-mail attachment cannot point
                    outside the target directories.
      payload    -- raw content of the document (byte string).
      enable_ocr -- when false the payload is used as-is instead of being
                    passed through the OCR tool.

    Returns True when the document has been copied to ``store_path`` or
    accepted by the upload command, False otherwise.  A document already
    present (and non-empty) in ``ocrized_directory`` is reused rather than
    produced again.
    """
    filename = os.path.basename(filename or '')
    if not filename:
        logging.error('cannot process a document without a file name')
        return False

    ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename)

    if _is_empty_file(ocr_filename):
        # empty leftover of an earlier failed run: produce it again
        os.unlink(ocr_filename)

    if not os.path.exists(ocr_filename):
        if enable_ocr:
            _run_ocr(payload, ocr_filename)
        else:
            logging.debug(' skipping OCR phase')
            with open(ocr_filename, 'wb') as fd:
                fd.write(payload)

    if _is_empty_file(ocr_filename):
        # an empty result is as good as no result
        os.unlink(ocr_filename)

    if not os.path.exists(ocr_filename):
        if enable_ocr:
            logging.error('failed to OCR %s', filename)
            # keep the input for inspection
            with open(os.path.join('/tmp', filename), 'wb') as fd:
                fd.write(payload)
        else:
            logging.error('empty document %s', filename)
        return False

    store_path = cfg.get('store_path')
    if store_path:
        logging.debug(' storing file locally')
        shutil.copy(ocr_filename, os.path.join(store_path, filename))
        return True

    return _upload(cfg, ocr_filename, filename)
|
2013-08-14 13:21:32 +02:00
|
|
|
|
|
|
|
|
|
|
|
# try:
|
|
|
|
# r = requests.post(cfg.get('ged_base_url') + '/@@fileimport',
|
|
|
|
# auth=(cfg.get('ged_username'), cfg.get('ged_password')),
|
|
|
|
# verify=False,
|
|
|
|
# proxies={'https': 'http://172.23.3.30:3128'},
|
|
|
|
# files={'form.widgets.file': (filename, file(ocr_filename))},
|
|
|
|
# data={'form.buttons.import': 'Import',
|
|
|
|
# 'form.widgets.portal_type': cfg.get('default_type'),
|
|
|
|
# 'form.widgets.location': cfg.get('default_directory'),
|
|
|
|
# 'form.widgets.owner': cfg.get('user'),
|
|
|
|
# })
|
|
|
|
# except Exception, e:
|
|
|
|
# print e
|
|
|
|
# return False
|
|
|
|
#
|
|
|
|
# if r.status_code != requests.codes.ok:
|
|
|
|
# file('/tmp/error.html', 'w').write(r.text)
|
|
|
|
# return (r.status_code == requests.codes.ok)
|
|
|
|
|
|
|
|
|
|
|
|
import socket  # the handlers below catch socket.error

# Errors that mean "the IMAP conversation went wrong".
_IMAP_ERRORS = (socket.error, imaplib.IMAP4.error)


def _handle_dropped_files(cfg, section):
    """Process every file found below the directory named by *section*.

    Files ending in ``.uploaded`` are skipped.  A file whose name starts
    with ``ocr-`` and ends with ``.pdf`` goes through OCR, any other file is
    sent as-is.  After a successful ``process()`` the file is deleted when
    the section sets ``file_success_action = delete`` and renamed with a
    ``.uploaded`` suffix otherwise (also when the option is absent).
    """
    section_cfg = dict(cfg.items(section))
    delete_on_success = (
        cfg.has_option(section, 'file_success_action')
        and cfg.get(section, 'file_success_action') == 'delete')

    for basedir, dirnames, filenames in os.walk(section):
        # Compare resolved paths, with a trailing separator so that
        # "/drop" does not accept "/drop-other/file".
        real_basedir = os.path.join(os.path.realpath(basedir), '')
        for filename in filenames:
            if filename.endswith('.uploaded'):
                continue
            filepath = os.path.realpath(os.path.join(basedir, filename))
            if not filepath.startswith(real_basedir):
                # check the real path as an attacker could create a
                # symlink to whatever directory and cause total
                # destruction of it. (as well as the upload of its
                # contents to the GED...).
                logging.warning('wrong base dir for %s', filepath)
                continue
            try:
                with open(filepath, 'rb') as fd:
                    payload = fd.read()
            except (IOError, OSError):
                logging.exception(' cannot read %s', filepath)
                continue
            logging.debug(' uploading file %s', filepath)
            enable_ocr = (filename.startswith('ocr-') and filename.endswith('.pdf'))
            if not process(section_cfg, filename, payload, enable_ocr):
                logging.error(' error processing %s', filepath)
            elif delete_on_success:
                os.unlink(filepath)
            else:
                os.rename(filepath, filepath + '.uploaded')


def _logout_quietly(mailbox, close_first=False):
    """End the IMAP session; connection errors are logged, not raised."""
    try:
        if close_first:
            mailbox.close()
        mailbox.logout()
    except _IMAP_ERRORS:
        logging.error('failure talking to imap server')


def _handle_mailbox(cfg, section):
    """Process the PDF attachments of the unseen messages of one mailbox.

    *section* is used as the IMAP login name; ``imap_server``, ``ssl`` and
    ``password`` are read from that section.  Each ``application/pdf`` part
    is handed to ``process()``; OCR is disabled for messages whose subject
    contains ``disable_ocr``.  When an attachment fails, the message is
    flagged unseen again and its remaining parts are skipped.
    """
    imap_server = cfg.get(section, 'imap_server')
    use_ssl = cfg.getboolean(section, 'ssl')
    password = cfg.get(section, 'password')

    try:
        if use_ssl:
            mailbox = imaplib.IMAP4_SSL(host=imap_server)
        else:
            mailbox = imaplib.IMAP4(host=imap_server)
    except _IMAP_ERRORS:
        logging.error('failed to connect to imap server')
        time.sleep(30)
        return

    try:
        mailbox.login(section, password)
    except _IMAP_ERRORS:
        logging.error('failed to log in to imap server as %s', section)
        _logout_quietly(mailbox)
        return

    try:
        mailbox.select()
        typ, found = mailbox.search(None, '(NOT SEEN)')
    except _IMAP_ERRORS:
        logging.error('failure talking to imap server')
        _logout_quietly(mailbox)
        return

    section_cfg = dict(cfg.items(section))
    # found[0] is the space separated list of message numbers; it may be
    # empty or None when nothing matches.
    for num in (found[0] or '').split():
        try:
            typ, data = mailbox.fetch(num, '(RFC822)')
        except _IMAP_ERRORS:
            logging.error('failure talking to imap server')
            break
        if not data or not isinstance(data[0], tuple):
            logging.warning(' no content returned for message %s', num)
            continue
        msg = email.parser.Parser().parsestr(data[0][1])
        # a message without Subject header yields None here
        enable_ocr = 'disable_ocr' not in (msg['Subject'] or '')
        for part in msg.walk():
            if part.get_content_type() != 'application/pdf':
                continue
            filename = part.get_filename()
            if not filename:
                logging.warning(' skipping pdf attachment without a file name')
                continue
            logging.info(' handling %s', filename)
            payload = part.get_payload(decode=True)
            if process(section_cfg, filename, payload, enable_ocr):
                continue
            logging.error(' error -> marking as unseen')
            try:
                mailbox.store(num, '-FLAGS', r'\Seen')
            except _IMAP_ERRORS:
                logging.error('failure talking to imap server')
            break

    _logout_quietly(mailbox, close_first=True)


# Main loop: re-read the configuration, handle every section, sleep, repeat.
# A section whose name starts with "/" is a drop directory, any other
# section is an IMAP account.
while True:
    cfg = ConfigParser.ConfigParser()
    cfg.read(config_filepath)
    for section in cfg.sections():
        logging.debug('processing %s', section)
        # The value is not used here; the lookup is kept so that a section
        # lacking ged_base_url still fails early, as it did before.
        cfg.get(section, 'ged_base_url')

        if section.startswith('/'):
            # handle dropped files
            _handle_dropped_files(cfg, section)
        else:
            # handle imap mailboxes
            _handle_mailbox(cfg, section)

    logging.debug('waiting a bit %s', time.strftime('%Y-%m-%d %H:%M:%S'))
    time.sleep(30)
|