This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
pfwbged.ocrloader/ocrloader.py

141 lines
5.0 KiB
Python

#! /usr/bin/env python
import ConfigParser
import urllib
import datetime
import email
import email.parser
import imaplib
import os
import os.path
import requests
import sys
import tempfile
import time
import shutil
import optparse
# Command-line handling: the only supported option is --config, the path of
# the ini file that is re-read on every polling cycle further down.
parser = optparse.OptionParser()
parser.add_option(
    '--config',
    dest='config',
    default='ocrloader.ini',
)
options, args = parser.parse_args()
config_filepath = options.config

# Root logger at DEBUG level so that library debug output is emitted too.
import logging
logging.basicConfig(level=logging.DEBUG)
def process(cfg, filename, payload, enable_ocr=True):
    """Prepare one PDF attachment and deliver it to its destination.

    The PDF is first materialised as ``<ocrized_directory>/<filename>``:
    either by running the external ``abbyyocr9`` tool on the payload, or,
    when OCR is disabled, by writing the payload unchanged.  A file that
    already exists under that name is reused as is.  The result is then
    copied into ``store_path`` when that option is set, and otherwise posted
    with ``curl`` to ``<ged_base_url>/@@fileimport``.

    Parameters:
        cfg: dict of the options of one configuration section.  Keys read
            here: ``ocrized_directory``, ``store_path``, ``title``,
            ``default_type``, ``default_directory``, ``user``,
            ``treating_group``, ``ged_base_url`` and, optionally,
            ``ged_username`` / ``ged_password`` (both default to ``admin``).
        filename: attachment name as found in the e-mail.  Only its final
            path component is used.
        payload: raw content of the PDF.
        enable_ocr: when false, the payload is stored without OCR.

    Returns:
        True when the file was stored locally or curl exited with status 0;
        False when the filename is unusable, no OCRed file could be
        produced, or the upload failed.
    """
    import subprocess  # local import: not part of the module's import block

    # The name comes from an e-mail and is untrusted: drop any directory
    # part so it can neither escape the target directories nor be empty.
    filename = os.path.basename(filename or '')
    if filename in ('', '.', '..'):
        sys.stderr.write('skipping attachment without a usable filename\n')
        return False

    ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename)

    if not os.path.exists(ocr_filename):
        if enable_ocr:
            fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-')
            try:
                with os.fdopen(fd, 'wb') as tmp:
                    tmp.write(payload)
                sys.stderr.write(' running OCR on file\n')
                try:
                    # Argument list, no shell: file names are passed verbatim.
                    subprocess.call([
                        'abbyyocr9', '-rl', 'French', '-if', tmpfilename,
                        '-f', 'PDF', '-pem', 'ImageOnText',
                        '-pfpr', '150', '-pfq', '100', '-of', ocr_filename])
                except OSError as exc:
                    # Tool missing or not executable; reported as a failed
                    # OCR by the existence check below.
                    sys.stderr.write('could not run abbyyocr9: %s\n' % exc)
            finally:
                os.remove(tmpfilename)
        else:
            sys.stderr.write(' skipping OCR phase\n')
            with open(ocr_filename, 'wb') as out:
                out.write(payload)

    if not os.path.exists(ocr_filename):
        sys.stderr.write('failed to OCR %s\n' % filename)
        # keep it for inspection
        with open(os.path.join('/tmp', filename), 'wb') as out:
            out.write(payload)
        return False

    store_path = cfg.get('store_path')
    if store_path:
        print(' storing file locally')
        shutil.copy(ocr_filename, os.path.join(store_path, filename))
        return True

    print(' uploading file')
    now = datetime.datetime.now()
    title = cfg.get('title') % {'date': now.strftime('%d/%m/%Y'),
                                'time': now.strftime('%H:%M')}
    # NOTE: the upload shells out to curl; an earlier implementation based on
    # the `requests` library (going through an HTTPS proxy) was disabled.
    # NOTE(review): credentials passed with -u are visible in the process
    # list while curl runs.
    credentials = '%s:%s' % (cfg.get('ged_username', 'admin'),
                             cfg.get('ged_password', 'admin'))
    command = [
        'curl', '-v', '--insecure', '-X', 'POST',
        '--form', 'form.widgets.file=@%s;filename=%s;type=application/pdf' % (
            ocr_filename, filename),
        '-F', 'form.buttons.import=Import',
        '-F', 'form.widgets.portal_type=%s' % cfg.get('default_type'),
        '-F', 'form.widgets.location=%s' % cfg.get('default_directory'),
        '-F', 'form.widgets.owner=%s' % cfg.get('user'),
        '-F', 'form.widgets.treating_group=%s' % cfg.get('treating_group'),
        '-F', 'form.widgets.title=%s' % urllib.quote(title),
        '-u', credentials,
        '%s/@@fileimport' % cfg.get('ged_base_url'),
    ]
    try:
        return subprocess.call(command) == 0
    except OSError as exc:
        sys.stderr.write('could not run curl: %s\n' % exc)
        return False
def _handle_message(M, num, section_cfg):
    """Run process() on every PDF attachment of message `num`.

    OCR is skipped when the subject contains 'disable_ocr'.  On the first
    attachment that fails, the message is flagged unseen again, so that it
    is picked up by a later polling cycle, and its remaining parts are
    skipped.
    """
    typ, data = M.fetch(num, '(RFC822)')
    msg = email.parser.Parser().parsestr(data[0][1])
    # A message may have no Subject header at all.
    enable_ocr = 'disable_ocr' not in (msg['Subject'] or '')
    for part in msg.walk():
        if part.get_content_type() != 'application/pdf':
            continue
        filename = part.get_filename()
        print(' handling %s' % filename)
        payload = part.get_payload(decode=True)
        if not process(section_cfg, filename, payload, enable_ocr):
            print(' error -> marking as unseen')
            M.store(num, '-FLAGS', r'\Seen')
            break


def _poll_mailbox(cfg, section):
    """Handle the unseen messages of the mailbox described by `section`.

    The section name is used as the IMAP login.  The connection is always
    logged out, including when the login is refused.
    """
    print('processing %s' % section)
    imap_server = cfg.get(section, 'imap_server')
    use_ssl = cfg.getboolean(section, 'ssl')
    try:
        if use_ssl:
            M = imaplib.IMAP4_SSL(host=imap_server)
        else:
            M = imaplib.IMAP4(host=imap_server)
    except (IOError, imaplib.IMAP4.error):
        # Socket and SSL errors derive from IOError.
        print('failed to connect to imap server')
        time.sleep(30)
        return

    try:
        try:
            M.login(section, cfg.get(section, 'password'))
        except imaplib.IMAP4.error:
            sys.stderr.write('failed to log in as %s\n' % section)
            return
        M.select()
        section_cfg = dict(cfg.items(section))
        typ, data = M.search(None, '(NOT SEEN)')
        for num in data[0].split():
            _handle_message(M, num, section_cfg)
        M.close()
    finally:
        try:
            M.logout()
        except (IOError, imaplib.IMAP4.error):
            pass  # connection already gone, nothing left to release


# Main loop: the configuration is re-read on every cycle, so edits to the ini
# file are taken into account without restarting; each section describes one
# mailbox to poll.
while True:
    cfg = ConfigParser.ConfigParser()
    cfg.read(config_filepath)
    for section in cfg.sections():
        _poll_mailbox(cfg, section)
    print('waiting a bit %s' % time.strftime('%Y-%m-%d %H:%M:%S'))
    time.sleep(30)