This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
pfwbged.ocrloader/ocrloader.py

138 lines
4.9 KiB
Python

#! /usr/bin/env python
import ConfigParser
import datetime
import email
import email.parser
import imaplib
import optparse
import os
import os.path
import shutil
import subprocess
import sys
import tempfile
import time
import urllib

import requests
import logging

# Command-line interface: the only option is --config, the path of the INI
# file that the polling loop re-reads on every pass.
parser = optparse.OptionParser()
parser.add_option('--config', default='ocrloader.ini', dest='config')
options, args = parser.parse_args()
config_filepath = options.config

# Let every record, down to DEBUG level, through the root logger.
logging.basicConfig(level=logging.DEBUG)
def _run_ocr(payload, ocr_filename):
    """Run abbyyocr9 on payload, asking it to write its output to ocr_filename."""
    fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-')
    try:
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(payload)
        sys.stderr.write(' running OCR on file\n')
        try:
            # Argument list, no shell: file names are never re-parsed.
            subprocess.call(['abbyyocr9', '-rl', 'French',
                             '-if', tmpfilename,
                             '-f', 'PDF', '-pem', 'ImageOnText',
                             '-pfpr', '150', '-pfq', '100',
                             '-of', ocr_filename])
        except OSError as exc:
            # Binary missing or not executable; the caller notices that
            # the output file was not produced.
            sys.stderr.write('could not run abbyyocr9: %s\n' % exc)
    finally:
        # The temporary input is only needed while the OCR tool runs.
        os.remove(tmpfilename)


def _upload(cfg, filename, ocr_filename):
    """POST ocr_filename to <ged_base_url>/@@fileimport with curl.

    Returns True when curl exits with status 0.
    """
    now = datetime.datetime.now()
    title = cfg.get('title') % {'date': now.strftime('%d/%m/%Y'),
                                'time': now.strftime('%H:%M')}
    command = [
        'curl', '-v', '--insecure', '-X', 'POST',
        '--form', 'form.widgets.file=@%s;filename=%s;type=application/pdf' % (
            ocr_filename, filename),
        '-F', 'form.buttons.import=Import',
        '-F', 'form.widgets.portal_type=%s' % cfg.get('default_type'),
        '-F', 'form.widgets.location=%s' % cfg.get('default_directory'),
        '-F', 'form.widgets.owner=%s' % cfg.get('user'),
        '-F', 'form.widgets.title=%s' % urllib.quote(title),
        # NOTE(review): credentials are hard-coded here; confirm whether
        # they should come from the configuration instead.
        '-u', 'admin:admin',
        '%s/@@fileimport' % cfg.get('ged_base_url'),
    ]
    try:
        return subprocess.call(command) == 0
    except OSError as exc:
        sys.stderr.write('could not run curl: %s\n' % exc)
        return False


def process(cfg, filename, payload, enable_ocr=True):
    """Save one PDF attachment, OCR it unless disabled, then deliver it.

    The resulting PDF is kept in cfg['ocrized_directory'] under the
    attachment's name; if a file of that name is already there it is
    neither rewritten nor OCR-ed again.  The file is then copied to
    cfg['store_path'] when that option is set, and uploaded with curl
    otherwise.

    cfg        -- dict of options; reads ocrized_directory, store_path,
                  title, default_type, default_directory, user and
                  ged_base_url
    filename   -- attachment name as found in the e-mail (may be None)
    payload    -- raw content of the PDF
    enable_ocr -- when false the payload is stored as is

    Returns True when the file was stored or uploaded, False otherwise.
    """
    # The name is chosen by whoever sent the e-mail: keep only its last
    # path component so it cannot point outside the target directories.
    filename = os.path.basename(filename or '')
    if filename in ('', '.', '..'):
        sys.stderr.write('attachment has no usable filename\n')
        return False

    ocr_filename = os.path.join(cfg.get('ocrized_directory'), filename)

    if not os.path.exists(ocr_filename):
        if enable_ocr:
            _run_ocr(payload, ocr_filename)
        else:
            sys.stderr.write(' skipping OCR phase\n')
            with open(ocr_filename, 'wb') as out:
                out.write(payload)

    if not os.path.exists(ocr_filename):
        sys.stderr.write('failed to OCR %s\n' % filename)
        # keep it for inspection
        with open(os.path.join('/tmp', filename), 'wb') as out:
            out.write(payload)
        return False

    if cfg.get('store_path'):
        print(' storing file locally')
        shutil.copy(ocr_filename, os.path.join(cfg.get('store_path'), filename))
        return True

    print(' uploading file')
    return _upload(cfg, filename, ocr_filename)
# try:
# r = requests.post(cfg.get('ged_base_url') + '/@@fileimport',
# auth=(cfg.get('ged_username'), cfg.get('ged_password')),
# verify=False,
# proxies={'https': 'http://172.23.3.30:3128'},
# files={'form.widgets.file': (filename, file(ocr_filename))},
# data={'form.buttons.import': 'Import',
# 'form.widgets.portal_type': cfg.get('default_type'),
# 'form.widgets.location': cfg.get('default_directory'),
# 'form.widgets.owner': cfg.get('user'),
# })
# except Exception, e:
# print e
# return False
#
# if r.status_code != requests.codes.ok:
# file('/tmp/error.html', 'w').write(r.text)
# return (r.status_code == requests.codes.ok)
def _open_mailbox(imap_server, use_ssl):
    """Connect to imap_server (over SSL when use_ssl); return None on failure."""
    try:
        if use_ssl:
            return imaplib.IMAP4_SSL(host=imap_server)
        return imaplib.IMAP4(host=imap_server)
    except (imaplib.IMAP4.error, EnvironmentError) as exc:
        # EnvironmentError covers socket and SSL errors; anything else is
        # a programming error and must not be hidden.
        print('failed to connect to imap server')
        logging.debug('connection to %s failed: %s', imap_server, exc)
        return None


def _handle_unseen_messages(mailbox, section_cfg):
    """Pass every PDF attachment of every unseen message to process().

    A message for which process() reports a failure gets its \\Seen flag
    removed again and its remaining attachments are skipped.
    """
    typ, found = mailbox.search(None, '(NOT SEEN)')
    if typ != 'OK' or not found or not found[0]:
        return
    for num in found[0].split():
        typ, fetched = mailbox.fetch(num, '(RFC822)')
        if typ != 'OK' or not fetched or not isinstance(fetched[0], tuple):
            # Nothing usable came back for this message number.
            continue
        msg = email.parser.Parser().parsestr(fetched[0][1])
        # msg['Subject'] is None when the header is absent.
        enable_ocr = 'disable_ocr' not in (msg['Subject'] or '')
        for part in msg.walk():
            if part.get_content_type() != 'application/pdf':
                continue
            filename = part.get_filename()
            print(' handling %s' % filename)
            payload = part.get_payload(decode=True)
            if not process(section_cfg, filename, payload, enable_ocr):
                print(' error -> marking as unseen')
                mailbox.store(num, '-FLAGS', r'\Seen')
                break


# Main polling loop: the configuration is re-read on every pass; each
# section describes one mailbox, the section name being the IMAP login.
while True:
    cfg = ConfigParser.ConfigParser()
    cfg.read(config_filepath)
    for section in cfg.sections():
        print('processing %s' % section)
        M = _open_mailbox(cfg.get(section, 'imap_server'),
                          cfg.getboolean(section, 'ssl'))
        if M is None:
            time.sleep(30)
            continue
        try:
            try:
                M.login(section, cfg.get(section, 'password'))
            except imaplib.IMAP4.error:
                print(' failed to log in, skipping mailbox')
                continue
            M.select()
            _handle_unseen_messages(M, dict(cfg.items(section)))
            M.close()
        finally:
            # Always release the connection, including after a failed login.
            M.logout()
    print('waiting a bit')
    time.sleep(30)