add script and config file as they were developed on the server
This commit is contained in:
commit
35bb78fbab
|
@ -0,0 +1,19 @@
|
|||
[DEFAULT]
|
||||
imap_server = imap.pfwb.be
|
||||
ssl = yes
|
||||
ged_base_url = http://test.ged.pfwb.be
|
||||
ged_username = admin
|
||||
ged_password = xxxxx
|
||||
ocrized_directory = /root/tests
|
||||
|
||||
[xxxx-test-greffe@pfwb.be]
|
||||
password = xxxxxx
|
||||
default_type = dmsincomingmail
|
||||
default_directory = documents
|
||||
user = secretariat-greffe
|
||||
|
||||
[xxxx-test-gaetandeberdt@pfwb.be]
|
||||
password = xxxxxx
|
||||
default_type = dmsdocument
|
||||
default_directory = Members/gaetan
|
||||
user = gaetan
|
|
@ -0,0 +1,117 @@
|
|||
#! /usr/bin/env python
|
||||
|
||||
import ConfigParser
|
||||
import email
|
||||
import email.parser
|
||||
import imaplib
|
||||
import os
|
||||
import os.path
|
||||
import requests
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import logging
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
def _curl_form_quote(value):
    """Quote *value* for use inside a curl ``-F`` field specification.

    curl splits the field on ``;`` and ``,`` unless the value is wrapped in
    double quotes; inside the quotes, backslashes and double quotes must be
    backslash-escaped.
    """
    return '"%s"' % value.replace('\\', '\\\\').replace('"', '\\"')


def _run_ocr(payload, ocr_filename):
    """Run abbyyocr9 on *payload* and ask it to write to *ocr_filename*.

    The payload is staged in a temporary file that is always removed
    afterwards.  Success is not reported here; the caller checks whether
    *ocr_filename* exists.
    """
    import subprocess  # local import: not among the file's top-level imports

    fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-')
    try:
        try:
            os.write(fd, payload)
        finally:
            os.close(fd)
        sys.stderr.write(' running OCR on file\n')
        try:
            # Argument list (no shell): file names are never shell-parsed.
            subprocess.call([
                'abbyyocr9', '-rl', 'French', '-if', tmpfilename,
                '-f', 'PDF', '-pem', 'ImageOnText',
                '-pfpr', '150', '-pfq', '100', '-of', ocr_filename,
            ])
        except OSError as exc:
            sys.stderr.write('could not run abbyyocr9: %s\n' % exc)
    finally:
        os.unlink(tmpfilename)


def process(cfg, filename, payload, enable_ocr=True):
    """Store (optionally OCR) a PDF attachment and upload it with curl.

    Parameters:
        cfg: dict of settings; the keys read are ``ocrized_directory``,
            ``default_type``, ``default_directory``, ``user``,
            ``ged_base_url`` and, optionally, ``ged_username`` /
            ``ged_password`` (both fall back to ``admin``).
        filename: attachment name as found in the e-mail.  It is untrusted,
            so only its base name is used.
        payload: raw bytes of the attachment.
        enable_ocr: when true, the payload goes through abbyyocr9; when
            false, it is written unchanged to the OCR directory.

    Returns:
        True when curl exits with status 0, False otherwise (unusable file
        name, no output file produced, curl missing or failing).

    A file already present under ``ocrized_directory`` with the same name is
    reused instead of being regenerated.
    """
    import subprocess  # local import: not among the file's top-level imports

    # Strip any directory component so a crafted attachment name cannot
    # escape the target directories.
    safe_name = os.path.basename(filename) if filename else ''
    if not safe_name or safe_name in ('.', '..'):
        sys.stderr.write('attachment has no usable filename\n')
        return False

    ocr_filename = os.path.join(cfg.get('ocrized_directory'), safe_name)

    if not os.path.exists(ocr_filename):
        if enable_ocr:
            _run_ocr(payload, ocr_filename)
        else:
            sys.stderr.write(' skipping OCR phase\n')
            with open(ocr_filename, 'wb') as fd:
                fd.write(payload)

    if not os.path.exists(ocr_filename):
        sys.stderr.write('failed to OCR %s\n' % safe_name)
        # keep it for inspection
        with open(os.path.join('/tmp', safe_name), 'wb') as fd:
            fd.write(payload)
        return False

    sys.stdout.write(' uploading file\n')
    sys.stdout.flush()  # keep our message ahead of curl's own output

    username = cfg.get('ged_username') or 'admin'
    password = cfg.get('ged_password') or 'admin'
    # NOTE(review): the password is still visible in the process list while
    # curl runs, as it was before; consider a netrc/config file.
    command = [
        'curl', '-v', '--insecure', '-X', 'POST',
        '--form', 'form.widgets.file=@%s;filename=%s;type=application/pdf' % (
            _curl_form_quote(ocr_filename), _curl_form_quote(safe_name)),
        '-F', 'form.buttons.import=Import',
        '-F', 'form.widgets.portal_type=%s' % cfg.get('default_type'),
        '-F', 'form.widgets.location=%s' % cfg.get('default_directory'),
        '-F', 'form.widgets.owner=%s' % cfg.get('user'),
        '-u', '%s:%s' % (username, password),
        '%s/@@fileimport' % cfg.get('ged_base_url'),
    ]
    try:
        return subprocess.call(command) == 0
    except OSError as exc:
        sys.stderr.write('could not run curl: %s\n' % exc)
        return False
|
||||
|
||||
|
||||
# try:
|
||||
# r = requests.post(cfg.get('ged_base_url') + '/@@fileimport',
|
||||
# auth=(cfg.get('ged_username'), cfg.get('ged_password')),
|
||||
# verify=False,
|
||||
# proxies={'https': 'http://172.23.3.30:3128'},
|
||||
# files={'form.widgets.file': (filename, file(ocr_filename))},
|
||||
# data={'form.buttons.import': 'Import',
|
||||
# 'form.widgets.portal_type': cfg.get('default_type'),
|
||||
# 'form.widgets.location': cfg.get('default_directory'),
|
||||
# 'form.widgets.owner': cfg.get('user'),
|
||||
# })
|
||||
# except Exception, e:
|
||||
# print e
|
||||
# return False
|
||||
#
|
||||
# if r.status_code != requests.codes.ok:
|
||||
# file('/tmp/error.html', 'w').write(r.text)
|
||||
# return (r.status_code == requests.codes.ok)
|
||||
|
||||
|
||||
def _connect(imap_server, use_ssl):
    """Open an IMAP connection to *imap_server*, over SSL when requested."""
    if use_ssl:
        return imaplib.IMAP4_SSL(host=imap_server)
    return imaplib.IMAP4(host=imap_server)


def _handle_message(conn, num, settings):
    """Fetch message *num* and hand each PDF attachment to ``process``.

    A subject containing ``disable_ocr`` turns OCR off for the message.
    On the first attachment that fails, the message is flagged unseen again
    and the remaining attachments are left alone.
    """
    typ, data = conn.fetch(num, '(RFC822)')
    msg = email.parser.Parser().parsestr(data[0][1])
    # A message without a Subject header yields None here.
    subject = msg['Subject'] or ''
    enable_ocr = 'disable_ocr' not in subject
    for part in msg.walk():
        if part.get_content_type() != 'application/pdf':
            continue
        filename = part.get_filename()
        if not filename:
            sys.stderr.write(' skipping PDF attachment without a filename\n')
            continue
        print(' handling %s' % filename)
        payload = part.get_payload(decode=True)
        if not process(settings, filename, payload, enable_ocr):
            print(' error -> marking as unseen')
            conn.store(num, '-FLAGS', r'\Seen')
            break


def poll_mailbox(cfg, section):
    """Process every unseen message of the mailbox described by *section*.

    The section name is used as the IMAP login; ``imap_server``, ``ssl`` and
    ``password`` are read from the section.  When the connection cannot be
    opened, the function waits 30 seconds and returns.
    """
    print('processing %s' % section)
    imap_server = cfg.get(section, 'imap_server')
    use_ssl = cfg.getboolean(section, 'ssl')

    try:
        conn = _connect(imap_server, use_ssl)
    except Exception as exc:  # boundary: keep the daemon alive, but say why
        print('failed to connect to imap server')
        sys.stderr.write(' %s\n' % exc)
        time.sleep(30)
        return

    try:
        try:
            conn.login(section, cfg.get(section, 'password'))
        except imaplib.IMAP4.error as exc:
            sys.stderr.write('login failed for %s: %s\n' % (section, exc))
            return
        settings = dict(cfg.items(section))
        conn.select()
        try:
            typ, data = conn.search(None, '(NOT SEEN)')
            for num in data[0].split():
                _handle_message(conn, num, settings)
        finally:
            conn.close()
    finally:
        # Always release the connection, including after a failed login.
        conn.logout()


def main():
    """Poll all configured mailboxes forever, pausing 30 seconds per round.

    ``ocrloader.ini`` is re-read on every round.
    """
    while True:
        cfg = ConfigParser.ConfigParser()
        cfg.read('ocrloader.ini')
        for section in cfg.sections():
            poll_mailbox(cfg, section)

        print('waiting a bit')
        time.sleep(30)


if __name__ == '__main__':
    main()
|
Reference in New Issue