This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
themis.ocrloader/themis/ocrloader/ocrloader.py

149 lines
5.9 KiB
Python

# -*- coding: utf-8 -*-
import datetime
import os
import logging
from DateTime import DateTime
from Products.CMFCore.utils import getToolByName
from Products.Five.browser import BrowserView
from plone.namedfile.file import NamedBlobFile
from zope.event import notify
from zope.lifecycleevent import ObjectAddedEvent, ObjectModifiedEvent
import transaction
import themis.config.utils
# Module-level logger; records from this module are emitted under the
# 'Plone' logger name.
log = logging.getLogger('Plone')
class LoadFromOcr(BrowserView):
    """Browser view importing the PDF files produced by the OCR system.

    Calling the view walks ``output_path`` recursively.  For every ``*.pdf``
    file whose name can be parsed and whose category code matches a known
    document type, a content object is created in the folder configured for
    that type, the transaction is committed and the file is renamed with a
    ``.processed`` suffix.  Files that cannot be imported are logged and
    skipped.
    """

    # Default directory scanned for OCR output; a request may override it
    # with the ``outputPath`` form parameter (see ``__call__``).
    output_path = '/mnt/gedimport'

    # Document types tried, in this order, when resolving an OCR code.
    _doctypes = ('incoming_mails', 'outgoing_mails',
                 'internal_documents', 'confidential_documents')

    def get_folder(self, doctype):
        """Return the object in which documents of ``doctype`` are created.

        The location string comes from
        ``themis.config.utils.get_ocr_location`` and is resolved from the
        portal root one path segment at a time; empty segments (leading,
        trailing or doubled slashes) are ignored.
        """
        portal = getToolByName(self.context, 'portal_url').getPortalObject()
        folder = portal
        for part in themis.config.utils.get_ocr_location(doctype).split('/'):
            if part:
                folder = getattr(folder, part)
        return folder

    def _parse_filename(self, filename):
        """Split an OCR file name into ``(code_cat, number, ocr_date)``.

        The name must consist of exactly four ``_``-separated fields:
        category code, number, date as ``YYYYMMDD`` and time starting with
        ``HHMMSS`` (anything after the sixth character of the time field,
        such as the extension, is ignored).

        Returns None when the field count is wrong or when the date/time
        digits do not form a valid ``datetime``.
        """
        try:
            code_cat, number, date, time = filename.split('_')
            ocr_date = datetime.datetime(
                int(date[0:4]), int(date[4:6]), int(date[6:]),
                int(time[:2]), int(time[2:4]), int(time[4:6]))
        except ValueError:
            # raised by a failed unpacking, by int() on non-digits and by
            # datetime() on out-of-range values
            return None
        return code_cat, number, ocr_date

    def _find_doctype(self, code_cat):
        """Return ``(doctype, category, subcategory)`` for an OCR code.

        Each entry of ``_doctypes`` is tried in order; the first one for
        which the lookup result unpacks into two values wins.  Returns None
        when no document type matches.
        """
        for doctype in self._doctypes:
            try:
                category, subcategory = \
                    themis.config.utils.get_categories_from_ocr_code(
                        code_cat, doctype)
            except TypeError:
                # the lookup result could not be unpacked -- presumably
                # None is returned when the code is unknown for this type
                continue
            return doctype, category, subcategory
        return None

    def _build_title_and_id_source(self, doctype, category, subcategory,
                                   number, ocr_date):
        """Return the document title and the text its id is derived from.

        Both share the same wording; only the date and time formats differ
        (slashes and colons for the title, dashes for the id source).
        """
        if doctype in ('incoming_mails', 'outgoing_mails'):
            prefix = u'%s %s' % ((category or subcategory), number)
        else:
            prefix = u'Document %s' % number

        def render(date_format, time_format):
            return u'%s du %s à %s' % (
                prefix,
                ocr_date.strftime(date_format),
                ocr_date.strftime(time_format))

        return render('%d/%m/%Y', '%H:%M:%S'), render('%Y-%m-%d', '%H-%M-%S')

    def _factory_arguments(self, doctype, number, ocr_day,
                           category, subcategory):
        """Return ``(portal type, field values)`` used to create a document.

        ``category`` and ``subcategory`` are passed as one-element lists, or
        None when empty.  Returns None for an unknown ``doctype``.
        """
        category = [category] if category else None
        subcategory = [subcategory] if subcategory else None
        if doctype == 'incoming_mails':
            return 'courrier_entrant', {
                'numero_courrier': number,
                'date_reception': ocr_day,
                'categorie_de_courrier': category,
                'sous_categorie_de_courrier': subcategory}
        if doctype == 'outgoing_mails':
            return 'courrier_sortant', {
                'numero_courrier': number,
                'date_envoi': ocr_day,
                'categorie_de_courrier': category,
                'sous_categorie_de_courrier': subcategory}
        if doctype == 'internal_documents':
            return 'document_interne', {
                'no_du_dossier': number,
                'categorie': category}
        if doctype == 'confidential_documents':
            return 'document_confidentiel', {
                'no_du_dossier': number,
                'categorie': category}
        return None

    def _import_file(self, base, filename, plone_utils):
        """Create the document for one PDF file, then mark it as processed.

        Every reason for skipping the file is logged as a warning; nothing
        is returned.
        """
        log.info('processing %s', filename)

        parsed = self._parse_filename(filename)
        if parsed is None:
            log.warning('unknown file name format (%s)', filename)
            return
        code_cat, number, ocr_date = parsed

        match = self._find_doctype(code_cat)
        if match is None:
            log.warning('no suitable document type found for %s', filename)
            return
        doctype, category, subcategory = match

        folder = self.get_folder(doctype)
        ocr_title, id_source = self._build_title_and_id_source(
            doctype, category, subcategory, number, ocr_date)
        ocr_id = plone_utils.normalizeString(id_source)
        # NOTE(review): hasattr() may also be satisfied by an attribute that
        # is not a child of ``folder`` (e.g. an acquired one) -- confirm
        # whether a stricter containment check is wanted here.
        if hasattr(folder, ocr_id):
            log.warning('document id already exists (%s)', filename)
            return

        creation = self._factory_arguments(
            doctype, number, ocr_date.date(), category, subcategory)
        if creation is None:
            log.warning('unknown document type (%s, %s)', doctype, filename)
            return
        factory, kwargs = creation

        path = os.path.join(base, filename)
        # binary mode and a context manager: the PDF bytes are read
        # unmodified and the file handle is always closed
        with open(path, 'rb') as pdf:
            ocr_file = NamedBlobFile(pdf.read(), filename=unicode(filename))

        folder.invokeFactory(factory, id=ocr_id, title=ocr_title,
                             fichier=ocr_file, **kwargs)
        # Commit before renaming: if the commit fails the file keeps its
        # name and is picked up again by a later run instead of being marked
        # as processed without a stored document.
        transaction.commit()
        os.rename(path, path + '.processed')
        # NOTE(review): the event is sent after the commit, so whatever the
        # subscribers change is only stored by a later commit -- confirm
        # this ordering is intended.
        notify(ObjectAddedEvent(getattr(folder, ocr_id)))

    def __call__(self):
        """Import every PDF file found below ``output_path``.

        Returns the string ``'OK'`` once the directory tree has been walked.
        """
        # output path is the directory where the OCR system uploads the files
        # NOTE(review): 'outputPath' is taken from the request as is, so the
        # caller chooses which directory is walked and which files get
        # renamed -- confirm access to this view is suitably restricted.
        self.output_path = self.request.form.get('outputPath', self.output_path)
        plone_utils = getToolByName(self.context, 'plone_utils')
        for base, dirnames, filenames in os.walk(self.output_path):
            for filename in filenames:
                if filename.lower().endswith('.pdf'):
                    self._import_file(base, filename, plone_utils)
        return 'OK'