This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
themis.ocrloader/themis/ocrloader/ocrloader.py

143 lines
5.7 KiB
Python

# -*- coding: utf-8 -*-
import datetime
import os
import logging
from DateTime import DateTime
from Products.CMFCore.utils import getToolByName
from Products.Five.browser import BrowserView
from plone.namedfile.file import NamedBlobFile
import transaction
import themis.config.utils
# Module-level logger; records are emitted under the 'Plone' logger name.
log = logging.getLogger('Plone')
class LoadFromOcr(BrowserView):
    """Import the PDF files produced by the OCR system as portal documents.

    Calling the view walks ``output_path`` and, for every ``*.pdf`` file named
    ``<code>_<number>_<date>_<time>.pdf``, creates the matching document
    (incoming mail, outgoing mail, internal or confidential document) in the
    folder configured for that document type.  Each imported file is then
    renamed with a ``.processed`` suffix, which takes it out of the ``*.pdf``
    selection on later runs.
    """

    # Default directory scanned for OCR output; it can be overridden per
    # request with the ``outputPath`` form parameter.
    output_path = '/mnt/gedimport'

    # Document types tried, in this order, when resolving an OCR category code.
    _doctypes = ('incoming_mails', 'outgoing_mails',
                 'internal_documents', 'confidential_documents')

    # NOTE: documented with comments rather than a docstring on purpose.
    # Zope 2's publisher only traverses to attributes that carry a docstring,
    # so adding one here would make this method reachable by URL.
    #
    # Return the container in which documents of `doctype` are created.  The
    # location comes from themis.config.utils.get_ocr_location() as a
    # '/'-separated path and is resolved from the portal root one attribute at
    # a time; empty segments (leading, trailing or doubled slashes) are
    # skipped.  A missing segment raises AttributeError.
    def get_folder(self, doctype):
        portal = getToolByName(self.context, 'portal_url').getPortalObject()
        folder = portal
        for part in themis.config.utils.get_ocr_location(doctype).split('/'):
            if part:
                folder = getattr(folder, part)
        return folder

    def __call__(self):
        """Import every PDF found under the output path and return 'OK'.

        Files that cannot be imported (unexpected name, unknown category code,
        invalid date, already existing id) are logged and skipped.
        """
        # output path is the directory where the OCR system uploads the files
        self.output_path = self.request.form.get('outputPath', self.output_path)
        plone_utils = getToolByName(self.context, 'plone_utils')

        for base, dirnames, filenames in os.walk(self.output_path):
            for filename in filenames:
                # case-insensitive match; '*.pdf.processed' files are ignored
                if not filename.lower().endswith('.pdf'):
                    continue
                self._import_file(base, filename, plone_utils)
        return 'OK'

    def _import_file(self, base, filename, plone_utils):
        """Create the document for one OCR file, then mark the file processed.

        ``base`` is the directory containing ``filename``; ``plone_utils`` is
        the portal tool used to turn the title into a valid object id.
        Returns None in every case; skipped files are reported via the logger.
        """
        log.info('processing %s', filename)
        try:
            # the last field still carries the '.pdf' extension; only its
            # first six characters are read when parsing the time
            code_cat, number, date, time = filename.split('_')
        except ValueError:
            log.warning('unknown file name format (%s)', filename)
            return

        resolved = self._resolve_categories(code_cat)
        if resolved is None:
            log.warning('no suitable document type found for %s', filename)
            return
        doctype, category, subcategory = resolved

        folder = self.get_folder(doctype)

        try:
            ocr_date = self._parse_ocr_date(date, time)
        except ValueError:
            # a malformed timestamp must not abort the whole import run
            log.warning('invalid date or time in file name (%s)', filename)
            return

        if doctype in ('incoming_mails', 'outgoing_mails'):
            label = u'%s %s' % ((category or subcategory), number)
        else:
            label = u'Document %s' % number
        ocr_title = u'%s du %s à %s' % (
            label,
            ocr_date.strftime('%d/%m/%Y'),
            ocr_date.strftime('%H:%M:%S'))
        # the id uses '-' separators so that it normalizes to a stable,
        # sortable identifier
        ocr_id = plone_utils.normalizeString(
            u'%s du %s à %s' % (
                label,
                ocr_date.strftime('%Y-%m-%d'),
                ocr_date.strftime('%H-%M-%S')))

        # NOTE(review): hasattr() also matches inherited/acquired attributes,
        # not only objects contained in the folder; kept as in the original.
        if hasattr(folder, ocr_id):
            log.warning('document id already exists (%s)', filename)
            return

        creation = self._factory_arguments(
            doctype, number,
            ocr_date.date(),  # the content fields only store the day
            [category] if category else None,
            [subcategory] if subcategory else None)
        if creation is None:
            log.warning('unknown document type (%s, %s)', doctype, filename)
            return
        factory, kwargs = creation

        path = os.path.join(base, filename)
        # binary mode, and the handle is closed as soon as the data is read
        with open(path, 'rb') as pdf:
            ocr_file = NamedBlobFile(pdf.read(), filename=unicode(filename))

        folder.invokeFactory(factory, id=ocr_id, title=ocr_title,
                             fichier=ocr_file, **kwargs)
        # Commit before renaming: the rename cannot be rolled back, so the
        # file is only marked as processed once the document is stored.
        transaction.commit()
        os.rename(path, path + '.processed')

    def _resolve_categories(self, code_cat):
        """Return ``(doctype, category, subcategory)`` for an OCR code.

        Each known document type is tried in turn; None is returned when no
        type yields a result for ``code_cat``.
        """
        for doctype in self._doctypes:
            try:
                category, subcategory = \
                    themis.config.utils.get_categories_from_ocr_code(
                        code_cat, doctype)
            except TypeError:
                # the lookup result could not be unpacked into two values --
                # presumably None for a code unknown to this document type
                continue
            return doctype, category, subcategory
        return None

    @staticmethod
    def _parse_ocr_date(date, time):
        """Build a datetime from the YYYYMMDD and HHMMSS file name fields.

        Characters after the sixth one in ``time`` are ignored.  Raises
        ValueError when a field is not numeric or is out of range.
        """
        return datetime.datetime(
            int(date[0:4]), int(date[4:6]), int(date[6:]),
            int(time[:2]), int(time[2:4]), int(time[4:6]))

    @staticmethod
    def _factory_arguments(doctype, number, ocr_date, category, subcategory):
        """Return ``(factory name, field values)`` for a document type.

        Returns None when ``doctype`` is not one of the supported types.
        """
        if doctype == 'incoming_mails':
            return 'courrier_entrant', {
                'numero_courrier': number,
                'date_reception': ocr_date,
                'categorie_de_courrier': category,
                'sous_categorie_de_courrier': subcategory}
        if doctype == 'outgoing_mails':
            return 'courrier_sortant', {
                'numero_courrier': number,
                'date_envoi': ocr_date,
                'categorie_de_courrier': category,
                'sous_categorie_de_courrier': subcategory}
        if doctype == 'internal_documents':
            return 'document_interne', {
                'no_du_dossier': number,
                'categorie': category}
        if doctype == 'confidential_documents':
            return 'document_confidentiel', {
                'no_du_dossier': number,
                'categorie': category}
        return None