themis.ocrloader/themis/ocrloader/ocrloader.py

# -*- coding: utf-8 -*-

import datetime
import os
import logging

from DateTime import DateTime

from Products.CMFCore.utils import getToolByName
from Products.Five.browser import BrowserView
from plone.namedfile.file import NamedBlobFile

from zope.event import notify
from zope.lifecycleevent import ObjectAddedEvent, ObjectModifiedEvent


import transaction

import themis.config.utils

# Module-level logger; records are emitted under the 'Plone' logger name.
log = logging.getLogger('Plone')

class LoadFromOcr(BrowserView):
    """Import the PDF files uploaded by the OCR system as content objects.

    Calling the view walks ``output_path`` (which can be overridden with the
    ``outputPath`` request parameter).  Every ``*.pdf`` file whose name has
    the form ``<code>_<number>_<YYYYMMDD>_<HHMMSS>.pdf`` is turned into a
    document created in the folder configured for the matching document
    type.  An imported file is renamed with a ``.processed`` suffix, so it
    no longer matches the ``.pdf`` filter on later runs.
    """

    # default directory where the OCR system uploads the files
    output_path = '/mnt/gedimport'

    # document types tried, in this order, when resolving an OCR code
    _DOCTYPES = ('incoming_mails', 'outgoing_mails',
                 'internal_documents', 'confidential_documents')

    def get_folder(self, doctype):
        """Return the folder in which documents of ``doctype`` are created.

        The location is the '/'-separated path returned by
        ``themis.config.utils.get_ocr_location``; it is resolved from the
        portal root and empty path segments are ignored.
        """
        folder = getToolByName(self.context, 'portal_url').getPortalObject()
        location = themis.config.utils.get_ocr_location(doctype)
        for part in location.split('/'):
            if part:
                folder = getattr(folder, part)
        return folder

    def _find_categories(self, code_cat):
        """Return ``(doctype, category, subcategory)`` for an OCR code.

        The document types are tried in order; ``None`` is returned when no
        document type yields a (category, subcategory) pair for ``code_cat``.
        """
        for doctype in self._DOCTYPES:
            try:
                category, subcategory = \
                    themis.config.utils.get_categories_from_ocr_code(
                        code_cat, doctype)
            except TypeError:
                # the lookup result could not be unpacked as a pair;
                # presumably the helper returns None for an unknown code
                # -- TODO confirm against themis.config.utils
                continue
            return doctype, category, subcategory
        return None

    @staticmethod
    def _parse_ocr_date(date, time):
        """Build a datetime from the date and time parts of a file name.

        ``date`` is read as ``YYYYMMDD`` and the first six characters of
        ``time`` as ``HHMMSS``; anything after them (the file extension) is
        ignored.  Raises ValueError when a part is not numeric or does not
        form a valid date/time.
        """
        return datetime.datetime(
            int(date[0:4]), int(date[4:6]), int(date[6:]),
            int(time[:2]), int(time[2:4]), int(time[4:6]))

    @staticmethod
    def _title_and_id(plone_utils, doctype, category, subcategory,
                      number, ocr_date):
        """Return the ``(title, id)`` pair of the document to create.

        Both are built from the same text; the id uses '-' separated date
        and time fields and is passed through ``normalizeString``.
        """
        if doctype in ('incoming_mails', 'outgoing_mails'):
            prefix = u'%s %s' % ((category or subcategory), number)
        else:
            prefix = u'Document %s' % number
        ocr_title = u'%s du %s à %s' % (
            prefix,
            ocr_date.strftime('%d/%m/%Y'),
            ocr_date.strftime('%H:%M:%S'))
        ocr_id = plone_utils.normalizeString(
            u'%s du %s à %s' % (
                prefix,
                ocr_date.strftime('%Y-%m-%d'),
                ocr_date.strftime('%H-%M-%S')))
        return ocr_title, ocr_id

    @staticmethod
    def _factory_arguments(doctype, number, day, category, subcategory):
        """Return ``(portal type, field values)`` for ``doctype``.

        ``None`` is returned for a document type that is not handled.
        """
        if doctype == 'incoming_mails':
            return 'courrier_entrant', {
                'numero_courrier': number,
                'date_reception': day,
                'categorie_de_courrier': category,
                'sous_categorie_de_courrier': subcategory}
        if doctype == 'outgoing_mails':
            return 'courrier_sortant', {
                'numero_courrier': number,
                'date_envoi': day,
                'categorie_de_courrier': category,
                'sous_categorie_de_courrier': subcategory}
        if doctype == 'internal_documents':
            return 'document_interne', {
                'no_du_dossier': number,
                'categorie': category}
        if doctype == 'confidential_documents':
            return 'document_confidentiel', {
                'no_du_dossier': number,
                'categorie': category}
        return None

    def _import_file(self, plone_utils, base, filename):
        """Create the document for one PDF file found in directory ``base``.

        Files that cannot be imported (unexpected name, unknown code,
        already existing id) are logged and skipped.
        """
        try:
            code_cat, number, date, time = filename.split('_')
        except ValueError:
            log.warning('unknown file name format (%s)', filename)
            return

        found = self._find_categories(code_cat)
        if found is None:
            log.warning('no suitable document type found for %s', filename)
            return
        doctype, category, subcategory = found

        folder = self.get_folder(doctype)

        try:
            ocr_date = self._parse_ocr_date(date, time)
        except ValueError:
            # a single malformed name must not abort the whole import
            log.warning('invalid date or time in file name (%s)', filename)
            return

        ocr_title, ocr_id = self._title_and_id(
            plone_utils, doctype, category, subcategory, number, ocr_date)

        # NOTE(review): hasattr() on a Zope folder may also see attributes
        # obtained through acquisition, not only direct children -- confirm
        # whether a stricter containment check is wanted here.
        if hasattr(folder, ocr_id):
            log.warning('document id already exists (%s)', filename)
            return

        # the category fields receive a one-element list, or None when empty
        category = [category] if category else None
        subcategory = [subcategory] if subcategory else None

        # only the day is stored on the document, the time is dropped
        factory_info = self._factory_arguments(
            doctype, number, ocr_date.date(), category, subcategory)
        if factory_info is None:
            log.warning('unknown document type (%s, %s)', doctype, filename)
            return
        factory, kwargs = factory_info

        path = os.path.join(base, filename)
        # PDF content is binary; the context manager closes the file handle
        with open(path, 'rb') as pdf:
            ocr_file = NamedBlobFile(pdf.read(), filename=unicode(filename))

        folder.invokeFactory(factory, id=ocr_id, title=ocr_title,
                             fichier=ocr_file, **kwargs)
        # commit before marking the file as processed: if the commit fails
        # the file keeps its name and is picked up again by a later run
        transaction.commit()
        os.rename(path, path + '.processed')

        created = getattr(folder, ocr_id)
        notify(ObjectAddedEvent(created))

    def __call__(self):
        """Import every PDF file found under the output path; return 'OK'."""
        # output path is the directory where the OCR system uploads the files
        # NOTE(review): the value is taken from the request, so whoever can
        # call this view chooses the directory that is walked and whose
        # files get renamed -- confirm the view is suitably protected.
        self.output_path = self.request.form.get('outputPath', self.output_path)
        plone_utils = getToolByName(self.context, 'plone_utils')

        for base, dirnames, filenames in os.walk(self.output_path):
            for filename in filenames:
                if not filename.lower().endswith('.pdf'):
                    continue
                log.info('processing %s', filename)
                self._import_file(plone_utils, base, filename)

        return 'OK'