From 1a03d61f8303b533a0b41199913ff357d0ae0b4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20P=C3=A9ters?= Date: Wed, 18 Jul 2012 13:45:47 +0200 Subject: [PATCH] refactored ocr import to handle new document types --- themis/ocrloader/ocrloader.py | 109 +++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 34 deletions(-) diff --git a/themis/ocrloader/ocrloader.py b/themis/ocrloader/ocrloader.py index d2c3ab2..1003959 100644 --- a/themis/ocrloader/ocrloader.py +++ b/themis/ocrloader/ocrloader.py @@ -14,18 +14,21 @@ import themis.config.utils class LoadFromOcr(BrowserView): output_path = '/mnt/gedimport' + def get_folder(self, doctype): + portal = getToolByName(self.context, 'portal_url').getPortalObject() + folder = portal + for part in themis.config.utils.get_ocr_location(doctype).split('/'): + if not part: + continue + folder = getattr(folder, part) + return folder + def __call__(self): # output path is the directory where the OCR system uploads the files self.output_path = self.request.form.get('outputPath', self.output_path) portal = getToolByName(self.context, 'portal_url').getPortalObject() plone_utils = getToolByName(self.context, 'plone_utils') - mail_folder = portal - for part in themis.config.utils.get_incoming_mails_location().split('/'): - if not part: - continue - mail_folder = getattr(mail_folder, part) - for base, dirnames, filenames in os.walk(self.output_path): for filename in filenames: if not filename.lower().endswith('.pdf'): @@ -33,51 +36,89 @@ class LoadFromOcr(BrowserView): print 'processing', filename code_cat, number, date, time = filename.split('_') - try: - categorie_de_courrier, sous_categorie_de_courrier = \ - themis.config.utils.get_mail_categories_from_ocr_code(code_cat) - except TypeError: - # XXX: log + for doctype in ('incoming_mails', 'outgoing_mails', + 'internal_documents', 'confidential_documents'): + try: + category, subcategory = \ + themis.config.utils.get_categories_from_ocr_code(code_cat, doctype) + except TypeError: + continue + break + else: + # XXX: log? continue - date_reception = datetime.datetime( + folder = self.get_folder(doctype) + + ocr_date = datetime.datetime( int(date[0:4]), int(date[4:6]), int(date[6:]), int(time[:2]),int(time[2:4]),int(time[4:6])) - mail_title = u'%s %s du %s à %s' % ( - (categorie_de_courrier or sous_categorie_de_courrier), + if doctype in ('incoming_mails', 'outgoing_mails'): + ocr_title = u'%s %s du %s à %s' % ( + (category or subcategory), number, - date_reception.strftime('%d/%m/%Y'), - date_reception.strftime('%H:%M:%S')) - mail_id = plone_utils.normalizeString( + ocr_date.strftime('%d/%m/%Y'), + ocr_date.strftime('%H:%M:%S')) + ocr_id = plone_utils.normalizeString( u'%s %s du %s à %s' % ( - (categorie_de_courrier or sous_categorie_de_courrier), + (category or subcategory), number, - date_reception.strftime('%Y-%m-%d'), - date_reception.strftime('%H-%M-%S'))) + ocr_date.strftime('%Y-%m-%d'), + ocr_date.strftime('%H-%M-%S'))) + else: + ocr_title = u'Document %s du %s à %s' % ( + number, + ocr_date.strftime('%d/%m/%Y'), + ocr_date.strftime('%H:%M:%S')) + ocr_id = plone_utils.normalizeString( + u'Document %s du %s à %s' % ( + number, + ocr_date.strftime('%Y-%m-%d'), + ocr_date.strftime('%H-%M-%S'))) - if hasattr(mail_folder, mail_id): + if hasattr(folder, ocr_id): # already imported (log?) continue - if categorie_de_courrier: - categorie_de_courrier = [categorie_de_courrier] + if category: + category = [category] else: - categorie_de_courrier = None + category = None - if sous_categorie_de_courrier: - sous_categorie_de_courrier = [sous_categorie_de_courrier] + if subcategory: + subcategory = [subcategory] else: - sous_categorie_de_courrier = None + subcategory = None - mail_file = NamedBlobFile(file(os.path.join(base, filename)).read(), + ocr_file = NamedBlobFile(file(os.path.join(base, filename)).read(), filename=unicode(filename)) - mail_folder.invokeFactory('courrier_entrant', id=mail_id, title=mail_title, - numero_courrier=number, - date_reception=date_reception, - categorie_de_courrier=categorie_de_courrier, - sous_categorie_de_courrier=sous_categorie_de_courrier, - fichier=mail_file) + + if doctype == 'incoming_mails': + factory = 'courrier_entrant' + kwargs = {'numero_courrier': number, + 'date_reception': ocr_date, + 'categorie_de_courrier': category, + 'sous_categorie_de_courrier': subcategory} + elif doctype == 'outgoing_mails': + factory = 'courrier_sortant' + kwargs = {'numero_courrier': number, + 'date_envoi': ocr_date, + 'categorie_de_courrier': category, + 'sous_categorie_de_courrier': subcategory} + elif doctype == 'internal_documents': + factory = 'document_interne' + kwargs = {'no_du_dossier': number, + 'categorie': category} + elif doctype == 'confidential_documents': + factory = 'document_confidentiel' + kwargs = {'no_du_dossier': number, + 'categorie': category} + else: + continue + + folder.invokeFactory(factory, id=ocr_id, title=ocr_title, + fichier=ocr_file, **kwargs) os.rename(os.path.join(base, filename), os.path.join(base, filename + '.processed'))