Refactored OCR import to handle new document types

This commit is contained in:
Frédéric Péters 2012-07-18 13:45:47 +02:00
parent 7962243e43
commit 1a03d61f83
1 changed files with 75 additions and 34 deletions

View File

@ -14,18 +14,21 @@ import themis.config.utils
class LoadFromOcr(BrowserView):
output_path = '/mnt/gedimport'
def get_folder(self, doctype):
    """Return the object that the OCR location for ``doctype`` points to.

    The location string comes from
    ``themis.config.utils.get_ocr_location(doctype)`` and is read as a
    '/'-separated path.  Starting from the portal object, each non-empty
    path segment is resolved with ``getattr`` on the object reached so
    far; the last object reached is returned.
    """
    current = getToolByName(self.context, 'portal_url').getPortalObject()
    location = themis.config.utils.get_ocr_location(doctype)
    for segment in location.split('/'):
        # Empty segments (leading, trailing or doubled slashes) are skipped.
        if segment:
            current = getattr(current, segment)
    return current
def __call__(self):
# output path is the directory where the OCR system uploads the files
self.output_path = self.request.form.get('outputPath', self.output_path)
portal = getToolByName(self.context, 'portal_url').getPortalObject()
plone_utils = getToolByName(self.context, 'plone_utils')
mail_folder = portal
for part in themis.config.utils.get_incoming_mails_location().split('/'):
if not part:
continue
mail_folder = getattr(mail_folder, part)
for base, dirnames, filenames in os.walk(self.output_path):
for filename in filenames:
if not filename.lower().endswith('.pdf'):
@ -33,51 +36,89 @@ class LoadFromOcr(BrowserView):
print 'processing', filename
code_cat, number, date, time = filename.split('_')
try:
categorie_de_courrier, sous_categorie_de_courrier = \
themis.config.utils.get_mail_categories_from_ocr_code(code_cat)
except TypeError:
# XXX: log
for doctype in ('incoming_mails', 'outgoing_mails',
'internal_documents', 'confidential_documents'):
try:
category, subcategory = \
themis.config.utils.get_categories_from_ocr_code(code_cat, doctype)
except TypeError:
continue
break
else:
# XXX: log?
continue
date_reception = datetime.datetime(
folder = self.get_folder(doctype)
ocr_date = datetime.datetime(
int(date[0:4]), int(date[4:6]), int(date[6:]),
int(time[:2]),int(time[2:4]),int(time[4:6]))
mail_title = u'%s %s du %s à %s' % (
(categorie_de_courrier or sous_categorie_de_courrier),
if doctype in ('incoming_mails', 'outgoing_mails'):
ocr_title = u'%s %s du %s à %s' % (
(category or subcategory),
number,
date_reception.strftime('%d/%m/%Y'),
date_reception.strftime('%H:%M:%S'))
mail_id = plone_utils.normalizeString(
ocr_date.strftime('%d/%m/%Y'),
ocr_date.strftime('%H:%M:%S'))
ocr_id = plone_utils.normalizeString(
u'%s %s du %s à %s' % (
(categorie_de_courrier or sous_categorie_de_courrier),
(category or subcategory),
number,
date_reception.strftime('%Y-%m-%d'),
date_reception.strftime('%H-%M-%S')))
ocr_date.strftime('%Y-%m-%d'),
ocr_date.strftime('%H-%M-%S')))
else:
ocr_title = u'Document %s du %s à %s' % (
number,
ocr_date.strftime('%d/%m/%Y'),
ocr_date.strftime('%H:%M:%S'))
ocr_id = plone_utils.normalizeString(
u'Document %s du %s à %s' % (
number,
ocr_date.strftime('%Y-%m-%d'),
ocr_date.strftime('%H-%M-%S')))
if hasattr(mail_folder, mail_id):
if hasattr(folder, ocr_id):
# already imported (log?)
continue
if categorie_de_courrier:
categorie_de_courrier = [categorie_de_courrier]
if category:
category = [category]
else:
categorie_de_courrier = None
category = None
if sous_categorie_de_courrier:
sous_categorie_de_courrier = [sous_categorie_de_courrier]
if subcategory:
subcategory = [subcategory]
else:
sous_categorie_de_courrier = None
subcategory = None
mail_file = NamedBlobFile(file(os.path.join(base, filename)).read(),
ocr_file = NamedBlobFile(file(os.path.join(base, filename)).read(),
filename=unicode(filename))
mail_folder.invokeFactory('courrier_entrant', id=mail_id, title=mail_title,
numero_courrier=number,
date_reception=date_reception,
categorie_de_courrier=categorie_de_courrier,
sous_categorie_de_courrier=sous_categorie_de_courrier,
fichier=mail_file)
if doctype == 'incoming_mails':
factory = 'courrier_entrant'
kwargs = {'numero_courrier': number,
'date_reception': ocr_date,
'categorie_de_courrier': category,
'sous_categorie_de_courrier': subcategory}
elif doctype == 'outgoing_mails':
factory = 'courrier_sortant'
kwargs = {'numero_courrier': number,
'date_envoi': ocr_date,
'categorie_de_courrier': category,
'sous_categorie_de_courrier': subcategory}
elif doctype == 'internal_documents':
factory = 'document_interne'
kwargs = {'no_du_dossier': number,
'categorie': category}
elif doctype == 'confidential_documents':
factory = 'document_confidentiel'
kwargs = {'no_du_dossier': number,
'categorie': category}
else:
continue
folder.invokeFactory(factory, id=ocr_id, title=ocr_title,
fichier=ocr_file, **kwargs)
os.rename(os.path.join(base, filename),
os.path.join(base, filename + '.processed'))