Refactored OCR import to handle new document types
This commit is contained in:
parent
7962243e43
commit
1a03d61f83
|
@ -14,18 +14,21 @@ import themis.config.utils
|
|||
class LoadFromOcr(BrowserView):
|
||||
output_path = '/mnt/gedimport'
|
||||
|
||||
def get_folder(self, doctype):
    """Return the object that receives OCR imports for *doctype*.

    The slash-separated location returned by
    ``themis.config.utils.get_ocr_location(doctype)`` is resolved one
    segment at a time with ``getattr``, starting from the portal root
    object. Empty segments (leading, trailing or doubled slashes) are
    ignored, so an empty location yields the portal itself.

    A segment that cannot be looked up propagates whatever ``getattr``
    raises; nothing is caught here.
    """
    current = getToolByName(self.context, 'portal_url').getPortalObject()
    location = themis.config.utils.get_ocr_location(doctype)
    # NOTE(review): plain getattr on Zope content may resolve a name through
    # acquisition rather than strict containment -- confirm this is intended.
    segments = [name for name in location.split('/') if name]
    for name in segments:
        current = getattr(current, name)
    return current
|
||||
|
||||
def __call__(self):
|
||||
# output path is the directory where the OCR system uploads the files
|
||||
self.output_path = self.request.form.get('outputPath', self.output_path)
|
||||
portal = getToolByName(self.context, 'portal_url').getPortalObject()
|
||||
plone_utils = getToolByName(self.context, 'plone_utils')
|
||||
|
||||
mail_folder = portal
|
||||
for part in themis.config.utils.get_incoming_mails_location().split('/'):
|
||||
if not part:
|
||||
continue
|
||||
mail_folder = getattr(mail_folder, part)
|
||||
|
||||
for base, dirnames, filenames in os.walk(self.output_path):
|
||||
for filename in filenames:
|
||||
if not filename.lower().endswith('.pdf'):
|
||||
|
@ -33,51 +36,89 @@ class LoadFromOcr(BrowserView):
|
|||
print 'processing', filename
|
||||
code_cat, number, date, time = filename.split('_')
|
||||
|
||||
try:
|
||||
categorie_de_courrier, sous_categorie_de_courrier = \
|
||||
themis.config.utils.get_mail_categories_from_ocr_code(code_cat)
|
||||
except TypeError:
|
||||
# XXX: log
|
||||
for doctype in ('incoming_mails', 'outgoing_mails',
|
||||
'internal_documents', 'confidential_documents'):
|
||||
try:
|
||||
category, subcategory = \
|
||||
themis.config.utils.get_categories_from_ocr_code(code_cat, doctype)
|
||||
except TypeError:
|
||||
continue
|
||||
break
|
||||
else:
|
||||
# XXX: log?
|
||||
continue
|
||||
|
||||
date_reception = datetime.datetime(
|
||||
folder = self.get_folder(doctype)
|
||||
|
||||
ocr_date = datetime.datetime(
|
||||
int(date[0:4]), int(date[4:6]), int(date[6:]),
|
||||
int(time[:2]),int(time[2:4]),int(time[4:6]))
|
||||
|
||||
mail_title = u'%s %s du %s à %s' % (
|
||||
(categorie_de_courrier or sous_categorie_de_courrier),
|
||||
if doctype in ('incoming_mails', 'outgoing_mails'):
|
||||
ocr_title = u'%s %s du %s à %s' % (
|
||||
(category or subcategory),
|
||||
number,
|
||||
date_reception.strftime('%d/%m/%Y'),
|
||||
date_reception.strftime('%H:%M:%S'))
|
||||
mail_id = plone_utils.normalizeString(
|
||||
ocr_date.strftime('%d/%m/%Y'),
|
||||
ocr_date.strftime('%H:%M:%S'))
|
||||
ocr_id = plone_utils.normalizeString(
|
||||
u'%s %s du %s à %s' % (
|
||||
(categorie_de_courrier or sous_categorie_de_courrier),
|
||||
(category or subcategory),
|
||||
number,
|
||||
date_reception.strftime('%Y-%m-%d'),
|
||||
date_reception.strftime('%H-%M-%S')))
|
||||
ocr_date.strftime('%Y-%m-%d'),
|
||||
ocr_date.strftime('%H-%M-%S')))
|
||||
else:
|
||||
ocr_title = u'Document %s du %s à %s' % (
|
||||
number,
|
||||
ocr_date.strftime('%d/%m/%Y'),
|
||||
ocr_date.strftime('%H:%M:%S'))
|
||||
ocr_id = plone_utils.normalizeString(
|
||||
u'Document %s du %s à %s' % (
|
||||
number,
|
||||
ocr_date.strftime('%Y-%m-%d'),
|
||||
ocr_date.strftime('%H-%M-%S')))
|
||||
|
||||
if hasattr(mail_folder, mail_id):
|
||||
if hasattr(folder, ocr_id):
|
||||
# already imported (log?)
|
||||
continue
|
||||
|
||||
if categorie_de_courrier:
|
||||
categorie_de_courrier = [categorie_de_courrier]
|
||||
if category:
|
||||
category = [category]
|
||||
else:
|
||||
categorie_de_courrier = None
|
||||
category = None
|
||||
|
||||
if sous_categorie_de_courrier:
|
||||
sous_categorie_de_courrier = [sous_categorie_de_courrier]
|
||||
if subcategory:
|
||||
subcategory = [subcategory]
|
||||
else:
|
||||
sous_categorie_de_courrier = None
|
||||
subcategory = None
|
||||
|
||||
mail_file = NamedBlobFile(file(os.path.join(base, filename)).read(),
|
||||
ocr_file = NamedBlobFile(file(os.path.join(base, filename)).read(),
|
||||
filename=unicode(filename))
|
||||
mail_folder.invokeFactory('courrier_entrant', id=mail_id, title=mail_title,
|
||||
numero_courrier=number,
|
||||
date_reception=date_reception,
|
||||
categorie_de_courrier=categorie_de_courrier,
|
||||
sous_categorie_de_courrier=sous_categorie_de_courrier,
|
||||
fichier=mail_file)
|
||||
|
||||
if doctype == 'incoming_mails':
|
||||
factory = 'courrier_entrant'
|
||||
kwargs = {'numero_courrier': number,
|
||||
'date_reception': ocr_date,
|
||||
'categorie_de_courrier': category,
|
||||
'sous_categorie_de_courrier': subcategory}
|
||||
elif doctype == 'outgoing_mails':
|
||||
factory = 'courrier_sortant'
|
||||
kwargs = {'numero_courrier': number,
|
||||
'date_envoi': ocr_date,
|
||||
'categorie_de_courrier': category,
|
||||
'sous_categorie_de_courrier': subcategory}
|
||||
elif doctype == 'internal_documents':
|
||||
factory = 'document_interne'
|
||||
kwargs = {'no_du_dossier': number,
|
||||
'categorie': category}
|
||||
elif doctype == 'confidential_documents':
|
||||
factory = 'document_confidentiel'
|
||||
kwargs = {'no_du_dossier': number,
|
||||
'categorie': category}
|
||||
else:
|
||||
continue
|
||||
|
||||
folder.invokeFactory(factory, id=ocr_id, title=ocr_title,
|
||||
fichier=ocr_file, **kwargs)
|
||||
|
||||
os.rename(os.path.join(base, filename),
|
||||
os.path.join(base, filename + '.processed'))
|
||||
|
|
Reference in New Issue