summary | refs | log | tree | commit | diff | stats
path: root/themis
diff options
context:
space:
mode:
author: Frédéric Péters <fpeters@entrouvert.com> 2012-07-18 11:45:47 (GMT)
committer: Frédéric Péters <fpeters@entrouvert.com> 2012-07-18 11:45:47 (GMT)
commit: 1a03d61f8303b533a0b41199913ff357d0ae0b4c (patch)
tree: a30288334949356983820dc697d84d5ccc60fc6e /themis
parent: 7962243e439c88b9fe706cecbdf68c7597a8675a (diff)
download: themis.ocrloader-1a03d61f8303b533a0b41199913ff357d0ae0b4c.zip
themis.ocrloader-1a03d61f8303b533a0b41199913ff357d0ae0b4c.tar.gz
themis.ocrloader-1a03d61f8303b533a0b41199913ff357d0ae0b4c.tar.bz2
refactored ocr import to handle new document types
Diffstat (limited to 'themis')
-rw-r--r-- themis/ocrloader/ocrloader.py | 109
1 files changed, 75 insertions, 34 deletions
diff --git a/themis/ocrloader/ocrloader.py b/themis/ocrloader/ocrloader.py
index d2c3ab2..1003959 100644
--- a/themis/ocrloader/ocrloader.py
+++ b/themis/ocrloader/ocrloader.py
@@ -14,18 +14,21 @@ import themis.config.utils
class LoadFromOcr(BrowserView):
output_path = '/mnt/gedimport'
+ def get_folder(self, doctype):
+ portal = getToolByName(self.context, 'portal_url').getPortalObject()
+ folder = portal
+ for part in themis.config.utils.get_ocr_location(doctype).split('/'):
+ if not part:
+ continue
+ folder = getattr(folder, part)
+ return folder
+
def __call__(self):
# output path is the directory where the OCR system uploads the files
self.output_path = self.request.form.get('outputPath', self.output_path)
portal = getToolByName(self.context, 'portal_url').getPortalObject()
plone_utils = getToolByName(self.context, 'plone_utils')
- mail_folder = portal
- for part in themis.config.utils.get_incoming_mails_location().split('/'):
- if not part:
- continue
- mail_folder = getattr(mail_folder, part)
-
for base, dirnames, filenames in os.walk(self.output_path):
for filename in filenames:
if not filename.lower().endswith('.pdf'):
@@ -33,51 +36,89 @@ class LoadFromOcr(BrowserView):
print 'processing', filename
code_cat, number, date, time = filename.split('_')
- try:
- categorie_de_courrier, sous_categorie_de_courrier = \
- themis.config.utils.get_mail_categories_from_ocr_code(code_cat)
- except TypeError:
- # XXX: log
+ for doctype in ('incoming_mails', 'outgoing_mails',
+ 'internal_documents', 'confidential_documents'):
+ try:
+ category, subcategory = \
+ themis.config.utils.get_categories_from_ocr_code(code_cat, doctype)
+ except TypeError:
+ continue
+ break
+ else:
+ # XXX: log?
continue
- date_reception = datetime.datetime(
+ folder = self.get_folder(doctype)
+
+ ocr_date = datetime.datetime(
int(date[0:4]), int(date[4:6]), int(date[6:]),
int(time[:2]),int(time[2:4]),int(time[4:6]))
- mail_title = u'%s %s du %s à %s' % (
- (categorie_de_courrier or sous_categorie_de_courrier),
+ if doctype in ('incoming_mails', 'outgoing_mails'):
+ ocr_title = u'%s %s du %s à %s' % (
+ (category or subcategory),
number,
- date_reception.strftime('%d/%m/%Y'),
- date_reception.strftime('%H:%M:%S'))
- mail_id = plone_utils.normalizeString(
+ ocr_date.strftime('%d/%m/%Y'),
+ ocr_date.strftime('%H:%M:%S'))
+ ocr_id = plone_utils.normalizeString(
u'%s %s du %s à %s' % (
- (categorie_de_courrier or sous_categorie_de_courrier),
+ (category or subcategory),
+ number,
+ ocr_date.strftime('%Y-%m-%d'),
+ ocr_date.strftime('%H-%M-%S')))
+ else:
+ ocr_title = u'Document %s du %s à %s' % (
+ number,
+ ocr_date.strftime('%d/%m/%Y'),
+ ocr_date.strftime('%H:%M:%S'))
+ ocr_id = plone_utils.normalizeString(
+ u'Document %s du %s à %s' % (
number,
- date_reception.strftime('%Y-%m-%d'),
- date_reception.strftime('%H-%M-%S')))
+ ocr_date.strftime('%Y-%m-%d'),
+ ocr_date.strftime('%H-%M-%S')))
- if hasattr(mail_folder, mail_id):
+ if hasattr(folder, ocr_id):
# already imported (log?)
continue
- if categorie_de_courrier:
- categorie_de_courrier = [categorie_de_courrier]
+ if category:
+ category = [category]
else:
- categorie_de_courrier = None
+ category = None
- if sous_categorie_de_courrier:
- sous_categorie_de_courrier = [sous_categorie_de_courrier]
+ if subcategory:
+ subcategory = [subcategory]
else:
- sous_categorie_de_courrier = None
+ subcategory = None
- mail_file = NamedBlobFile(file(os.path.join(base, filename)).read(),
+ ocr_file = NamedBlobFile(file(os.path.join(base, filename)).read(),
filename=unicode(filename))
- mail_folder.invokeFactory('courrier_entrant', id=mail_id, title=mail_title,
- numero_courrier=number,
- date_reception=date_reception,
- categorie_de_courrier=categorie_de_courrier,
- sous_categorie_de_courrier=sous_categorie_de_courrier,
- fichier=mail_file)
+
+ if doctype == 'incoming_mails':
+ factory = 'courrier_entrant'
+ kwargs = {'numero_courrier': number,
+ 'date_reception': ocr_date,
+ 'categorie_de_courrier': category,
+ 'sous_categorie_de_courrier': subcategory}
+ elif doctype == 'outgoing_mails':
+ factory = 'courrier_sortant'
+ kwargs = {'numero_courrier': number,
+ 'date_envoi': ocr_date,
+ 'categorie_de_courrier': category,
+ 'sous_categorie_de_courrier': subcategory}
+ elif doctype == 'internal_documents':
+ factory = 'document_interne'
+ kwargs = {'no_du_dossier': number,
+ 'categorie': category}
+ elif doctype == 'confidential_documents':
+ factory = 'document_confidentiel'
+ kwargs = {'no_du_dossier': number,
+ 'categorie': category}
+ else:
+ continue
+
+ folder.invokeFactory(factory, id=ocr_id, title=ocr_title,
+ fichier=ocr_file, **kwargs)
os.rename(os.path.join(base, filename),
os.path.join(base, filename + '.processed'))