From 410ea2902b8aa70b6198c1a8623a1b3a01d7576d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20P=C3=A9ters?= Date: Tue, 23 Sep 2014 14:25:40 +0200 Subject: [PATCH] check ocr doesn't produce empty files --- ocrloader.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ocrloader.py b/ocrloader.py index 5da4210..975a26d 100755 --- a/ocrloader.py +++ b/ocrloader.py @@ -44,16 +44,23 @@ def process(cfg, filename, payload, enable_ocr=True): fd, tmpfilename = tempfile.mkstemp(suffix='.pdf', prefix='ocrloader-') os.write(fd, payload) os.close(fd) - logging.debug(' running OCR on file') - subprocess.call(['/opt/ABBYYOCR9/abbyyocr9', + logging.debug(' running OCR on file (%s / %s)' % (tmpfilename, ocr_filename)) + cmd = ['/opt/ABBYYOCR9/abbyyocr9', '-pi', '-rl', 'French', '-if', tmpfilename, '-f', 'PDF', '-pem', 'ImageOnText', '-pfpr', '150', - '-pfq', '100', '-of', ocr_filename]) + '-pfq', '100', '-of', ocr_filename] + logging.debug(' %s' % ' '.join(cmd)) + subprocess.call(cmd) + if os.stat(ocr_filename)[6] == 0: + os.unlink(ocr_filename) if not os.path.exists(ocr_filename): logging.error('failed to OCR %s', filename) file('/tmp/' + filename, 'w').write(payload) # keep it for inspection return False + if os.stat(ocr_filename)[6] == 0: + os.unlink(ocr_filename) + if cfg.get('store_path'): logging.debug(' storing file locally') shutil.copy(ocr_filename, os.path.join(cfg.get('store_path'), filename))