summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexis de Lattre <alexis.delattre@akretion.com>2019-04-22 21:29:50 (GMT)
committerAlexis de Lattre <alexis.delattre@akretion.com>2019-04-22 21:29:50 (GMT)
commit9cb1cb0bf2c2b466366dfb70200f049be8358d67 (patch)
tree7302c8acb436e5a7b4f25b1f7ac462de21e053c9
parent0fbe5046105c1e737dcd07b9a7e44939533c0eae (diff)
downloadfactur-x-9cb1cb0bf2c2b466366dfb70200f049be8358d67.zip
factur-x-9cb1cb0bf2c2b466366dfb70200f049be8358d67.tar.gz
factur-x-9cb1cb0bf2c2b466366dfb70200f049be8358d67.tar.bz2
Add support for /Kids
Bump version to 1.1
-rw-r--r--README.rst4
-rw-r--r--facturx/_version.py2
-rw-r--r--facturx/facturx.py111
3 files changed, 105 insertions, 12 deletions
diff --git a/README.rst b/README.rst
index f3f1bb0..e03f8dd 100644
--- a/README.rst
+++ b/README.rst
@@ -65,6 +65,10 @@ Contributors
Changelog
=========
+* Version 1.1 dated 2019-04-22
+
+ * Improve support for embedded files extraction by adding support for /Kids
+
* Version 1.0 dated 2019-01-26
* Use PyPDF4 instead of PyPDF2, because there are no new releases of PyPDF2 since May 2016 (cf https://github.com/mstamy2/PyPDF2/wiki/State-of-PyPDF2-and-Future-Plans), and we need a recent version of PyPDF2 to be able to generate fully compliant PDF/A-3 files.
diff --git a/facturx/_version.py b/facturx/_version.py
index e46aee1..439eb0c 100644
--- a/facturx/_version.py
+++ b/facturx/_version.py
@@ -1 +1 @@
-__version__ = '0.9'
+__version__ = '1.1'
diff --git a/facturx/facturx.py b/facturx/facturx.py
index 526d324..b2b449a 100644
--- a/facturx/facturx.py
+++ b/facturx/facturx.py
@@ -35,7 +35,7 @@ from tempfile import NamedTemporaryFile
from datetime import datetime
from PyPDF4 import PdfFileWriter, PdfFileReader
from PyPDF4.generic import DictionaryObject, DecodedStreamObject,\
- NameObject, createStringObject, ArrayObject
+ NameObject, createStringObject, ArrayObject, IndirectObject
from PyPDF4.utils import b_
from pkg_resources import resource_filename
import os.path
@@ -152,6 +152,88 @@ def check_facturx_xsd(
return True
+def _get_dict_entry(node, entry):
+ if not isinstance(node, dict):
+ raise ValueError('The node must be a dict')
+ dict_entry = node.get(entry)
+ if isinstance(dict_entry, dict):
+ return dict_entry
+ elif isinstance(dict_entry, IndirectObject):
+ res_dict_entry = dict_entry.getObject()
+ if isinstance(res_dict_entry, dict):
+ return res_dict_entry
+ else:
+ return False
+ else:
+ return False
+
+
+def _parse_embeddedfiles_kids_node(kids_node, level, res):
+ if level not in [1, 2]:
+ raise ValueError('Level argument should be 1 or 2')
+ if not isinstance(kids_node, list):
+ logger.error(
+ 'The /Kids entry of the EmbeddedFiles name tree must '
+ 'be an array')
+ return False
+ logger.debug("kids_node=%s", kids_node)
+ for kid_entry in kids_node:
+ if not isinstance(kid_entry, IndirectObject):
+ logger.error(
+ 'The /Kids entry of the EmbeddedFiles name tree '
+ 'must be a list of IndirectObjects')
+ return False
+ kids_node = kid_entry.getObject()
+ logger.debug('kids_node=%s', kids_node)
+ if not isinstance(kids_node, dict):
+ logger.error(
+ 'The /Kids entry of the EmbeddedFiles name tree '
+ 'must be a list of IndirectObjects that point to '
+ 'dict ojects')
+ return False
+ if '/Names' in kids_node:
+ if not isinstance(kids_node['/Names'], list):
+ logger.error(
+ 'The /Names entry in EmbeddedFiles must be an array')
+ res += kids_node['/Names']
+ elif '/Kids' in kids_node and level == 1:
+ kids_node_l2 = kids_node['/Kids']
+ _parse_embeddedfiles_kids_node(kids_node_l2, 2, res)
+ else:
+ logger.error('/Kids node should have a /Names or /Kids entry')
+ return False
+ return True
+
+
+def _get_embeddedfiles(embeddedfiles_node):
+ if not isinstance(embeddedfiles_node, dict):
+ raise ValueError('The EmbeddedFiles node must be a dict')
+ res = []
+ if '/Names' in embeddedfiles_node:
+ if not isinstance(embeddedfiles_node['/Names'], list):
+ logger.error(
+ 'The /Names entry of the EmbeddedFiles name tree must '
+ 'be an array')
+ return False
+ res = embeddedfiles_node['/Names']
+ elif '/Kids' in embeddedfiles_node:
+ kids_node = embeddedfiles_node['/Kids']
+ parse_result = _parse_embeddedfiles_kids_node(kids_node, 1, res)
+ if parse_result is False:
+ return (None, None)
+ else:
+ logger.error(
+ 'The EmbeddedFiles name tree should have either a /Names '
+ 'or a /Kids entry')
+ return False
+ if len(res) % 2 != 0:
+ logger.error(
+ 'The EmbeddedFiles name tree should point to an even number of '
+ 'elements')
+ return False
+ return res
+
+
def get_facturx_xml_from_pdf(pdf_invoice, check_xsd=True):
logger.debug(
'get_facturx_xml_from_pdf with factur-x lib %s', __version__)
@@ -168,17 +250,24 @@ def get_facturx_xml_from_pdf(pdf_invoice, check_xsd=True):
"The first argument of the method get_facturx_xml_from_pdf must "
"be either a string or a file (it is a %s)." % type(pdf_invoice))
xml_string = xml_filename = False
+ pdf = PdfFileReader(pdf_file)
+ pdf_root = pdf.trailer['/Root'] # = Catalog
+ logger.debug('pdf_root=%s', pdf_root)
+ catalog_name = _get_dict_entry(pdf_root, '/Names')
+ if not catalog_name:
+ logger.info('No Names entry in Catalog')
+ return (None, None)
+ embeddedfiles_node = _get_dict_entry(catalog_name, '/EmbeddedFiles')
+ if not embeddedfiles_node:
+ logger.info('No EmbeddedFiles entry in the /Names of the Catalog')
+ return (None, None)
+ embeddedfiles = _get_embeddedfiles(embeddedfiles_node)
+ logger.debug('embeddedfiles=%s', embeddedfiles)
+ if not embeddedfiles:
+ return (None, None)
+ embeddedfiles_by_two = list(zip(embeddedfiles, embeddedfiles[1:]))[::2]
+ logger.debug('embeddedfiles_by_two=%s', embeddedfiles_by_two)
try:
- pdf = PdfFileReader(pdf_file)
- pdf_root = pdf.trailer['/Root']
- logger.debug('pdf_root=%s', pdf_root)
- embeddedfiles = pdf_root['/Names']['/EmbeddedFiles']['/Names']
- logger.debug('embeddedfiles=%s', embeddedfiles)
- # embeddedfiles must contain an even number of elements
- if len(embeddedfiles) % 2 != 0:
- raise
- embeddedfiles_by_two = list(zip(embeddedfiles, embeddedfiles[1:]))[::2]
- logger.debug('embeddedfiles_by_two=%s', embeddedfiles_by_two)
for (filename, file_obj) in embeddedfiles_by_two:
logger.debug('found filename=%s', filename)
if filename in (FACTURX_FILENAME, 'ZUGFeRD-invoice.xml'):