Fix XMP metadata structure

Now fully PDF/A-3 compliant when the input PDF file is PDF/A compliant (tested with veraPDF). This implied copying /OutputIntents and /ID datas from source PDF to Factur-X PDF. Fix support for additionnal attachments: they can now all be saved with Acrobat Reader Improve XML extraction from PDF Factur-x file
2018-03-29 00:59:32 +02:00 · 2018-03-29 00:59:32 +02:00 · 39fe7e9289
parent a3ebfa4165
commit 39fe7e9289
4 changed files with 112 additions and 37 deletions
--- a/README.rst
+++ b/README.rst
@ -52,3 +52,22 @@ Contributors
 ============

 * Alexis de Lattre <alexis.delattre@akretion.com>
+
+Changelog
+=========
+
+* Version 0.5 dated 2018-03-29
+
+ * Fix XMP metadata structure
+ * Now fully PDF/A-3 compliant when the input PDF file is PDF/A compliant (tested with veraPDF). This implied copying /OutputIntents and /ID datas from source PDF to Factur-X PDF.
+ * Fix support for additionnal attachments: they can now all be saved with Acrobat Reader
+ * Improve XML extraction from PDF Factur-x file
+
+* Version 0.4 dated 2018-03-27
+
+  * Factur-x specs say /AFRelationship must be /Data (and not /Alternative)
+  * Update Factur-X XSD to v1.0 final
+  * Add support for additionnal attachments
+  * Add factur-x lib version in Creator metadata table
+  * Add /PageMode = /UseAttachments, so that the attachments are displayed by default when opening Factur-X PDF invoice with Acrobat Reader
+  * Improve and enrich PDF objects (ModDate, CheckSum, Size)
--- a/bin/facturx-pdfgen
+++ b/bin/facturx-pdfgen
@ -128,8 +128,11 @@ if __name__ == '__main__':
            "metadata only apply if none of the meta-* arguments are used."
    epilog = "Author: %s\n\nVersion: %s" % (__author__, __version__)
    description = "This script generate a Factur-X PDF invoice from a "\
-                  "regular PDF invoice and a Factur-X XML file."\
-                  "It can also include additional embedded files in the PDF."
+                  "regular PDF/A invoice and a Factur-X XML file."\
+                  "It can also include additional embedded files in the PDF."\
+                  "To generate a valid PDF/A-3 invoice as requested by the "\
+                  "Factur-X standard, you need to give a valid PDF/A "\
+                  "regular invoice as input."
    parser = OptionParser(usage=usage, epilog=epilog, description=description)
    for option in options:
        param = option['names']
--- a/facturx/_version.py
+++ b/facturx/_version.py
@ -1 +1 @@
-__version__ = '0.4'
+__version__ = '0.5'
--- a/facturx/facturx.py
+++ b/facturx/facturx.py
@ -27,6 +27,7 @@
 # TODO list:
 # - have both python2 and python3 support
 # - add automated tests (currently, we only have tests at odoo module level)
+# - keep original metadata by copy of pdf_tailer[/Info] ?

 from ._version import __version__
 from io import BytesIO
@ -163,23 +164,29 @@ def get_facturx_xml_from_pdf(pdf_invoice, check_xsd=True):
        pdf_root = pdf.trailer['/Root']
        logger.debug('pdf_root=%s', pdf_root)
        embeddedfiles = pdf_root['/Names']['/EmbeddedFiles']['/Names']
-        i = 0
-        for embeddedfile in embeddedfiles[:-1]:
-            if embeddedfile in (FACTURX_FILENAME, 'ZUGFeRD-invoice.xml'):
-                xml_file_dict = embeddedfiles[i+1].getObject()
+        logger.debug('embeddedfiles=%s', embeddedfiles)
+        # embeddedfiles must contain an even number of elements
+        if len(embeddedfiles) % 2 != 0:
+            raise
+        embeddedfiles_by_two = zip(embeddedfiles, embeddedfiles[1:])[::2]
+        logger.debug('embeddedfiles_by_two=%s', embeddedfiles_by_two)
+        for (filename, file_obj) in embeddedfiles_by_two:
+            logger.debug('found filename=%s', filename)
+            if filename in (FACTURX_FILENAME, 'ZUGFeRD-invoice.xml'):
+                xml_file_dict = file_obj.getObject()
                logger.debug('xml_file_dict=%s', xml_file_dict)
                tmp_xml_string = xml_file_dict['/EF']['/F'].getData()
                xml_root = etree.fromstring(tmp_xml_string)
                logger.info(
                    'A valid XML file %s has been found in the PDF file',
-                    embeddedfile)
+                    filename)
                if check_xsd:
                    check_facturx_xsd(xml_root)
                    xml_string = tmp_xml_string
-                    xml_filename = embeddedfile
+                    xml_filename = filename
                else:
                    xml_string = tmp_xml_string
-                    xml_filename = embeddedfile
+                    xml_filename = filename
                break
    except:
        logger.error('No valid XML file found in the PDF')
@ -220,6 +227,7 @@ def _prepare_pdf_metadata_txt(pdf_metadata):


 def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata):
+    nsmap_x = {'x': 'adobe:ns:meta/'}
    nsmap_rdf = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'}
    nsmap_dc = {'dc': 'http://purl.org/dc/elements/1.1/'}
    nsmap_pdf = {'pdf': 'http://ns.adobe.com/pdf/1.3/'}
@ -227,6 +235,7 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata):
    nsmap_pdfaid = {'pdfaid': 'http://www.aiim.org/pdfa/ns/id/'}
    nsmap_fx = {
        'fx': 'urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#'}
+    ns_x = '{%s}' % nsmap_x['x']
    ns_dc = '{%s}' % nsmap_dc['dc']
    ns_rdf = '{%s}' % nsmap_rdf['rdf']
    ns_pdf = '{%s}' % nsmap_pdf['pdf']
@ -235,16 +244,18 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata):
    ns_fx = '{%s}' % nsmap_fx['fx']
    ns_xml = '{http://www.w3.org/XML/1998/namespace}'

-    root = etree.Element(ns_rdf + 'RDF', nsmap=nsmap_rdf)
+    root = etree.Element(ns_x + 'xmpmeta', nsmap=nsmap_x)
+    rdf = etree.SubElement(
+        root, ns_rdf + 'RDF', nsmap=nsmap_rdf)
    desc_pdfaid = etree.SubElement(
-        root, ns_rdf + 'Description', nsmap=nsmap_pdfaid)
+        rdf, ns_rdf + 'Description', nsmap=nsmap_pdfaid)
    desc_pdfaid.set(ns_rdf + 'about', '')
    etree.SubElement(
        desc_pdfaid, ns_pdfaid + 'part').text = '3'
    etree.SubElement(
        desc_pdfaid, ns_pdfaid + 'conformance').text = 'B'
    desc_dc = etree.SubElement(
-        root, ns_rdf + 'Description', nsmap=nsmap_dc)
+        rdf, ns_rdf + 'Description', nsmap=nsmap_dc)
    desc_dc.set(ns_rdf + 'about', '')
    dc_title = etree.SubElement(desc_dc, ns_dc + 'title')
    dc_title_alt = etree.SubElement(dc_title, ns_rdf + 'Alt')
@ -263,13 +274,13 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata):
    dc_desc_alt_li.text = pdf_metadata.get('subject', '')
    dc_desc_alt_li.set(ns_xml + 'lang', 'x-default')
    desc_adobe = etree.SubElement(
-        root, ns_rdf + 'Description', nsmap=nsmap_pdf)
+        rdf, ns_rdf + 'Description', nsmap=nsmap_pdf)
    desc_adobe.set(ns_rdf + 'about', '')
    producer = etree.SubElement(
        desc_adobe, ns_pdf + 'Producer')
    producer.text = 'PyPDF2'
    desc_xmp = etree.SubElement(
-        root, ns_rdf + 'Description', nsmap=nsmap_xmp)
+        rdf, ns_rdf + 'Description', nsmap=nsmap_xmp)
    desc_xmp.set(ns_rdf + 'about', '')
    creator = etree.SubElement(
        desc_xmp, ns_xmp + 'CreatorTool')
@ -284,10 +295,10 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata):
    # The Factur-X extension schema must be embedded into each PDF document
    facturx_ext_schema_desc_xpath = facturx_ext_schema_root.xpath(
        '//rdf:Description', namespaces=nsmap_rdf)
-    root.append(facturx_ext_schema_desc_xpath[1])
+    rdf.append(facturx_ext_schema_desc_xpath[1])
    # Now is the Factur-X description tag
    facturx_desc = etree.SubElement(
-        root, ns_rdf + 'Description', nsmap=nsmap_fx)
+        rdf, ns_rdf + 'Description', nsmap=nsmap_fx)
    facturx_desc.set(ns_rdf + 'about', '')
    facturx_desc.set(
        ns_fx + 'ConformanceLevel', FACTURX_LEVEL2xmp[facturx_level])
@ -295,11 +306,16 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata):
    facturx_desc.set(ns_fx + 'DocumentType', 'INVOICE')
    facturx_desc.set(ns_fx + 'Version', '1.0')

+    # TODO: should be UTF-16be ??
    xml_str = etree.tostring(
        root, pretty_print=True, encoding="UTF-8", xml_declaration=False)
+    head = u'<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>'.encode(
+        'utf-8')
+    tail = u'<?xpacket end="w"?>'.encode('utf-8')
+    xml_final_str = head + xml_str + tail
    logger.debug('metadata XML:')
-    logger.debug(xml_str)
-    return xml_str
+    logger.debug(xml_final_str)
+    return xml_final_str


 # def createByteObject(string):
@ -308,7 +324,8 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata):
 #    return ByteStringObject(x)


-def _filespec_additional_attachments(pdf_filestream, file_dict, file_bin):
+def _filespec_additional_attachments(
+        pdf_filestream, name_arrayobj_cdict, file_dict, file_bin):
    filename = file_dict['filename']
    logger.debug('_filespec_additional_attachments filename=%s', filename)
    mod_date_pdf = _get_pdf_timestamp(file_dict['mod_date'])
@ -344,12 +361,12 @@ def _filespec_additional_attachments(pdf_filestream, file_dict, file_bin):
        NameObject("/UF"): fname_obj,
        })
    filespec_obj = pdf_filestream._addObject(filespec_dict)
-    return (filespec_obj, fname_obj)
+    name_arrayobj_cdict[fname_obj] = filespec_obj


 def _facturx_update_metadata_add_attachment(
        pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level,
-        additional_attachments={}):
+        output_intents=[], additional_attachments={}):
    '''This method is inspired from the code of the addAttachment()
    method of the PyPDF2 lib'''
    # The entry for the file
@ -385,18 +402,13 @@ def _facturx_update_metadata_add_attachment(
        NameObject("/UF"): fname_obj,
        })
    filespec_obj = pdf_filestream._addObject(filespec_dict)
-    name_arrayobj_content = [(fname_obj, filespec_obj)]
+    name_arrayobj_cdict = {fname_obj: filespec_obj}
    for attach_bin, attach_dict in additional_attachments.items():
-        additional_filespec_obj, additional_fname_obj =\
-            _filespec_additional_attachments(
-                pdf_filestream, attach_dict, attach_bin)
-        name_arrayobj_content.append((
-            additional_fname_obj,
-            additional_filespec_obj,
-            ))
-    logger.debug('name_arrayobj_content=%s', name_arrayobj_content)
+        _filespec_additional_attachments(
+            pdf_filestream, name_arrayobj_cdict, attach_dict, attach_bin)
+    logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict)
    name_arrayobj_content_sort = list(
-        sorted(name_arrayobj_content, key=lambda x: x[0]))
+        sorted(name_arrayobj_cdict.items(), key=lambda x: x[0]))
    logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort)
    name_arrayobj_content_final = []
    for (fname_obj, filespec_obj) in name_arrayobj_content_sort:
@ -404,14 +416,23 @@ def _facturx_update_metadata_add_attachment(
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
        })
-    embedded_files_names_obj = pdf_filestream._addObject(
-        embedded_files_names_dict)
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
-        NameObject("/EmbeddedFiles"): embedded_files_names_obj,
+        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
        })
-    embedded_files_obj = pdf_filestream._addObject(embedded_files_dict)
+    res_output_intents = []
+    logger.debug('output_intents=%s', output_intents)
+    for output_intent_dict, dest_output_profile_dict in output_intents:
+        dest_output_profile_obj = pdf_filestream._addObject(
+            dest_output_profile_dict)
+        # TODO detect if there are no other objects in output_intent_dest_obj
+        # than /DestOutputProfile
+        output_intent_dict.update({
+            NameObject("/DestOutputProfile"): dest_output_profile_obj,
+            })
+        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
+        res_output_intents.append(output_intent_obj)
    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
@ -426,10 +447,15 @@ def _facturx_update_metadata_add_attachment(
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
-        NameObject("/Names"): embedded_files_obj,
+        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
        })
+    logger.debug('res_output_intents=%s', res_output_intents)
+    if res_output_intents:
+        pdf_filestream._root_object.update({
+            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
+        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)

@ -520,6 +546,24 @@ def get_facturx_flavor(facturx_xml_etree):
    return flavor


+def _get_original_output_intents(original_pdf):
+    output_intents = []
+    try:
+        pdf_root = original_pdf.trailer['/Root']
+        ori_output_intents = pdf_root['/OutputIntents']
+        logger.debug('output_intents_list=%s', ori_output_intents)
+        for ori_output_intent in ori_output_intents:
+            ori_output_intent_dict = ori_output_intent.getObject()
+            logger.debug('ori_output_intents_dict=%s', ori_output_intent_dict)
+            dest_output_profile_dict =\
+                ori_output_intent_dict['/DestOutputProfile'].getObject()
+            output_intents.append(
+                (ori_output_intent_dict, dest_output_profile_dict))
+    except:
+        pass
+    return output_intents
+
+
 def generate_facturx_from_binary(
        pdf_invoice, facturx_xml, facturx_level='autodetect',
        check_xsd=True, pdf_metadata=None):
@ -704,10 +748,19 @@ def generate_facturx_from_file(
        check_facturx_xsd(
            xml_string, flavor='factur-x', facturx_level=facturx_level)
    original_pdf = PdfFileReader(pdf_invoice)
+    # Extract /OutputIntents obj from original invoice
+    output_intents = _get_original_output_intents(original_pdf)
    new_pdf_filestream = PdfFileWriter()
    new_pdf_filestream.appendPagesFromReader(original_pdf)
+
+    original_pdf_id = original_pdf.trailer.get('/ID')
+    logger.debug('original_pdf_id=%s', original_pdf_id)
+    if original_pdf_id:
+        new_pdf_filestream._ID = original_pdf_id
+        # else : generate some ?
    _facturx_update_metadata_add_attachment(
        new_pdf_filestream, xml_string, pdf_metadata, facturx_level,
+        output_intents=output_intents,
        additional_attachments=additional_attachments_read)
    if output_pdf_file:
        with open(output_pdf_file, 'wb') as output_f: