diff --git a/README.rst b/README.rst index 2892026..ef33210 100644 --- a/README.rst +++ b/README.rst @@ -52,3 +52,22 @@ Contributors ============ * Alexis de Lattre + +Changelog +========= + +* Version 0.5 dated 2018-03-29 + + * Fix XMP metadata structure + * Now fully PDF/A-3 compliant when the input PDF file is PDF/A compliant (tested with veraPDF). This implied copying /OutputIntents and /ID datas from source PDF to Factur-X PDF. + * Fix support for additionnal attachments: they can now all be saved with Acrobat Reader + * Improve XML extraction from PDF Factur-x file + +* Version 0.4 dated 2018-03-27 + + * Factur-x specs say /AFRelationship must be /Data (and not /Alternative) + * Update Factur-X XSD to v1.0 final + * Add support for additionnal attachments + * Add factur-x lib version in Creator metadata table + * Add /PageMode = /UseAttachments, so that the attachments are displayed by default when opening Factur-X PDF invoice with Acrobat Reader + * Improve and enrich PDF objects (ModDate, CheckSum, Size) diff --git a/bin/facturx-pdfgen b/bin/facturx-pdfgen index 090a0bb..97f72f5 100755 --- a/bin/facturx-pdfgen +++ b/bin/facturx-pdfgen @@ -128,8 +128,11 @@ if __name__ == '__main__': "metadata only apply if none of the meta-* arguments are used." epilog = "Author: %s\n\nVersion: %s" % (__author__, __version__) description = "This script generate a Factur-X PDF invoice from a "\ - "regular PDF invoice and a Factur-X XML file."\ - "It can also include additional embedded files in the PDF." + "regular PDF/A invoice and a Factur-X XML file."\ + "It can also include additional embedded files in the PDF."\ + "To generate a valid PDF/A-3 invoice as requested by the "\ + "Factur-X standard, you need to give a valid PDF/A "\ + "regular invoice as input." parser = OptionParser(usage=usage, epilog=epilog, description=description) for option in options: param = option['names'] diff --git a/facturx/_version.py b/facturx/_version.py index 58d168b..5a6f84c 100644 --- a/facturx/_version.py +++ b/facturx/_version.py @@ -1 +1 @@ -__version__ = '0.4' +__version__ = '0.5' diff --git a/facturx/facturx.py b/facturx/facturx.py index 2e86fe2..d21d835 100644 --- a/facturx/facturx.py +++ b/facturx/facturx.py @@ -27,6 +27,7 @@ # TODO list: # - have both python2 and python3 support # - add automated tests (currently, we only have tests at odoo module level) +# - keep original metadata by copy of pdf_tailer[/Info] ? from ._version import __version__ from io import BytesIO @@ -163,23 +164,29 @@ def get_facturx_xml_from_pdf(pdf_invoice, check_xsd=True): pdf_root = pdf.trailer['/Root'] logger.debug('pdf_root=%s', pdf_root) embeddedfiles = pdf_root['/Names']['/EmbeddedFiles']['/Names'] - i = 0 - for embeddedfile in embeddedfiles[:-1]: - if embeddedfile in (FACTURX_FILENAME, 'ZUGFeRD-invoice.xml'): - xml_file_dict = embeddedfiles[i+1].getObject() + logger.debug('embeddedfiles=%s', embeddedfiles) + # embeddedfiles must contain an even number of elements + if len(embeddedfiles) % 2 != 0: + raise + embeddedfiles_by_two = zip(embeddedfiles, embeddedfiles[1:])[::2] + logger.debug('embeddedfiles_by_two=%s', embeddedfiles_by_two) + for (filename, file_obj) in embeddedfiles_by_two: + logger.debug('found filename=%s', filename) + if filename in (FACTURX_FILENAME, 'ZUGFeRD-invoice.xml'): + xml_file_dict = file_obj.getObject() logger.debug('xml_file_dict=%s', xml_file_dict) tmp_xml_string = xml_file_dict['/EF']['/F'].getData() xml_root = etree.fromstring(tmp_xml_string) logger.info( 'A valid XML file %s has been found in the PDF file', - embeddedfile) + filename) if check_xsd: check_facturx_xsd(xml_root) xml_string = tmp_xml_string - xml_filename = embeddedfile + xml_filename = filename else: xml_string = tmp_xml_string - xml_filename = embeddedfile + xml_filename = filename break except: logger.error('No valid XML file found in the PDF') @@ -220,6 +227,7 @@ def _prepare_pdf_metadata_txt(pdf_metadata): def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata): + nsmap_x = {'x': 'adobe:ns:meta/'} nsmap_rdf = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'} nsmap_dc = {'dc': 'http://purl.org/dc/elements/1.1/'} nsmap_pdf = {'pdf': 'http://ns.adobe.com/pdf/1.3/'} @@ -227,6 +235,7 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata): nsmap_pdfaid = {'pdfaid': 'http://www.aiim.org/pdfa/ns/id/'} nsmap_fx = { 'fx': 'urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0#'} + ns_x = '{%s}' % nsmap_x['x'] ns_dc = '{%s}' % nsmap_dc['dc'] ns_rdf = '{%s}' % nsmap_rdf['rdf'] ns_pdf = '{%s}' % nsmap_pdf['pdf'] @@ -235,16 +244,18 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata): ns_fx = '{%s}' % nsmap_fx['fx'] ns_xml = '{http://www.w3.org/XML/1998/namespace}' - root = etree.Element(ns_rdf + 'RDF', nsmap=nsmap_rdf) + root = etree.Element(ns_x + 'xmpmeta', nsmap=nsmap_x) + rdf = etree.SubElement( + root, ns_rdf + 'RDF', nsmap=nsmap_rdf) desc_pdfaid = etree.SubElement( - root, ns_rdf + 'Description', nsmap=nsmap_pdfaid) + rdf, ns_rdf + 'Description', nsmap=nsmap_pdfaid) desc_pdfaid.set(ns_rdf + 'about', '') etree.SubElement( desc_pdfaid, ns_pdfaid + 'part').text = '3' etree.SubElement( desc_pdfaid, ns_pdfaid + 'conformance').text = 'B' desc_dc = etree.SubElement( - root, ns_rdf + 'Description', nsmap=nsmap_dc) + rdf, ns_rdf + 'Description', nsmap=nsmap_dc) desc_dc.set(ns_rdf + 'about', '') dc_title = etree.SubElement(desc_dc, ns_dc + 'title') dc_title_alt = etree.SubElement(dc_title, ns_rdf + 'Alt') @@ -263,13 +274,13 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata): dc_desc_alt_li.text = pdf_metadata.get('subject', '') dc_desc_alt_li.set(ns_xml + 'lang', 'x-default') desc_adobe = etree.SubElement( - root, ns_rdf + 'Description', nsmap=nsmap_pdf) + rdf, ns_rdf + 'Description', nsmap=nsmap_pdf) desc_adobe.set(ns_rdf + 'about', '') producer = etree.SubElement( desc_adobe, ns_pdf + 'Producer') producer.text = 'PyPDF2' desc_xmp = etree.SubElement( - root, ns_rdf + 'Description', nsmap=nsmap_xmp) + rdf, ns_rdf + 'Description', nsmap=nsmap_xmp) desc_xmp.set(ns_rdf + 'about', '') creator = etree.SubElement( desc_xmp, ns_xmp + 'CreatorTool') @@ -284,10 +295,10 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata): # The Factur-X extension schema must be embedded into each PDF document facturx_ext_schema_desc_xpath = facturx_ext_schema_root.xpath( '//rdf:Description', namespaces=nsmap_rdf) - root.append(facturx_ext_schema_desc_xpath[1]) + rdf.append(facturx_ext_schema_desc_xpath[1]) # Now is the Factur-X description tag facturx_desc = etree.SubElement( - root, ns_rdf + 'Description', nsmap=nsmap_fx) + rdf, ns_rdf + 'Description', nsmap=nsmap_fx) facturx_desc.set(ns_rdf + 'about', '') facturx_desc.set( ns_fx + 'ConformanceLevel', FACTURX_LEVEL2xmp[facturx_level]) @@ -295,11 +306,16 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata): facturx_desc.set(ns_fx + 'DocumentType', 'INVOICE') facturx_desc.set(ns_fx + 'Version', '1.0') + # TODO: should be UTF-16be ?? xml_str = etree.tostring( root, pretty_print=True, encoding="UTF-8", xml_declaration=False) + head = u''.encode( + 'utf-8') + tail = u''.encode('utf-8') + xml_final_str = head + xml_str + tail logger.debug('metadata XML:') - logger.debug(xml_str) - return xml_str + logger.debug(xml_final_str) + return xml_final_str # def createByteObject(string): @@ -308,7 +324,8 @@ def _prepare_pdf_metadata_xml(facturx_level, pdf_metadata): # return ByteStringObject(x) -def _filespec_additional_attachments(pdf_filestream, file_dict, file_bin): +def _filespec_additional_attachments( + pdf_filestream, name_arrayobj_cdict, file_dict, file_bin): filename = file_dict['filename'] logger.debug('_filespec_additional_attachments filename=%s', filename) mod_date_pdf = _get_pdf_timestamp(file_dict['mod_date']) @@ -344,12 +361,12 @@ def _filespec_additional_attachments(pdf_filestream, file_dict, file_bin): NameObject("/UF"): fname_obj, }) filespec_obj = pdf_filestream._addObject(filespec_dict) - return (filespec_obj, fname_obj) + name_arrayobj_cdict[fname_obj] = filespec_obj def _facturx_update_metadata_add_attachment( pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level, - additional_attachments={}): + output_intents=[], additional_attachments={}): '''This method is inspired from the code of the addAttachment() method of the PyPDF2 lib''' # The entry for the file @@ -385,18 +402,13 @@ def _facturx_update_metadata_add_attachment( NameObject("/UF"): fname_obj, }) filespec_obj = pdf_filestream._addObject(filespec_dict) - name_arrayobj_content = [(fname_obj, filespec_obj)] + name_arrayobj_cdict = {fname_obj: filespec_obj} for attach_bin, attach_dict in additional_attachments.items(): - additional_filespec_obj, additional_fname_obj =\ - _filespec_additional_attachments( - pdf_filestream, attach_dict, attach_bin) - name_arrayobj_content.append(( - additional_fname_obj, - additional_filespec_obj, - )) - logger.debug('name_arrayobj_content=%s', name_arrayobj_content) + _filespec_additional_attachments( + pdf_filestream, name_arrayobj_cdict, attach_dict, attach_bin) + logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict) name_arrayobj_content_sort = list( - sorted(name_arrayobj_content, key=lambda x: x[0])) + sorted(name_arrayobj_cdict.items(), key=lambda x: x[0])) logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort) name_arrayobj_content_final = [] for (fname_obj, filespec_obj) in name_arrayobj_content_sort: @@ -404,14 +416,23 @@ def _facturx_update_metadata_add_attachment( embedded_files_names_dict = DictionaryObject({ NameObject("/Names"): ArrayObject(name_arrayobj_content_final), }) - embedded_files_names_obj = pdf_filestream._addObject( - embedded_files_names_dict) # Then create the entry for the root, as it needs a # reference to the Filespec embedded_files_dict = DictionaryObject({ - NameObject("/EmbeddedFiles"): embedded_files_names_obj, + NameObject("/EmbeddedFiles"): embedded_files_names_dict, }) - embedded_files_obj = pdf_filestream._addObject(embedded_files_dict) + res_output_intents = [] + logger.debug('output_intents=%s', output_intents) + for output_intent_dict, dest_output_profile_dict in output_intents: + dest_output_profile_obj = pdf_filestream._addObject( + dest_output_profile_dict) + # TODO detect if there are no other objects in output_intent_dest_obj + # than /DestOutputProfile + output_intent_dict.update({ + NameObject("/DestOutputProfile"): dest_output_profile_obj, + }) + output_intent_obj = pdf_filestream._addObject(output_intent_dict) + res_output_intents.append(output_intent_obj) # Update the root metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata) metadata_file_entry = DecodedStreamObject() @@ -426,10 +447,15 @@ def _facturx_update_metadata_add_attachment( pdf_filestream._root_object.update({ NameObject("/AF"): af_value_obj, NameObject("/Metadata"): metadata_obj, - NameObject("/Names"): embedded_files_obj, + NameObject("/Names"): embedded_files_dict, # show attachments when opening PDF NameObject("/PageMode"): NameObject("/UseAttachments"), }) + logger.debug('res_output_intents=%s', res_output_intents) + if res_output_intents: + pdf_filestream._root_object.update({ + NameObject("/OutputIntents"): ArrayObject(res_output_intents), + }) metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata) pdf_filestream.addMetadata(metadata_txt_dict) @@ -520,6 +546,24 @@ def get_facturx_flavor(facturx_xml_etree): return flavor +def _get_original_output_intents(original_pdf): + output_intents = [] + try: + pdf_root = original_pdf.trailer['/Root'] + ori_output_intents = pdf_root['/OutputIntents'] + logger.debug('output_intents_list=%s', ori_output_intents) + for ori_output_intent in ori_output_intents: + ori_output_intent_dict = ori_output_intent.getObject() + logger.debug('ori_output_intents_dict=%s', ori_output_intent_dict) + dest_output_profile_dict =\ + ori_output_intent_dict['/DestOutputProfile'].getObject() + output_intents.append( + (ori_output_intent_dict, dest_output_profile_dict)) + except: + pass + return output_intents + + def generate_facturx_from_binary( pdf_invoice, facturx_xml, facturx_level='autodetect', check_xsd=True, pdf_metadata=None): @@ -704,10 +748,19 @@ def generate_facturx_from_file( check_facturx_xsd( xml_string, flavor='factur-x', facturx_level=facturx_level) original_pdf = PdfFileReader(pdf_invoice) + # Extract /OutputIntents obj from original invoice + output_intents = _get_original_output_intents(original_pdf) new_pdf_filestream = PdfFileWriter() new_pdf_filestream.appendPagesFromReader(original_pdf) + + original_pdf_id = original_pdf.trailer.get('/ID') + logger.debug('original_pdf_id=%s', original_pdf_id) + if original_pdf_id: + new_pdf_filestream._ID = original_pdf_id + # else : generate some ? _facturx_update_metadata_add_attachment( new_pdf_filestream, xml_string, pdf_metadata, facturx_level, + output_intents=output_intents, additional_attachments=additional_attachments_read) if output_pdf_file: with open(output_pdf_file, 'wb') as output_f: