pdf: add fill-form endpoint (#73544)
gitea-wip/passerelle/pipeline/pr-main This commit looks good Details
gitea/passerelle/pipeline/head Something is wrong with the build of this commit Details

This commit is contained in:
Thomas NOËL 2023-02-01 17:01:54 +01:00
parent 91fa126653
commit 0d9e35cc2f
4 changed files with 340 additions and 9 deletions

View File

@ -0,0 +1,28 @@
# Generated by Django 2.2.26 on 2023-02-01 17:19
from django.db import migrations, models
import passerelle.apps.pdf.models
import passerelle.utils.models
class Migration(migrations.Migration):
    """Add the ``fill_form_file`` file field to the ``resource`` model of the ``pdf`` app."""

    # Must run after the initial migration of the 'pdf' app.
    dependencies = [
        ('pdf', '0001_initial'),
    ]

    operations = [
        # New nullable/blank FileField: existing rows need no default value.
        migrations.AddField(
            model_name='resource',
            name='fill_form_file',
            field=models.FileField(
                blank=True,
                help_text='PDF file, used if not input-form in fill-form payload',
                null=True,
                upload_to=passerelle.utils.models.resource_file_upload_to,
                # Uploaded files are checked by the project's validate_pdf validator.
                validators=[passerelle.apps.pdf.models.validate_pdf],
                verbose_name='Fill Form default input file',
            ),
        ),
    ]

View File

@ -18,15 +18,19 @@ import base64
import os
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from collections import OrderedDict
from django.conf import settings
from django.core.exceptions import ValidationError
from django.db import models
from django.http.response import HttpResponse
from django.utils.translation import gettext_lazy as _
from passerelle.base.models import BaseResource
from passerelle.utils.api import endpoint
from passerelle.utils.jsonresponse import APIError
from passerelle.utils.models import resource_file_upload_to
PDF_FILE_OBJECT = {
'type': 'object',
@ -76,10 +80,50 @@ ASSEMBLE_SCHEMA = {
),
}
# JSON schema (draft-04) describing the request body of the 'fill-form' endpoint.
# 'filename' and 'fields' are mandatory; 'input-form' is optional here (the
# endpoint falls back to the connector's fill_form_file when it is absent).
FILL_FORM_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename', 'fields'],
    # NOTE(review): presumably lets callers send flattened keys such as
    # 'fields/Page1[0]/FirstName[0]' (as in the endpoint's input_example);
    # confirm against the project's handling of 'unflatten'.
    'unflatten': True,
    'properties': OrderedDict(
        {
            'filename': {
                'description': _('output PDF filename'),
                'type': 'string',
            },
            # Same file-object schema as the one declared in PDF_FILE_OBJECT.
            'input-form': PDF_FILE_OBJECT,
            'fields': {
                'description': _('hierarchical dictionary of fields'),
                'type': 'object',
            },
        }
    ),
}
def validate_pdf(fieldfile):
    """Validate that ``fieldfile`` starts with the ``%PDF-`` signature.

    The file is opened, its first five bytes are read and compared with the
    PDF magic marker.

    Raises:
        ValidationError: if the first five bytes are not ``b'%PDF-'``.
    """
    fieldfile.open()
    header = fieldfile.read(5)
    if header == b'%PDF-':
        return
    raise ValidationError(
        _('%(value)s is not a PDF file'),
        params={'value': fieldfile},
    )
class Resource(BaseResource):
category = _('Misc')
fill_form_file = models.FileField(
_('Fill Form default input file'),
upload_to=resource_file_upload_to,
help_text=_('PDF file, used if not input-form in fill-form payload'),
validators=[validate_pdf],
null=True,
blank=True,
)
class Meta:
verbose_name = _('PDF')
@ -137,3 +181,78 @@ class Resource(BaseResource):
response = HttpResponse(pdf_content, content_type='application/pdf')
response['Content-Disposition'] = 'attachment; filename="%s"' % filename
return response
@endpoint(
    name='fill-form',
    description=_('Fills the input PDF form with fields'),
    perm='can_access',
    methods=['post'],
    post={
        'request_body': {'schema': {'application/json': FILL_FORM_SCHEMA}},
        'input_example': {
            'filename': 'filled.pdf',
            'fields/Page1[0]/FirstName[0]': 'John',
            'fields/Page1[0]/LastName[0]': 'Doe',
            'fields/Page2[0]/Checkbox[0]': '0',
            'fields/Page2[0]/Checkbox[1]': '1',
        },
    },
)
def fill_form(self, request, post_data):
    """Fill a PDF form with the posted fields and return the resulting PDF.

    The input form is taken from ``post_data['input-form']['content']``
    (base64) when present and non-empty, otherwise from the connector's
    ``fill_form_file``. The fields are written to an XFDF file which is
    handed to ``self.run_pdftk`` together with the input PDF.

    Args:
        request: the incoming HTTP request (unused here).
        post_data: decoded JSON payload; must contain ``filename`` and
            ``fields`` (a possibly nested dict), may contain ``input-form``.

    Returns:
        An ``application/pdf`` HttpResponse served as an attachment named
        ``filename``.

    Raises:
        APIError: (HTTP 400) when no usable input form is available, or when
            the ``input-form`` content is not valid base64.
    """
    filename = post_data.pop('filename')
    fields = post_data.pop('fields')

    xfdf_root = ET.Element('xfdf')
    xfdf_root.attrib['xmlns'] = 'http://ns.adobe.com/xfdf/'
    xfdf_root.attrib['xml:space'] = 'preserve'
    xfdf_f = ET.SubElement(xfdf_root, 'f')
    xfdf_fields = ET.SubElement(xfdf_root, 'fields')

    def add_fields(element, fields):
        # A dict becomes nested <field name="..."> elements; anything else is
        # a leaf, written as a <value> holding its string representation.
        if isinstance(fields, dict):
            for key, subfields in fields.items():
                field = ET.SubElement(element, 'field')
                field.attrib['name'] = key
                add_fields(field, subfields)
        else:
            value = ET.SubElement(element, 'value')
            value.text = str(fields)

    add_fields(xfdf_fields, fields)

    with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-fill-form-' % self.id) as tmpdir:
        input_form = post_data.get('input-form')
        if isinstance(input_form, dict) and input_form.get('content'):
            # Undecodable content is a client error, not a server crash:
            # b64decode raises binascii.Error (a ValueError) or TypeError.
            try:
                input_content = base64.b64decode(input_form['content'])
            except (TypeError, ValueError):
                raise APIError("missing or bad 'input-form' property", http_status=400)
            input_filename = os.path.join(tmpdir, 'input-form.pdf')
            with open(input_filename, mode='wb') as fd:
                fd.write(input_content)
        elif self.fill_form_file:
            input_filename = self.fill_form_file.path
        else:
            raise APIError("missing or bad 'input-form' property", http_status=400)

        # create xfdf
        xfdf_filename = os.path.join(tmpdir, 'fields.xfdf')
        xfdf_f.attrib['href'] = input_filename
        if hasattr(ET, 'indent'):
            # ET.indent only exists on Python >= 3.9; it merely pretty-prints
            # the tree, so it is skipped on older interpreters.
            ET.indent(xfdf_root)
        with open(xfdf_filename, mode='wb') as fd:
            ET.ElementTree(xfdf_root).write(fd, encoding='UTF-8', xml_declaration=True)

        # call pdftk fill_form (inside the "with": the temporary files must still exist)
        pdf_content = self.run_pdftk(args=[input_filename, 'fill_form', xfdf_filename])

    response = HttpResponse(pdf_content, content_type='application/pdf')
    response['Content-Disposition'] = 'attachment; filename="%s"' % filename
    return response
def pdftk_dump_data_fields_utf8(self):
    """Return an HTML rendering of ``pdftk dump_data_fields_utf8`` for the default form.

    Each line of the dump is emitted after a ``<br>``; ``FieldName:`` lines
    are followed by the matching ``fields/...`` payload key (dots replaced by
    slashes) in bold. The result is rendered with the ``safe`` filter by the
    manager template, so every piece of text coming from pdftk is HTML-escaped
    here; only the ``<br>`` and ``<b>`` markup added by this method is raw.

    Returns:
        ``None`` when no ``fill_form_file`` is set, an escaped ``Error: ...``
        string when pdftk fails with an APIError, the HTML string otherwise.
    """
    from html import escape  # stdlib; imported locally so the block stands alone

    if not self.fill_form_file:
        return None
    try:
        dump = self.run_pdftk(args=[self.fill_form_file.path, 'dump_data_fields_utf8']).decode()
    except APIError as apierror:
        # The repr may embed raw pdftk output: escape it as well.
        return escape('Error: %r' % apierror)

    prefix = 'FieldName: '
    chunks = []
    for line in dump.splitlines():
        chunks.append('<br>%s' % escape(line))
        if line.startswith(prefix):
            field_path = line[len(prefix):].replace('.', '/')
            chunks.append(' → <b>fields/%s</b>' % escape(field_path))
    return ''.join(chunks)

View File

@ -0,0 +1,20 @@
{% extends "passerelle/manage/service_view.html" %}
{% load i18n passerelle %}
{# Adds a "dump fields" tab to the connector page; shown only to staff users and only when a default fill-form PDF is set. #}
{% block extra-tab-buttons %}
  {% if user.is_staff and object.fill_form_file %}
    <button role="tab" aria-selected="false" aria-controls="panel-dumpfields" id="tab-dumpfields"
            tabindex="-1">{% trans "Fill Form default PDF Fields" %}</button>
  {% endif %}
{% endblock %}
{% block extra-tab-panels %}
  {% if user.is_staff and object.fill_form_file %}
    <div id="panel-dumpfields" role="tabpanel" tabindex="-1" aria-labelledby="tab-dumpfields" hidden>
      <div>
        <p>{% blocktrans with file=object.fill_form_file %}PDFtk {{ file }} dump_data_fields_utf8 output{% endblocktrans %}</p>
        {# The method's output is marked "safe": auto-escaping is disabled for it, so the model method is responsible for the HTML it returns. #}
        <p>{{ object.pdftk_dump_data_fields_utf8|safe }}</p>
      </div>
    </div>
  {% endif %}
{% endblock %}

View File

@ -17,28 +17,38 @@
import base64
import os
import subprocess
import xml.etree.ElementTree as ET
from io import BytesIO
from unittest import mock
import pytest
from django.core.exceptions import ValidationError
from django.core.files import File
from django.urls import reverse
from pdfrw import PdfReader
from passerelle.apps.pdf.models import Resource
from tests.test_manager import login
from tests.utils import generic_endpoint_url, setup_access_rights
with open(os.path.join(os.path.dirname(__file__), 'data', 'minimal.pdf'), 'rb') as fd:
pdf_content = base64.b64encode(fd.read()).decode()
pdf_content = fd.read()
pdf_b64content = base64.b64encode(pdf_content).decode()
with open(os.path.join(os.path.dirname(__file__), 'data', 'pdf-form.pdf'), 'rb') as fd:
acroform_content = fd.read()
acroform_b64content = base64.b64encode(acroform_content).decode()
@pytest.fixture
def pdf(db):
return setup_access_rights(Resource.objects.create(slug='test'))
return setup_access_rights(Resource.objects.create(slug='test', title='test', description='test'))
@mock.patch('subprocess.check_output')
def test_pdf_assemble(mocked_check_output, app, pdf):
endpoint = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
payload = {'filename': 'foo.pdf', 'files/0': {'content': pdf_content}}
payload = {'filename': 'foo.pdf', 'files/0': {'content': pdf_b64content}}
resp = app.post_json(endpoint, params=payload, status=200)
assert resp.headers['content-type'] == 'application/pdf'
assert resp.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
@ -55,9 +65,9 @@ def test_pdf_assemble(mocked_check_output, app, pdf):
payload = {
'filename': 'bar.pdf',
'files/0': {'content': ''},
'files/1': {'content': pdf_content},
'files/1': {'content': pdf_b64content},
'files/2': None,
'files/3': pdf_content,
'files/3': pdf_b64content,
'files/4': '',
}
mocked_check_output.reset_mock()
@ -72,7 +82,7 @@ def test_pdf_assemble(mocked_check_output, app, pdf):
assert pdftk_call[2].endswith('/pdf-3.pdf') # file 2
# pdftk errors (faked)
payload = {'filename': 'out.pdf', 'files/0': {'content': pdf_content}}
payload = {'filename': 'out.pdf', 'files/0': {'content': pdf_b64content}}
mocked_check_output.reset_mock()
mocked_check_output.side_effect = subprocess.TimeoutExpired(cmd=[], timeout=20)
resp = app.post_json(endpoint, params=payload, status=200)
@ -112,18 +122,172 @@ def test_pdf_assemble(mocked_check_output, app, pdf):
resp = app.get(endpoint, status=405)
def test_pdf_real_pdftk_call(app, pdf, settings):
def test_pdf_real_pdftk_assemble(app, pdf, settings):
if not os.path.exists(settings.PDFTK_PATH):
pytest.skip('pdftk (%s) not found' % settings.PDFTK_PATH)
endpoint = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
payload = {
'filename': 'twopages.pdf',
'files/0': {'content': pdf_content},
'files/1': {'content': pdf_content},
'files/0': {'content': pdf_b64content},
'files/1': {'content': pdf_b64content},
}
resp = app.post_json(endpoint, params=payload, status=200)
assert resp.headers['content-type'] == 'application/pdf'
assert resp.headers['content-disposition'] == 'attachment; filename="twopages.pdf"'
assert resp.content[:5] == b'%PDF-'
assert PdfReader(fdata=resp.content).numPages == 2
@mock.patch('subprocess.check_output')
def test_pdf_fill_form(mocked_check_output, app, pdf):
    """Exercise the fill-form endpoint with ``subprocess.check_output`` mocked.

    Covers, in order: input form sent in the payload, default form stored on
    the connector, faked pdftk failures, invalid payloads, and a non-POST call.
    """
    endpoint = generic_endpoint_url('pdf', 'fill-form', slug=pdf.slug)

    def check_xml(args, **kwargs):
        # Runs in place of pdftk: args[3] is the XFDF file path (see the
        # pdftk_call[3] assertions below), parsed while it still exists.
        xfdf = ET.parse(args[3]).getroot()
        assert xfdf.tag == '{http://ns.adobe.com/xfdf/}xfdf'
        assert xfdf.find('{http://ns.adobe.com/xfdf/}f').attrib['href'].endswith('.pdf')
        field = xfdf.find('{http://ns.adobe.com/xfdf/}fields').find('{http://ns.adobe.com/xfdf/}field')
        assert field.attrib['name'] == 'fname'
        assert field.find('{http://ns.adobe.com/xfdf/}value').text == 'John'

    # 1. input form provided in the payload
    payload = {
        'filename': 'foo.pdf',
        'fields/fname': 'John',
        'input-form': {'content': acroform_b64content},
    }
    mocked_check_output.side_effect = check_xml
    resp = app.post_json(endpoint, params=payload, status=200)
    assert resp.headers['content-type'] == 'application/pdf'
    assert resp.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
    assert mocked_check_output.call_count == 1
    pdftk_call = mocked_check_output.call_args.args[0]
    assert len(pdftk_call) == 6
    assert pdftk_call[0] == '/usr/bin/pdftk'
    assert pdftk_call[1].endswith('/input-form.pdf')
    assert pdftk_call[2] == 'fill_form'
    assert pdftk_call[3].endswith('/fields.xfdf')
    assert pdftk_call[4] == 'output'
    assert pdftk_call[5] == '-'
    assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # 2. no input-form in the payload: the connector's default file is used
    pdf.fill_form_file = File(BytesIO(acroform_content), 'default.pdf')
    pdf.save()
    payload = {
        'filename': 'bar.pdf',
        'fields/fname': 'John',
    }
    mocked_check_output.reset_mock()
    resp = app.post_json(endpoint, params=payload, status=200)
    assert resp.headers['content-type'] == 'application/pdf'
    assert resp.headers['content-disposition'] == 'attachment; filename="bar.pdf"'
    assert mocked_check_output.call_count == 1
    pdftk_call = mocked_check_output.call_args.args[0]
    assert len(pdftk_call) == 6
    assert pdftk_call[0] == '/usr/bin/pdftk'
    assert pdftk_call[1].endswith('media/pdf/test/default.pdf')
    assert pdftk_call[2] == 'fill_form'
    assert pdftk_call[3].endswith('/fields.xfdf')
    assert pdftk_call[4] == 'output'
    assert pdftk_call[5] == '-'
    assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # pdftk errors (faked): reported as err=1 in a JSON body with HTTP 200
    payload = {
        'filename': 'foo.pdf',
        'fields/fname': 'Bill',
        'input-form': {'content': acroform_b64content},
    }
    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.TimeoutExpired(cmd=[], timeout=20)
    resp = app.post_json(endpoint, params=payload, status=200)
    assert mocked_check_output.call_count == 1
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('pdftk timed out after 20 seconds')

    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.CalledProcessError(cmd=[], returncode=42, output='ooops')
    resp = app.post_json(endpoint, params=payload, status=200)
    assert mocked_check_output.call_count == 1
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('pdftk returned non-zero exit status 42')
    assert 'ooops' in resp.json['err_desc']

    # bad calls errors: each invalid request must be rejected with HTTP 400
    resp = app.post(endpoint, status=400)
    assert resp.headers['content-type'].startswith('application/json')
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('could not decode body to json')

    payload = {}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "'filename' is a required property"

    payload = {'filename': 'out.pdf'}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "'fields' is a required property"

    payload = {'filename': 'out.pdf', 'fields': 'not-a-dict'}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "fields: 'not-a-dict' is not of type 'object'"

    pdf.fill_form_file = None  # no default PDF form
    pdf.save()
    payload = {
        'filename': 'bar.pdf',
        'fields/fname': 'Alice',
    }
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "missing or bad 'input-form' property"

    # the endpoint only accepts POST
    resp = app.get(endpoint, status=405)
def test_pdf_real_pdftk_fillform(admin_user, app, pdf, settings):
    """Call fill-form with the real pdftk binary, then check the manager "dump fields" tab.

    Skipped when the binary at ``settings.PDFTK_PATH`` does not exist.
    """
    if not os.path.exists(settings.PDFTK_PATH):
        pytest.skip('pdftk (%s) not found' % settings.PDFTK_PATH)
    endpoint = generic_endpoint_url('pdf', 'fill-form', slug=pdf.slug)
    payload = {
        'filename': 'filled.pdf',
        'fields/fname': 'ThisIsMyFirstName',
        'input-form': {'content': acroform_b64content},
    }
    resp = app.post_json(endpoint, params=payload, status=200)
    assert resp.headers['content-type'] == 'application/pdf'
    assert resp.headers['content-disposition'] == 'attachment; filename="filled.pdf"'
    assert PdfReader(fdata=resp.content).numPages == 1
    assert resp.content[:5] == b'%PDF-'
    # TODO: find an easy way to verify 'ThisIsMyFirstName' in resp.content

    # dump fields in manager view
    pdf.fill_form_file = File(BytesIO(acroform_content), 'pdf-form.pdf')
    pdf.save()
    manage_url = reverse('view-connector', kwargs={'connector': 'pdf', 'slug': pdf.slug})
    # before logging in: the dump panel must not be rendered
    resp = app.get(manage_url)
    assert 'panel-dumpfields' not in resp.text
    assert '<b>fields/fname</b>' not in resp.text
    # after login() the panel and the field mapping are shown
    app = login(app)
    resp = app.get(manage_url)
    assert 'panel-dumpfields' in resp.text
    assert '<b>fields/fname</b>' in resp.text
def test_pdf_validator(pdf):
    """Check that full_clean() accepts real PDF uploads and rejects a non-PDF file."""

    def store(content, name):
        # Attach an in-memory file to the connector and persist it.
        pdf.fill_form_file = File(BytesIO(content), name)
        pdf.save()

    # both sample PDFs must validate
    for valid_content in (pdf_content, acroform_content):
        store(valid_content, 'default.pdf')
        pdf.full_clean()

    # a file without the PDF signature must be refused
    store(b'not a pdf', 'test.txt')
    with pytest.raises(ValidationError):
        pdf.full_clean()