passerelle/tests/test_pdf.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

354 lines
14 KiB
Python
Raw Normal View History

# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2023 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import base64
import os
import subprocess
2023-02-01 17:01:54 +01:00
from io import BytesIO
from unittest import mock
import pytest
2023-02-01 17:01:54 +01:00
from django.core.exceptions import ValidationError
from django.core.files import File
from django.core.files.base import ContentFile
from pdfrw import PdfReader
from passerelle.apps.pdf.models import Resource
from passerelle.utils.pdf import PDF
2023-02-01 17:01:54 +01:00
from tests.test_manager import login
from tests.utils import generic_endpoint_url, setup_access_rights
# Sample documents live in the ``data`` directory next to this module; they
# are read once at import time and shared by every test below, both as raw
# bytes and as base64 text (the form the JSON endpoints are called with).
_data_dir = os.path.join(os.path.dirname(__file__), 'data')

with open(os.path.join(_data_dir, 'minimal.pdf'), 'rb') as fd:
    pdf_content = fd.read()
pdf_b64content = base64.b64encode(pdf_content).decode()

with open(os.path.join(_data_dir, 'pdf-form.pdf'), 'rb') as fd:
    acroform_content = fd.read()
acroform_b64content = base64.b64encode(acroform_content).decode()
@pytest.fixture
def pdf(db):
    """Create the ``test`` PDF resource and return it via ``setup_access_rights``.

    The ``db`` fixture is requested only to get database access; the value
    returned is whatever ``setup_access_rights`` gives back for the new
    resource (presumably the resource itself -- see ``tests.utils``).
    """
    resource = Resource.objects.create(slug='test', title='test', description='test')
    return setup_access_rights(resource)
@mock.patch('subprocess.check_output')
def test_pdf_assemble(mocked_check_output, app, pdf):
    """Check the ``assemble`` endpoint with ``subprocess.check_output`` mocked.

    Covers the pdftk command line built for one and for several input files,
    the error payloads returned when pdftk times out or exits non-zero, and
    the rejection of malformed requests.
    """
    endpoint = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)

    # a single input file
    payload = {'filename': 'foo.pdf', 'files/0': {'content': pdf_b64content}}
    resp = app.post_json(endpoint, params=payload, status=200)
    assert resp.headers['content-type'] == 'application/pdf'
    assert resp.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
    assert mocked_check_output.call_count == 1
    pdftk_call = mocked_check_output.call_args.args[0]
    assert len(pdftk_call) == 5
    assert pdftk_call[0] == '/usr/bin/pdftk'
    assert pdftk_call[1].endswith('/pdf-0.pdf')
    assert pdftk_call[2] == 'cat'
    assert pdftk_call[3] == 'output'
    assert pdftk_call[4] == '-'
    assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # several entries: only files/1 and files/3 carry content, the empty
    # ones (files/0, files/2, files/4) must not appear on the command line
    payload = {
        'filename': 'bar.pdf',
        'files/0': {'content': ''},
        'files/1': {'content': pdf_b64content},
        'files/2': None,
        'files/3': pdf_b64content,
        'files/4': '',
    }
    mocked_check_output.reset_mock()
    resp = app.post_json(endpoint, params=payload, status=200)
    assert resp.headers['content-type'] == 'application/pdf'
    assert resp.headers['content-disposition'] == 'attachment; filename="bar.pdf"'
    assert mocked_check_output.call_count == 1
    pdftk_call = mocked_check_output.call_args.args[0]
    assert len(pdftk_call) == 6
    assert pdftk_call[0] == '/usr/bin/pdftk'
    assert pdftk_call[1].endswith('/pdf-1.pdf')  # files/1
    assert pdftk_call[2].endswith('/pdf-3.pdf')  # files/3
    # the operation and output arguments follow the input files, exactly as
    # in the single-file call above
    assert pdftk_call[3] == 'cat'
    assert pdftk_call[4] == 'output'
    assert pdftk_call[5] == '-'
    assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # pdftk errors (faked)
    payload = {'filename': 'out.pdf', 'files/0': {'content': pdf_b64content}}
    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.TimeoutExpired(cmd=[], timeout=20)
    resp = app.post_json(endpoint, params=payload, status=200)
    assert mocked_check_output.call_count == 1
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('pdftk timed out after 20 seconds')

    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.CalledProcessError(cmd=[], returncode=42, output='ooops')
    resp = app.post_json(endpoint, params=payload, status=200)
    assert mocked_check_output.call_count == 1
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('pdftk returned non-zero exit status 42')
    assert 'ooops' in resp.json['err_desc']

    # bad calls errors
    # NOTE: the failing side effect installed above is still in place here;
    # every request below is expected to be answered with a 400 or 405.
    resp = app.post(endpoint, status=400)
    assert resp.headers['content-type'].startswith('application/json')
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('could not decode body to json')

    payload = {}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "'filename' is a required property"

    payload = {'filename': 'out.pdf'}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "'files' is a required property"

    payload = {'filename': 'out.pdf', 'files/0': 42}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_class'] == 'passerelle.utils.json.JSONValidationError'

    # only POST is accepted
    resp = app.get(endpoint, status=405)
2023-02-01 17:01:54 +01:00
def test_pdf_real_pdftk_assemble(app, pdf, settings):
    """Assemble two copies of the sample PDF with the real pdftk binary.

    Skipped when the binary configured in ``settings.PDFTK_PATH`` is missing.
    """
    pdftk_path = settings.PDFTK_PATH
    if not os.path.exists(pdftk_path):
        pytest.skip('pdftk (%s) not found' % pdftk_path)

    # the same document is sent twice, as files/0 and files/1
    payload = {'filename': 'twopages.pdf'}
    for index in range(2):
        payload['files/%d' % index] = {'content': pdf_b64content}

    url = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
    resp = app.post_json(url, params=payload, status=200)

    headers = resp.headers
    assert headers['content-type'] == 'application/pdf'
    assert headers['content-disposition'] == 'attachment; filename="twopages.pdf"'

    body = resp.content
    assert body.startswith(b'%PDF-')
    assert PdfReader(fdata=body).numPages == 2
2023-02-01 17:01:54 +01:00
2023-04-03 15:25:01 +02:00
@mock.patch('subprocess.check_output')
def test_pdf_watermark(mocked_check_output, app, pdf):
    """Check the ``watermark`` endpoint with ``subprocess.check_output`` mocked.

    Verifies the pdftk command line for the default mode and for
    ``multistamp``, then the validation errors returned for bad requests.
    """
    endpoint = generic_endpoint_url('pdf', 'watermark', slug=pdf.slug)
    document = {'content': pdf_b64content}

    def check_pdftk_call(response, operation, expected_calls):
        # response must be the PDF attachment, and the latest mocked call
        # must be "pdftk <file> <operation> <stamp> output -"
        assert response.headers['content-type'] == 'application/pdf'
        assert response.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
        assert mocked_check_output.call_count == expected_calls
        command = mocked_check_output.call_args.args[0]
        assert len(command) == 6
        assert command[0] == '/usr/bin/pdftk'
        assert command[1].endswith('/pdf-file.pdf')
        assert command[2] == operation
        assert command[3].endswith('/pdf-stamp.pdf')
        assert command[4] == 'output'
        assert command[5] == '-'
        assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # no mode given: pdftk is called with "background"
    payload = {'filename': 'foo.pdf', 'file': document, 'stamp': document}
    resp = app.post_json(endpoint, params=payload, status=200)
    check_pdftk_call(resp, 'background', 1)

    # explicit mode; the mock is not reset so the call count goes up to 2
    payload = dict(payload, mode='multistamp')
    resp = app.post_json(endpoint, params=payload, status=200)
    check_pdftk_call(resp, 'multistamp', 2)

    # bad calls errors
    resp = app.post(endpoint, status=400)
    assert resp.headers['content-type'].startswith('application/json')
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('could not decode body to json')

    invalid_requests = [
        ({}, "'filename' is a required property"),
        ({'filename': 'out.pdf'}, "'file' is a required property"),
        ({'filename': 'out.pdf', 'file': document}, "'stamp' is a required property"),
        (
            {'filename': 'out.pdf', 'file': document, 'stamp': 42},
            "stamp: 42 is not of type 'object'",
        ),
        (
            {'filename': 'out.pdf', 'file': document, 'stamp': document, 'mode': 'foobar'},
            "mode: 'foobar' is not one of ['background', 'multibackground', 'stamp', 'multistamp']",
        ),
    ]
    for bad_payload, expected_desc in invalid_requests:
        resp = app.post_json(endpoint, params=bad_payload, status=400)
        assert resp.json['err'] == 1
        assert resp.json['err_desc'] == expected_desc

    # only POST is accepted
    resp = app.get(endpoint, status=405)
def test_pdf_real_pdftk_watermark(app, pdf, settings):
    """Watermark the sample PDF with itself using the real pdftk binary.

    Skipped when the binary configured in ``settings.PDFTK_PATH`` is missing.
    """
    pdftk_path = settings.PDFTK_PATH
    if not os.path.exists(pdftk_path):
        pytest.skip('pdftk (%s) not found' % pdftk_path)

    # the same document serves as both the input file and the stamp
    document = {'content': pdf_b64content}
    payload = {'filename': 'watermark.pdf', 'file': document, 'stamp': document}

    url = generic_endpoint_url('pdf', 'watermark', slug=pdf.slug)
    resp = app.post_json(url, params=payload, status=200)

    headers = resp.headers
    assert headers['content-type'] == 'application/pdf'
    assert headers['content-disposition'] == 'attachment; filename="watermark.pdf"'

    body = resp.content
    assert body.startswith(b'%PDF-')
    # stamping must not add pages
    assert PdfReader(fdata=body).numPages == 1
2023-02-01 17:01:54 +01:00
def test_pdf_validator(pdf):
    """``full_clean`` accepts both sample PDFs and rejects a non-PDF upload."""
    # both sample documents must pass model validation
    for valid_content in (pdf_content, acroform_content):
        pdf.fill_form_file = File(BytesIO(valid_content), 'default.pdf')
        pdf.save()
        pdf.full_clean()

    # arbitrary bytes are saved without complaint but fail validation
    pdf.fill_form_file = File(BytesIO(b'not a pdf'), 'test.txt')
    pdf.save()
    with pytest.raises(ValidationError):
        pdf.full_clean()
@pytest.fixture
def cerfa_content():
    """Return the raw bytes of the ``cerfa_10072-02.pdf`` sample form.

    The path is resolved from this module's directory, like the other sample
    files loaded at the top of the module, so the fixture does not depend on
    the directory pytest is started from.
    """
    path = os.path.join(os.path.dirname(__file__), 'data', 'cerfa_10072-02.pdf')
    with open(path, 'rb') as fd:
        return fd.read()
def test_fill_form_no_pdf(app, admin_user, pdf):
    """Without an uploaded form, mapping and fill-form are unavailable."""
    # the resource page must not offer the mapping link
    homepage = app.get('/pdf/test')
    assert 'Fill form: Edit fields mapping' not in homepage.text

    # and the mapping editor itself does not exist
    app.get('/pdf/test/fields-mapping/edit/', status=404)

    # the endpoint answers with an API error
    expected = {
        'data': None,
        'err': 1,
        'err_class': 'passerelle.utils.jsonresponse.APIError',
        'err_desc': 'not PDF file configured',
    }
    resp = app.post_json('/pdf/test/fill-form/', params={'a': 1})
    assert resp.json == expected
def test_fill_form_no_fields_mapping(app, admin_user, pdf, cerfa_content):
    """With a form uploaded but no mapping saved, fill-form returns an API error."""
    form_file = ContentFile(cerfa_content)
    pdf.fill_form_file.save('form.pdf', form_file)

    expected = {
        'data': None,
        'err': 1,
        'err_class': 'passerelle.utils.jsonresponse.APIError',
        'err_desc': 'no fields mapping configured',
    }
    resp = app.post_json('/pdf/test/fill-form/', params={'a': 1})
    assert resp.json == expected
def test_fill_form_ok(app, admin_user, pdf, cerfa_content):
    """Configure a fields mapping through the UI, then fill the form.

    One checkbox and one text field of the first page are mapped; the
    endpoint output is parsed again to check their values, and the
    ``flatten=1`` variant is checked to leave no field on the first page.
    """

    def first_widget(pdf_page, widget_type):
        # first field of the page having the requested widget type
        candidates = [field for field in pdf_page.fields if field.widget_type == widget_type]
        return candidates[0]

    pdf.fill_form_file.save('form.pdf', ContentFile(cerfa_content))

    app = login(app)
    resp = app.get('/pdf/test/')
    resp = resp.click('Fill form: Edit fields mapping')

    # the mapping page shows page thumbnails served as PNG
    thumbnails = resp.pyquery('img')
    image_resp = app.get(thumbnails[0].attrib['src'])
    assert b'PNG' in image_resp.content

    # in the pristine document both fields are empty
    source_page = PDF(cerfa_content).page(0)
    checkbox_field = first_widget(source_page, 'checkbox')
    text_field = first_widget(source_page, 'text')
    assert checkbox_field.value is False
    assert text_field.value == ''

    # map the checkbox to a condition and the text field to a template
    resp.form.set(f'field_{checkbox_field.digest_id}', 'testme == "a"')
    resp.form.set(f'field_{text_field.digest_id}', '{{ prenom }} {{ nom }}')
    resp.form.submit().follow()

    data = {'testme': 'a', 'prenom': 'Jean', 'nom': 'Dupont'}

    resp = app.post_json('/pdf/test/fill-form/', params=data)
    filled_page = PDF(resp.content).page(0)
    checkbox_field = first_widget(filled_page, 'checkbox')
    text_field = first_widget(filled_page, 'text')
    assert checkbox_field.value is True
    assert text_field.value == 'Jean Dupont'

    # flattened output: no field remains on the first page
    resp = app.post_json('/pdf/test/fill-form/?flatten=1', params=data)
    flattened_page = PDF(resp.content).page(0)
    assert not flattened_page.fields
def test_add_or_edit(app, admin_user, pdf, cerfa_content):
    """``fields_mapping`` never appears on the add and edit manager pages."""
    app = login(app)

    add_page = app.get('/manage/pdf/add')
    assert b'fields_mapping' not in add_page.content

    # still absent from the edit page once a form file has been uploaded
    pdf.fill_form_file.save('form.pdf', ContentFile(cerfa_content))
    edit_page = app.get('/manage/pdf/test/edit')
    assert b'fields_mapping' not in edit_page.content
def test_thumbnail_url_contains_hash(app, admin_user, pdf, cerfa_content):
    """Thumbnail URLs carry a non-zero ``?hash=`` that changes with the file."""

    def thumbnail_urls(expected_count):
        # open the mapping page and collect the thumbnail URLs, checking
        # their number and that each one carries a non-zero hash
        page = app.get('/pdf/test/').click('Fill form: Edit fields mapping')
        found = [img.attrib['src'] for img in page.pyquery('img')]
        assert len(found) == expected_count
        assert all('?hash=' in url for url in found)
        assert all(not url.endswith('?hash=0') for url in found)
        return found

    pdf.fill_form_file.save('form.pdf', ContentFile(cerfa_content))
    app = login(app)
    first_url = thumbnail_urls(5)[0]  # cerfa_content has 5 pages

    # replacing the file must yield a different URL for the first thumbnail
    pdf.fill_form_file.save('form.pdf', ContentFile(acroform_content))
    urls = thumbnail_urls(1)  # acroform_content has 1 page
    assert urls[0] != first_url