# passerelle/tests/test_pdf.py

# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2023 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import base64
import os
import subprocess
from io import BytesIO
from unittest import mock

import pytest
from django.core.exceptions import ValidationError
from django.core.files import File
from django.core.files.base import ContentFile
from pdfrw import PdfReader

from passerelle.apps.pdf.models import Resource
from passerelle.utils.pdf import PDF
from tests.test_manager import login
from tests.utils import generic_endpoint_url, setup_access_rights

# Sample documents used by the tests of this module, read once at import time.
# The base64 text versions are what the tests send as file 'content' in JSON payloads.
_data_dir = os.path.join(os.path.dirname(__file__), 'data')

with open(os.path.join(_data_dir, 'minimal.pdf'), 'rb') as fd:
    pdf_content = fd.read()
pdf_b64content = base64.b64encode(pdf_content).decode()

with open(os.path.join(_data_dir, 'pdf-form.pdf'), 'rb') as fd:
    acroform_content = fd.read()
acroform_b64content = base64.b64encode(acroform_content).decode()


@pytest.fixture
def pdf(db):
    """Create the ``test`` PDF resource and return the result of ``setup_access_rights`` on it."""
    resource = Resource.objects.create(slug='test', title='test', description='test')
    return setup_access_rights(resource)


@mock.patch('subprocess.check_output')
def test_pdf_assemble(mocked_check_output, app, pdf):
    """Check the ``assemble`` endpoint while ``subprocess.check_output`` is mocked.

    Covers the command line built for one and for several files, the error
    payloads returned when the mocked call raises, and rejected requests.
    """
    url = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)

    # a single file: the whole command line is checked
    response = app.post_json(
        url,
        params={'filename': 'foo.pdf', 'files/0': {'content': pdf_b64content}},
        status=200,
    )
    assert response.headers['content-type'] == 'application/pdf'
    assert response.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
    assert mocked_check_output.call_count == 1
    command = mocked_check_output.call_args.args[0]
    assert len(command) == 5
    executable, input_path, *operation = command
    assert executable == '/usr/bin/pdftk'
    assert input_path.endswith('/pdf-0.pdf')
    assert operation == ['cat', 'output', '-']
    assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # several entries, some empty or null: only two input paths are expected
    mixed_files = {
        'filename': 'bar.pdf',
        'files/0': {'content': ''},
        'files/1': {'content': pdf_b64content},
        'files/2': None,
        'files/3': pdf_b64content,
        'files/4': '',
    }
    mocked_check_output.reset_mock()
    response = app.post_json(url, params=mixed_files, status=200)
    assert response.headers['content-type'] == 'application/pdf'
    assert response.headers['content-disposition'] == 'attachment; filename="bar.pdf"'
    assert mocked_check_output.call_count == 1
    command = mocked_check_output.call_args.args[0]
    assert len(command) == 6
    executable, first_input, second_input = command[:3]
    assert executable == '/usr/bin/pdftk'
    assert first_input.endswith('/pdf-1.pdf')  # first pdftk input
    assert second_input.endswith('/pdf-3.pdf')  # second pdftk input

    # pdftk errors (faked through the mock's side_effect)
    single_file = {'filename': 'out.pdf', 'files/0': {'content': pdf_b64content}}
    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.TimeoutExpired(cmd=[], timeout=20)
    response = app.post_json(url, params=single_file, status=200)
    assert mocked_check_output.call_count == 1
    error = response.json
    assert error['err'] == 1
    assert error['err_desc'].startswith('pdftk timed out after 20 seconds')

    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.CalledProcessError(cmd=[], returncode=42, output='ooops')
    response = app.post_json(url, params=single_file, status=200)
    assert mocked_check_output.call_count == 1
    error = response.json
    assert error['err'] == 1
    description = error['err_desc']
    assert description.startswith('pdftk returned non-zero exit status 42')
    assert 'ooops' in description

    # bad calls errors
    response = app.post(url, status=400)
    assert response.headers['content-type'].startswith('application/json')
    assert response.json['err'] == 1
    assert response.json['err_desc'].startswith('could not decode body to json')

    missing_properties = [
        ({}, "'filename' is a required property"),
        ({'filename': 'out.pdf'}, "'files' is a required property"),
    ]
    for bad_payload, message in missing_properties:
        response = app.post_json(url, params=bad_payload, status=400)
        assert response.json['err'] == 1
        assert response.json['err_desc'] == message

    response = app.post_json(url, params={'filename': 'out.pdf', 'files/0': 42}, status=400)
    assert response.json['err'] == 1
    assert response.json['err_class'] == 'passerelle.utils.json.JSONValidationError'

    # the endpoint only accepts POST
    app.get(url, status=405)


def test_pdf_real_pdftk_assemble(app, pdf, settings):
    """Assemble two copies of the minimal PDF without mocking; skipped when pdftk is absent."""
    pdftk_path = settings.PDFTK_PATH
    if not os.path.exists(pdftk_path):
        pytest.skip('pdftk (%s) not found' % pdftk_path)

    url = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
    two_files = {
        'filename': 'twopages.pdf',
        'files/0': {'content': pdf_b64content},
        'files/1': {'content': pdf_b64content},
    }
    response = app.post_json(url, params=two_files, status=200)
    headers = response.headers
    assert headers['content-type'] == 'application/pdf'
    assert headers['content-disposition'] == 'attachment; filename="twopages.pdf"'
    body = response.content
    assert body.startswith(b'%PDF-')
    # two inputs are sent, two pages are expected in the result
    assert PdfReader(fdata=body).numPages == 2


@mock.patch('subprocess.check_output')
def test_pdf_watermark(mocked_check_output, app, pdf):
    """Check the ``watermark`` endpoint while ``subprocess.check_output`` is mocked.

    Covers the command line built with and without an explicit ``mode``, then
    the responses to rejected requests.
    """
    url = generic_endpoint_url('pdf', 'watermark', slug=pdf.slug)

    payload = {
        'filename': 'foo.pdf',
        'file': {'content': pdf_b64content},
        'stamp': {'content': pdf_b64content},
    }
    # the first request carries no 'mode' and is expected to run 'background';
    # the second one explicitly asks for 'multistamp'
    for expected_calls, (mode, operation) in enumerate(
        [(None, 'background'), ('multistamp', 'multistamp')], start=1
    ):
        if mode is not None:
            payload['mode'] = mode
        response = app.post_json(url, params=payload, status=200)
        assert response.headers['content-type'] == 'application/pdf'
        assert response.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
        # the mock is not reset between requests, so the call count accumulates
        assert mocked_check_output.call_count == expected_calls
        command = mocked_check_output.call_args.args[0]
        assert len(command) == 6
        executable, file_path, pdftk_operation, stamp_path, *output = command
        assert executable == '/usr/bin/pdftk'
        assert file_path.endswith('/pdf-file.pdf')
        assert pdftk_operation == operation
        assert stamp_path.endswith('/pdf-stamp.pdf')
        assert output == ['output', '-']
        assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # bad calls errors
    response = app.post(url, status=400)
    assert response.headers['content-type'].startswith('application/json')
    assert response.json['err'] == 1
    assert response.json['err_desc'].startswith('could not decode body to json')

    rejected = [
        ({}, "'filename' is a required property"),
        ({'filename': 'out.pdf'}, "'file' is a required property"),
        (
            {'filename': 'out.pdf', 'file': {'content': pdf_b64content}},
            "'stamp' is a required property",
        ),
        (
            {'filename': 'out.pdf', 'file': {'content': pdf_b64content}, 'stamp': 42},
            "stamp: 42 is not of type 'object'",
        ),
        (
            {
                'filename': 'out.pdf',
                'file': {'content': pdf_b64content},
                'stamp': {'content': pdf_b64content},
                'mode': 'foobar',
            },
            "mode: 'foobar' is not one of ['background', 'multibackground', 'stamp', 'multistamp']",
        ),
    ]
    for bad_payload, message in rejected:
        response = app.post_json(url, params=bad_payload, status=400)
        assert response.json['err'] == 1
        assert response.json['err_desc'] == message

    # the endpoint only accepts POST
    app.get(url, status=405)


def test_pdf_real_pdftk_watermark(app, pdf, settings):
    """Watermark the minimal PDF with itself without mocking; skipped when pdftk is absent."""
    pdftk_path = settings.PDFTK_PATH
    if not os.path.exists(pdftk_path):
        pytest.skip('pdftk (%s) not found' % pdftk_path)

    url = generic_endpoint_url('pdf', 'watermark', slug=pdf.slug)
    file_and_stamp = {
        'filename': 'watermark.pdf',
        'file': {'content': pdf_b64content},
        'stamp': {'content': pdf_b64content},
    }
    response = app.post_json(url, params=file_and_stamp, status=200)
    headers = response.headers
    assert headers['content-type'] == 'application/pdf'
    assert headers['content-disposition'] == 'attachment; filename="watermark.pdf"'
    body = response.content
    assert body.startswith(b'%PDF-')
    # a single page is expected in the result
    assert PdfReader(fdata=body).numPages == 1


def test_pdf_validator(pdf):
    """``full_clean`` accepts both sample PDFs as form file and rejects non-PDF content."""
    for valid_content in (pdf_content, acroform_content):
        pdf.fill_form_file = File(BytesIO(valid_content), 'default.pdf')
        pdf.save()
        pdf.full_clean()

    # saving does not validate: the error is only expected from full_clean()
    pdf.fill_form_file = File(BytesIO(b'not a pdf'), 'test.txt')
    pdf.save()
    with pytest.raises(ValidationError):
        pdf.full_clean()


@pytest.fixture
def cerfa_content():
    """Return the raw bytes of the ``cerfa_10072-02.pdf`` sample form.

    The path is resolved from this module's directory, like the other sample
    files read at the top of the module, so the fixture does not depend on the
    directory pytest is started from.
    """
    path = os.path.join(os.path.dirname(__file__), 'data', 'cerfa_10072-02.pdf')
    with open(path, 'rb') as fd:
        return fd.read()


def test_fill_form_no_pdf(app, admin_user, pdf):
    """With no form file set: no mapping link, 404 on the mapping editor, error on fill-form."""
    homepage = app.get('/pdf/test')
    assert 'Fill form: Edit fields mapping' not in homepage.text

    app.get('/pdf/test/fields-mapping/edit/', status=404)

    expected_error = {
        'data': None,
        'err': 1,
        'err_class': 'passerelle.utils.jsonresponse.APIError',
        'err_desc': 'not PDF file configured',
    }
    response = app.post_json('/pdf/test/fill-form/', params={'a': 1})
    assert response.json == expected_error


def test_fill_form_no_fields_mapping(app, admin_user, pdf, cerfa_content):
    """With a form file but no fields mapping, fill-form answers with an API error."""
    pdf.fill_form_file.save('form.pdf', ContentFile(cerfa_content))

    expected_error = {
        'data': None,
        'err': 1,
        'err_class': 'passerelle.utils.jsonresponse.APIError',
        'err_desc': 'no fields mapping configured',
    }
    response = app.post_json('/pdf/test/fill-form/', params={'a': 1})
    assert response.json == expected_error


def test_fill_form_ok(app, admin_user, pdf, cerfa_content):
    """Set a fields mapping through the edit form, then fill the PDF via the endpoint."""

    def first_field(page, widget_type):
        # first field of the page having the requested widget type
        candidates = [field for field in page.fields if field.widget_type == widget_type]
        return candidates[0]

    pdf.fill_form_file.save('form.pdf', ContentFile(cerfa_content))
    app = login(app)
    mapping_page = app.get('/pdf/test/').click('Fill form: Edit fields mapping')
    thumbnail = app.get(mapping_page.pyquery('img')[0].attrib['src'])
    assert b'PNG' in thumbnail.content

    # in the untouched form the checkbox is unchecked and the text field is empty
    blank_page = PDF(cerfa_content).page(0)
    checkbox = first_field(blank_page, 'checkbox')
    text = first_field(blank_page, 'text')
    assert checkbox.value is False
    assert text.value == ''

    # map one expression on the checkbox and one template on the text field
    form = mapping_page.form
    form.set(f'field_{checkbox.digest_id}', 'testme == "a"')
    form.set(f'field_{text.digest_id}', '{{ prenom }} {{ nom }}')
    form.submit().follow()

    data = {'testme': 'a', 'prenom': 'Jean', 'nom': 'Dupont'}
    response = app.post_json('/pdf/test/fill-form/', params=data)
    filled_page = PDF(response.content).page(0)
    assert first_field(filled_page, 'checkbox').value is True
    assert first_field(filled_page, 'text').value == 'Jean Dupont'

    # with flatten=1 the resulting first page has no fields left
    response = app.post_json('/pdf/test/fill-form/?flatten=1', params=data)
    flattened_page = PDF(response.content).page(0)
    assert not flattened_page.fields


def test_add_or_edit(app, admin_user, pdf, cerfa_content):
    """Neither the add page nor the edit page contains ``fields_mapping``."""
    app = login(app)
    add_page = app.get('/manage/pdf/add')
    assert b'fields_mapping' not in add_page.content

    # same expectation on the edit page once a form file has been saved
    pdf.fill_form_file.save('form.pdf', ContentFile(cerfa_content))
    edit_page = app.get('/manage/pdf/test/edit')
    assert b'fields_mapping' not in edit_page.content


def test_thumbnail_url_contains_hash(app, admin_user, pdf, cerfa_content):
    """Thumbnail URLs carry a ``?hash=`` other than ``0`` and change with the form file."""
    pdf.fill_form_file.save('form.pdf', ContentFile(cerfa_content))
    app = login(app)

    def thumbnail_urls():
        # sources of the images shown on the fields mapping page
        mapping_page = app.get('/pdf/test/').click('Fill form: Edit fields mapping')
        return [img.attrib['src'] for img in mapping_page.pyquery('img')]

    cerfa_urls = thumbnail_urls()
    assert len(cerfa_urls) == 5  # cerfa_content has 5 pages
    assert all('?hash=' in url for url in cerfa_urls)
    assert all(not url.endswith('?hash=0') for url in cerfa_urls)

    # replacing the form file must lead to different thumbnail URLs
    pdf.fill_form_file.save('form.pdf', ContentFile(acroform_content))

    acroform_urls = thumbnail_urls()
    assert len(acroform_urls) == 1  # acroform_content has 1 page
    assert all('?hash=' in url for url in acroform_urls)
    assert all(not url.endswith('?hash=0') for url in acroform_urls)
    assert acroform_urls[0] != cerfa_urls[0]