class Migration(migrations.Migration):
    """Initial migration of the PDF connector application: creates the ``Resource`` model."""

    initial = True

    dependencies = [('base', '0030_resourcelog_base_resour_appname_298cbc_idx')]

    operations = [
        migrations.CreateModel(
            name='Resource',
            options={'verbose_name': 'PDF'},
            fields=[
                # integer primary key
                (
                    'id',
                    models.AutoField(verbose_name='ID', primary_key=True, auto_created=True, serialize=False),
                ),
                # descriptive fields of the connector
                ('title', models.CharField(verbose_name='Title', max_length=50)),
                ('slug', models.SlugField(verbose_name='Identifier', unique=True)),
                ('description', models.TextField(verbose_name='Description')),
                # many-to-many link to base.ApiUser
                (
                    'users',
                    models.ManyToManyField(
                        to='base.ApiUser',
                        blank=True,
                        related_name='_resource_users_+',
                        related_query_name='+',
                    ),
                ),
            ],
        ),
    ]
# JSON schema of one input file given as an object; only 'content' is required.
PDF_FILE_OBJECT = {
    'type': 'object',
    'description': _('PDF file'),
    'required': ['content'],
    'properties': {
        'filename': {
            'type': 'string',
            'description': _('file name'),
        },
        'content_type': {
            'type': 'string',
            'description': _('MIME content-type'),
        },
        'content': {
            'type': 'string',
            'description': _('file content, base64 encoded'),
        },
    },
}

# JSON schema of the "assemble" request body: an output file name and a list
# of input files, each one being an object, a bare base64 string or null.
ASSEMBLE_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename', 'files'],
    # NOTE(review): presumably lets clients send flattened keys such as
    # 'files/0' (as in the endpoint input example) - confirm against the
    # endpoint machinery.
    'unflatten': True,
    'properties': OrderedDict(
        {
            'filename': {
                'description': _('output PDF filename'),
                'type': 'string',
            },
            'files': {
                'type': 'array',
                'description': _('PDF files to catenate'),
                'items': {
                    'oneOf': [
                        PDF_FILE_OBJECT,
                        {'type': 'string', 'description': _('PDF content, base64 encoded')},
                        {'type': 'null', 'description': _('empty file, do not consider')},
                    ]
                },
            },
        }
    ),
}


class Resource(BaseResource):
    """PDF connector: assembles PDF files using the pdftk command line tool."""

    category = _('Misc')

    class Meta:
        verbose_name = _('PDF')

    def run_pdftk(self, args):
        """Run pdftk with *args* and return the bytes it writes on its standard output.

        The command line is ``settings.PDFTK_PATH`` followed by *args* and
        ``output -``, so that pdftk writes the resulting PDF on stdout.

        Raises APIError if pdftk does not finish within
        ``settings.PDFTK_TIMEOUT`` seconds or exits with a non-zero status.
        """
        args = [settings.PDFTK_PATH] + args + ['output', '-']
        try:
            # stderr is captured separately: stdout carries the PDF itself, so
            # merging pdftk diagnostics into it would corrupt the document.
            return subprocess.check_output(args, timeout=settings.PDFTK_TIMEOUT, stderr=subprocess.PIPE)
        except subprocess.TimeoutExpired as e:
            raise APIError('pdftk timed out after %s seconds' % e.timeout)
        except subprocess.CalledProcessError as e:
            # prefer pdftk diagnostics; fall back on whatever was written on stdout
            details = e.stderr or e.output
            raise APIError('pdftk returned non-zero exit status %s (%r)' % (e.returncode, details))

    @endpoint(
        description=_('Returns the assembly of received PDF files'),
        perm='can_access',
        methods=['post'],
        post={
            'request_body': {'schema': {'application/json': ASSEMBLE_SCHEMA}},
            'input_example': {
                'filename': 'output.pdf',
                'files/0': {
                    'filename': 'example-1.pdf',
                    'content_type': 'application/pdf',
                    'content': 'JVBERi0xL...(base64 PDF)...',
                },
                'files/1': {
                    'filename': 'example-2.pdf',
                    'content_type': 'application/pdf',
                    'content': '//4lUERGL...(base64 PDF)...',
                },
                'files/2': '//4lUERGL...(base64 PDF)',
            },
        },
    )
    def assemble(self, request, post_data):
        """Concatenate the received PDF files and return the result as an attachment.

        Each entry of ``post_data['files']`` is either an object with a base64
        ``content``, a bare base64 string, or an empty value; empty entries
        are skipped. Decoded files are written in a temporary directory as
        ``pdf-<index>.pdf`` and passed, in order, to ``pdftk ... cat``.

        Returns an ``application/pdf`` HttpResponse whose Content-Disposition
        carries ``post_data['filename']``.

        Raises APIError (HTTP 400) if an entry is not valid base64 or if no
        usable file is found, and APIError if pdftk fails.
        """
        filename = post_data['filename']

        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-assemble-' % self.id) as tmpdir:
            infiles = []
            for i, infile in enumerate(post_data['files']):
                if isinstance(infile, dict) and infile.get('content'):
                    b64content = infile['content']
                elif isinstance(infile, str) and infile:
                    b64content = infile
                else:
                    continue
                try:
                    content = base64.b64decode(b64content)
                except ValueError:
                    # binascii.Error (bad padding) and non-ASCII input are both
                    # ValueError: report a client error, not a server crash
                    raise APIError("invalid base64 content in 'files/%d'" % i, http_status=400)
                infile_filename = os.path.join(tmpdir, 'pdf-%d.pdf' % i)
                with open(infile_filename, mode='wb') as fd:
                    fd.write(content)
                infiles.append(infile_filename)
            if not infiles:
                raise APIError("no valid file found in 'files' property", http_status=400)
            # pdftk must run while the temporary directory still exists
            pdf_content = self.run_pdftk(args=infiles + ['cat'])

        response = HttpResponse(pdf_content, content_type='application/pdf')
        # the file name comes from the request: escape backslashes and double
        # quotes so that it cannot break out of the quoted-string
        quoted_filename = filename.replace('\\', '\\\\').replace('"', '\\"')
        response['Content-Disposition'] = 'attachment; filename="%s"' % quoted_filename
        return response
# base64 content of the minimal PDF fixture, used as input file by the tests
with open(os.path.join(os.path.dirname(__file__), 'data', 'minimal.pdf'), 'rb') as fd:
    pdf_content = base64.b64encode(fd.read()).decode()


@pytest.fixture
def pdf(db):
    """PDF connector with slug 'test', passed through setup_access_rights."""
    connector = Resource.objects.create(slug='test')
    return setup_access_rights(connector)


@mock.patch('subprocess.check_output')
def test_pdf_assemble(mocked_check_output, app, pdf):
    """Check the assemble endpoint with pdftk mocked: nominal calls, pdftk failures, bad requests."""
    url = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)

    # one file, given as an object
    response = app.post_json(
        url, params={'filename': 'foo.pdf', 'files/0': {'content': pdf_content}}, status=200
    )
    assert response.headers['content-type'] == 'application/pdf'
    assert response.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
    assert mocked_check_output.call_count == 1
    argv = mocked_check_output.call_args.args[0]
    assert len(argv) == 5
    assert argv[0] == '/usr/bin/pdftk'
    assert argv[1].endswith('/pdf-0.pdf')
    assert argv[2:] == ['cat', 'output', '-']
    assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # mix of empty and usable entries: only files/1 and files/3 must reach pdftk
    mocked_check_output.reset_mock()
    response = app.post_json(
        url,
        params={
            'filename': 'bar.pdf',
            'files/0': {'content': ''},
            'files/1': {'content': pdf_content},
            'files/2': None,
            'files/3': pdf_content,
            'files/4': '',
        },
        status=200,
    )
    assert response.headers['content-type'] == 'application/pdf'
    assert response.headers['content-disposition'] == 'attachment; filename="bar.pdf"'
    assert mocked_check_output.call_count == 1
    argv = mocked_check_output.call_args.args[0]
    assert len(argv) == 6
    assert argv[0] == '/usr/bin/pdftk'
    assert argv[1].endswith('/pdf-1.pdf')  # from files/1
    assert argv[2].endswith('/pdf-3.pdf')  # from files/3

    # pdftk errors (faked)
    valid_payload = {'filename': 'out.pdf', 'files/0': {'content': pdf_content}}

    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.TimeoutExpired(cmd=[], timeout=20)
    response = app.post_json(url, params=valid_payload, status=200)
    assert mocked_check_output.call_count == 1
    assert response.json['err'] == 1
    assert response.json['err_desc'].startswith('pdftk timed out after 20 seconds')

    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.CalledProcessError(cmd=[], returncode=42, output='ooops')
    response = app.post_json(url, params=valid_payload, status=200)
    assert mocked_check_output.call_count == 1
    assert response.json['err'] == 1
    assert response.json['err_desc'].startswith('pdftk returned non-zero exit status 42')
    assert 'ooops' in response.json['err_desc']

    # bad calls errors: no JSON body at all
    response = app.post(url, status=400)
    assert response.headers['content-type'].startswith('application/json')
    assert response.json['err'] == 1
    assert response.json['err_desc'].startswith('could not decode body to json')

    # bad calls errors: payloads rejected by the JSON schema
    for params, expected_desc in (
        ({}, "'filename' is a required property"),
        ({'filename': 'out.pdf'}, "'files' is a required property"),
        ({'filename': 'out.pdf', 'files/0': 42}, "42 is not of type 'object'"),
    ):
        response = app.post_json(url, params=params, status=400)
        assert response.json['err'] == 1
        assert response.json['err_desc'] == expected_desc

    # only POST is accepted
    app.get(url, status=405)


def test_pdf_real_pdftk_call(app, pdf, settings):
    """Assemble twice the same one-page PDF with the real pdftk binary, when it is installed."""
    pdftk_path = settings.PDFTK_PATH
    if not os.path.exists(pdftk_path):
        pytest.skip('pdftk (%s) not found' % pdftk_path)

    url = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
    two_files = {
        'filename': 'twopages.pdf',
        'files/0': {'content': pdf_content},
        'files/1': {'content': pdf_content},
    }
    response = app.post_json(url, params=two_files, status=200)
    assert response.headers['content-type'] == 'application/pdf'
    assert response.headers['content-disposition'] == 'attachment; filename="twopages.pdf"'
    assert response.content[:5] == b'%PDF-'
    assert PdfReader(fdata=response.content).numPages == 2