diff --git a/debian/control b/debian/control
index 8c6d8c40..b09dfa7a 100644
--- a/debian/control
+++ b/debian/control
@@ -13,7 +13,8 @@ Homepage: https://dev.entrouvert.org/projects/passerelle
Package: python3-passerelle
Architecture: all
-Depends: python3-cmislib,
+Depends: pdftk,
+ python3-cmislib,
python3-dateutil,
python3-distutils,
python3-django (>= 2:2.2),
diff --git a/passerelle/apps/pdf/__init__.py b/passerelle/apps/pdf/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/passerelle/apps/pdf/migrations/0001_initial.py b/passerelle/apps/pdf/migrations/0001_initial.py
new file mode 100644
index 00000000..460f11d9
--- /dev/null
+++ b/passerelle/apps/pdf/migrations/0001_initial.py
@@ -0,0 +1,39 @@
+# Generated by Django 2.2.26 on 2023-01-20 12:35
+
+from django.db import migrations, models
+
+
+# Initial migration of the "pdf" application: it creates the single
+# 'Resource' model of the connector. Generated by Django, kept as emitted.
+class Migration(migrations.Migration):
+
+ initial = True
+
+ # must run after this migration of the 'base' application, which provides
+ # the base.ApiUser model referenced by the 'users' field below
+ dependencies = [
+ ('base', '0030_resourcelog_base_resour_appname_298cbc_idx'),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='Resource',
+ fields=[
+ (
+ 'id',
+ models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'),
+ ),
+ ('title', models.CharField(max_length=50, verbose_name='Title')),
+ # unique identifier of the connector instance
+ ('slug', models.SlugField(unique=True, verbose_name='Identifier')),
+ ('description', models.TextField(verbose_name='Description')),
+ (
+ 'users',
+ # optional (blank=True) many-to-many link to base.ApiUser; in Django a
+ # related name ending with '+' means no reverse relation is created
+ models.ManyToManyField(
+ blank=True,
+ related_name='_resource_users_+',
+ related_query_name='+',
+ to='base.ApiUser',
+ ),
+ ),
+ ],
+ options={
+ 'verbose_name': 'PDF',
+ },
+ ),
+ ]
diff --git a/passerelle/apps/pdf/migrations/__init__.py b/passerelle/apps/pdf/migrations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/passerelle/apps/pdf/models.py b/passerelle/apps/pdf/models.py
new file mode 100644
index 00000000..ab22fe72
--- /dev/null
+++ b/passerelle/apps/pdf/models.py
@@ -0,0 +1,139 @@
+# passerelle - uniform access to multiple data sources and services
+# Copyright (C) 2023 Entr'ouvert
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import base64
+import os
+import subprocess
+import tempfile
+from collections import OrderedDict
+
+from django.conf import settings
+from django.http.response import HttpResponse
+from django.utils.translation import gettext_lazy as _
+
+from passerelle.base.models import BaseResource
+from passerelle.utils.api import endpoint
+from passerelle.utils.jsonresponse import APIError
+
+# JSON-schema fragment describing one input file given as an object: only the
+# base64 payload ('content') is required, the other properties are optional.
+PDF_FILE_OBJECT = {
+    'type': 'object',
+    'description': _('PDF file'),
+    'required': ['content'],
+    'properties': {
+        'filename': {'type': 'string', 'description': _('file name')},
+        'content_type': {'type': 'string', 'description': _('MIME content-type')},
+        'content': {'type': 'string', 'description': _('file content, base64 encoded')},
+    },
+}
+
+# JSON schema of the body accepted by the "assemble" endpoint: an output file
+# name plus the list of files to concatenate. Each list item is either a file
+# object (PDF_FILE_OBJECT), a bare base64 string, or null.
+# NOTE(review): 'unflatten' presumably lets callers send flat keys such as
+# 'files/0' -- confirm against passerelle's endpoint/schema handling.
+ASSEMBLE_SCHEMA = {
+    '$schema': 'http://json-schema.org/draft-04/schema#',
+    'title': '',
+    'description': '',
+    'type': 'object',
+    'required': ['filename', 'files'],
+    'unflatten': True,
+    'properties': OrderedDict(
+        [
+            ('filename', {'description': _('output PDF filename'), 'type': 'string'}),
+            (
+                'files',
+                {
+                    'type': 'array',
+                    'description': _('PDF files to catenate'),
+                    'items': {
+                        'oneOf': [
+                            PDF_FILE_OBJECT,
+                            {'type': 'string', 'description': _('PDF content, base64 encoded')},
+                            {'type': 'null', 'description': _('empty file, do not consider')},
+                        ]
+                    },
+                },
+            ),
+        ]
+    ),
+}
+
+
+class Resource(BaseResource):
+    """Connector assembling PDF files with the external pdftk program."""
+
+    category = _('Misc')
+
+    class Meta:
+        verbose_name = _('PDF')
+
+    def run_pdftk(self, args):
+        """Run pdftk and return the PDF it writes on its standard output.
+
+        ``args`` is the pdftk argument list without the program path;
+        ``output -`` is appended so that the result is written on stdout.
+
+        Raises APIError when pdftk runs longer than settings.PDFTK_TIMEOUT
+        seconds or exits with a non-zero status.
+        """
+        args = [settings.PDFTK_PATH] + args + ['output', '-']
+        try:
+            # stderr is captured on its own pipe: with "output -" stdout *is* the
+            # PDF, so merging diagnostics into it would corrupt the document.
+            return subprocess.check_output(args, timeout=settings.PDFTK_TIMEOUT, stderr=subprocess.PIPE)
+        except subprocess.TimeoutExpired as e:
+            raise APIError('pdftk timed out after %s seconds' % e.timeout)
+        except subprocess.CalledProcessError as e:
+            # diagnostics are on stderr; fall back on stdout when stderr is empty
+            raise APIError(
+                'pdftk returned non-zero exit status %s (%r)' % (e.returncode, e.stderr or e.output)
+            )
+
+    @endpoint(
+        description=_('Returns the assembly of received PDF files'),
+        perm='can_access',
+        methods=['post'],
+        post={
+            'request_body': {'schema': {'application/json': ASSEMBLE_SCHEMA}},
+            'input_example': {
+                'filename': 'output.pdf',
+                'files/0': {
+                    'filename': 'example-1.pdf',
+                    'content_type': 'application/pdf',
+                    'content': 'JVBERi0xL...(base64 PDF)...',
+                },
+                'files/1': {
+                    'filename': 'example-2.pdf',
+                    'content_type': 'application/pdf',
+                    'content': '//4lUERGL...(base64 PDF)...',
+                },
+                'files/2': '//4lUERGL...(base64 PDF)',
+            },
+        },
+    )
+    def assemble(self, request, post_data):
+        """Concatenate the received PDF files and return the result as a download.
+
+        ``post_data['files']`` items are file objects (base64 in 'content') or
+        bare base64 strings; null items and items with empty content are
+        ignored. The response is an ``application/pdf`` attachment named after
+        ``post_data['filename']``.
+
+        Raises APIError (HTTP 400) when no usable file is received or when a
+        file content is not valid base64; pdftk failures are raised by
+        run_pdftk().
+        """
+        filename = post_data.pop('filename')
+
+        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-assemble-' % self.id) as tmpdir:
+            infiles = []
+            for i, infile in enumerate(post_data['files']):
+                if isinstance(infile, dict) and infile.get('content'):
+                    b64content = infile['content']
+                elif isinstance(infile, str) and infile:
+                    b64content = infile
+                else:
+                    continue
+                try:
+                    # binascii.Error (bad padding) is a ValueError, like the error
+                    # raised for non-ASCII input
+                    content = base64.b64decode(b64content)
+                except ValueError:
+                    raise APIError("invalid base64 content in 'files/%d'" % i, http_status=400)
+                # temporary files are named after the position in 'files'
+                infile_filename = os.path.join(tmpdir, 'pdf-%d.pdf' % i)
+                with open(infile_filename, mode='wb') as fd:
+                    fd.write(content)
+                infiles.append(infile_filename)
+            if not infiles:
+                raise APIError("no valid file found in 'files' property", http_status=400)
+            # pdftk must run while the temporary directory still exists
+            pdf_content = self.run_pdftk(args=infiles + ['cat'])
+
+        response = HttpResponse(pdf_content, content_type='application/pdf')
+        # filename is caller-supplied: escape the characters that would otherwise
+        # end the quoted-string early and produce a malformed header
+        quoted_filename = filename.replace('\\', '\\\\').replace('"', '\\"')
+        response['Content-Disposition'] = 'attachment; filename="%s"' % quoted_filename
+        return response
diff --git a/passerelle/settings.py b/passerelle/settings.py
index 8b3621d0..9308c053 100644
--- a/passerelle/settings.py
+++ b/passerelle/settings.py
@@ -166,6 +166,7 @@ INSTALLED_APPS = (
'passerelle.apps.orange',
'passerelle.apps.ovh',
'passerelle.apps.oxyd',
+ 'passerelle.apps.pdf',
'passerelle.apps.phonecalls',
'passerelle.apps.photon',
'passerelle.apps.plone_restapi',
@@ -192,6 +193,10 @@ PASSERELLE_APP_STRASBOURG_EU_ENABLED = False
PASSERELLE_APP_CLICRDV_LEGACY = True
PASSERELLE_APP_SOLIS_APA_LEGACY = True
+# passerelle.apps.pdf configuration
+# path of the pdftk executable run by the connector
+PDFTK_PATH = '/usr/bin/pdftk'
+# maximum duration of one pdftk run, in seconds
+PDFTK_TIMEOUT = 20
+
# Authentication settings
try:
import mellon
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
new file mode 100644
index 00000000..1508e70f
--- /dev/null
+++ b/tests/test_pdf.py
@@ -0,0 +1,129 @@
+# passerelle - uniform access to multiple data sources and services
+# Copyright (C) 2023 Entr'ouvert
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import base64
+import os
+import subprocess
+from unittest import mock
+
+import pytest
+from pdfrw import PdfReader
+
+from passerelle.apps.pdf.models import Resource
+from tests.utils import generic_endpoint_url, setup_access_rights
+
+# base64 text of the sample PDF stored next to the tests (data/minimal.pdf),
+# used as file content in the payloads below
+with open(os.path.join(os.path.dirname(__file__), 'data', 'minimal.pdf'), mode='rb') as fd:
+    pdf_content = base64.b64encode(fd.read()).decode('utf-8')
+
+
+@pytest.fixture
+def pdf(db):
+    """Return a PDF connector with slug 'test', passed through setup_access_rights()."""
+    connector = Resource.objects.create(slug='test')
+    return setup_access_rights(connector)
+
+
+# Exercises the "assemble" endpoint with subprocess.check_output mocked, so the
+# pdftk program itself is never run by this test.
+@mock.patch('subprocess.check_output')
+def test_pdf_assemble(mocked_check_output, app, pdf):
+ endpoint = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
+
+ # one file: the expected command line is <pdftk> <input file> cat output -
+ payload = {'filename': 'foo.pdf', 'files/0': {'content': pdf_content}}
+ resp = app.post_json(endpoint, params=payload, status=200)
+ assert resp.headers['content-type'] == 'application/pdf'
+ assert resp.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
+ assert mocked_check_output.call_count == 1
+ pdftk_call = mocked_check_output.call_args.args[0]
+ assert len(pdftk_call) == 5
+ assert pdftk_call[0] == '/usr/bin/pdftk'
+ assert pdftk_call[1].endswith('/pdf-0.pdf')
+ assert pdftk_call[2] == 'cat'
+ assert pdftk_call[3] == 'output'
+ assert pdftk_call[4] == '-'
+ assert mocked_check_output.call_args.kwargs['timeout'] == 20
+
+ # entries with empty content, null entries and empty strings are skipped;
+ # the kept inputs are named after their position in 'files'
+ payload = {
+ 'filename': 'bar.pdf',
+ 'files/0': {'content': ''},
+ 'files/1': {'content': pdf_content},
+ 'files/2': None,
+ 'files/3': pdf_content,
+ 'files/4': '',
+ }
+ mocked_check_output.reset_mock()
+ resp = app.post_json(endpoint, params=payload, status=200)
+ assert resp.headers['content-type'] == 'application/pdf'
+ assert resp.headers['content-disposition'] == 'attachment; filename="bar.pdf"'
+ assert mocked_check_output.call_count == 1
+ pdftk_call = mocked_check_output.call_args.args[0]
+ assert len(pdftk_call) == 6
+ assert pdftk_call[0] == '/usr/bin/pdftk'
+ assert pdftk_call[1].endswith('/pdf-1.pdf') # first kept input: files/1
+ assert pdftk_call[2].endswith('/pdf-3.pdf') # second kept input: files/3
+
+ # pdftk errors (faked): reported as err=1 in an HTTP 200 JSON response
+ payload = {'filename': 'out.pdf', 'files/0': {'content': pdf_content}}
+ mocked_check_output.reset_mock()
+ mocked_check_output.side_effect = subprocess.TimeoutExpired(cmd=[], timeout=20)
+ resp = app.post_json(endpoint, params=payload, status=200)
+ assert mocked_check_output.call_count == 1
+ assert resp.json['err'] == 1
+ assert resp.json['err_desc'].startswith('pdftk timed out after 20 seconds')
+
+ mocked_check_output.reset_mock()
+ mocked_check_output.side_effect = subprocess.CalledProcessError(cmd=[], returncode=42, output='ooops')
+ resp = app.post_json(endpoint, params=payload, status=200)
+ assert mocked_check_output.call_count == 1
+ assert resp.json['err'] == 1
+ assert resp.json['err_desc'].startswith('pdftk returned non-zero exit status 42')
+ assert 'ooops' in resp.json['err_desc']
+
+ # bad calls errors: HTTP 400 with a JSON error description
+ resp = app.post(endpoint, status=400)
+ assert resp.headers['content-type'].startswith('application/json')
+ assert resp.json['err'] == 1
+ assert resp.json['err_desc'].startswith('could not decode body to json')
+
+ payload = {}
+ resp = app.post_json(endpoint, params=payload, status=400)
+ assert resp.json['err'] == 1
+ assert resp.json['err_desc'] == "'filename' is a required property"
+
+ payload = {'filename': 'out.pdf'}
+ resp = app.post_json(endpoint, params=payload, status=400)
+ assert resp.json['err'] == 1
+ assert resp.json['err_desc'] == "'files' is a required property"
+
+ # a 'files' item that is neither an object, a string nor null
+ payload = {'filename': 'out.pdf', 'files/0': 42}
+ resp = app.post_json(endpoint, params=payload, status=400)
+ assert resp.json['err'] == 1
+ assert resp.json['err_desc'] == "42 is not of type 'object'"
+
+ # GET is answered with 405
+ resp = app.get(endpoint, status=405)
+
+
+def test_pdf_real_pdftk_call(app, pdf, settings):
+    """Assemble two copies of the sample PDF with the real pdftk binary.
+
+    Skipped when settings.PDFTK_PATH does not exist on the test host.
+    """
+    pdftk_path = settings.PDFTK_PATH
+    if not os.path.exists(pdftk_path):
+        pytest.skip('pdftk (%s) not found' % pdftk_path)
+
+    resp = app.post_json(
+        generic_endpoint_url('pdf', 'assemble', slug=pdf.slug),
+        params={
+            'filename': 'twopages.pdf',
+            'files/0': {'content': pdf_content},
+            'files/1': {'content': pdf_content},
+        },
+        status=200,
+    )
+
+    headers = resp.headers
+    assert headers['content-type'] == 'application/pdf'
+    assert headers['content-disposition'] == 'attachment; filename="twopages.pdf"'
+
+    # the body must be a PDF in which pdfrw counts two pages
+    body = resp.content
+    assert body.startswith(b'%PDF-')
+    assert PdfReader(fdata=body).numPages == 2