add pdf connector (assemble endpoint) (#73540)
gitea-wip/passerelle/pipeline/pr-main There was a failure building this commit Details
gitea/passerelle/pipeline/head Something is wrong with the build of this commit Details

This commit is contained in:
Thomas NOËL 2023-01-20 13:36:24 +01:00
parent 791eedd3f5
commit 95192de819
7 changed files with 314 additions and 1 deletions

3
debian/control vendored
View File

@ -13,7 +13,8 @@ Homepage: https://dev.entrouvert.org/projects/passerelle
Package: python3-passerelle
Architecture: all
Depends: python3-cmislib,
Depends: pdftk,
python3-cmislib,
python3-dateutil,
python3-distutils,
python3-django (>= 2:2.2),

View File

View File

@ -0,0 +1,39 @@
# Generated by Django 2.2.26 on 2023-01-20 12:35
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration creating the ``Resource`` model (verbose name 'PDF')."""

    initial = True

    # Needs the 'base' application schema: the 'users' relation targets base.ApiUser.
    dependencies = [
        ('base', '0030_resourcelog_base_resour_appname_298cbc_idx'),
    ]

    operations = [
        migrations.CreateModel(
            name='Resource',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'title',
                    models.CharField(max_length=50, verbose_name='Title'),
                ),
                (
                    'slug',
                    models.SlugField(unique=True, verbose_name='Identifier'),
                ),
                (
                    'description',
                    models.TextField(verbose_name='Description'),
                ),
                (
                    'users',
                    models.ManyToManyField(
                        blank=True,
                        related_name='_resource_users_+',
                        related_query_name='+',
                        to='base.ApiUser',
                    ),
                ),
            ],
            options={'verbose_name': 'PDF'},
        ),
    ]

View File

@ -0,0 +1,139 @@
# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2023 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import base64
import os
import subprocess
import tempfile
from collections import OrderedDict
from django.conf import settings
from django.http.response import HttpResponse
from django.utils.translation import gettext_lazy as _
from passerelle.base.models import BaseResource
from passerelle.utils.api import endpoint
from passerelle.utils.jsonresponse import APIError
# JSON schema of one input document given as an object: only the base64
# 'content' is mandatory, 'filename' and 'content_type' are optional strings.
PDF_FILE_OBJECT = {
    'type': 'object',
    'description': _('PDF file'),
    'required': ['content'],
    'properties': {
        'filename': {'type': 'string', 'description': _('file name')},
        'content_type': {'type': 'string', 'description': _('MIME content-type')},
        'content': {'type': 'string', 'description': _('file content, base64 encoded')},
    },
}

# JSON schema of the 'assemble' request body: an output file name and the
# list of documents to catenate. Each item of 'files' is either a file
# object, a bare base64 string, or null.
# NOTE(review): 'unflatten' presumably lets the endpoint machinery expand flat
# keys such as 'files/0' into a nested list -- confirm in passerelle.utils.api.
ASSEMBLE_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename', 'files'],
    'unflatten': True,
    'properties': OrderedDict(
        [
            (
                'filename',
                {
                    'description': _('output PDF filename'),
                    'type': 'string',
                },
            ),
            (
                'files',
                {
                    'type': 'array',
                    'description': _('PDF files to catenate'),
                    'items': {
                        'oneOf': [
                            PDF_FILE_OBJECT,
                            {
                                'type': 'string',
                                'description': _('PDF content, base64 encoded'),
                            },
                            {
                                'type': 'null',
                                'description': _('empty file, do not consider'),
                            },
                        ]
                    },
                },
            ),
        ]
    ),
}
class Resource(BaseResource):
    """PDF connector: assembles PDF documents by running the pdftk command."""

    category = _('Misc')

    class Meta:
        verbose_name = _('PDF')

    def run_pdftk(self, args):
        """Run pdftk and return the bytes it writes on its standard output.

        The executable is ``settings.PDFTK_PATH``; ``args`` (a list of
        strings) is followed by ``output -`` so that the result is written
        to stdout. The call is limited to ``settings.PDFTK_TIMEOUT`` seconds.

        Raises APIError if pdftk times out or exits with a non-zero status.
        """
        args = [settings.PDFTK_PATH] + args + ['output', '-']
        try:
            # The PDF is read from stdout, so stderr is captured on its own
            # pipe: merging it into stdout would mix pdftk diagnostics into
            # the returned PDF bytes.
            return subprocess.check_output(args, timeout=settings.PDFTK_TIMEOUT, stderr=subprocess.PIPE)
        except subprocess.TimeoutExpired as e:
            raise APIError('pdftk timed out after %s seconds' % e.timeout)
        except subprocess.CalledProcessError as e:
            # Prefer the diagnostics from stderr; fall back on stdout when
            # stderr is empty or was not captured.
            raise APIError(
                'pdftk returned non-zero exit status %s (%r)' % (e.returncode, e.stderr or e.output)
            )

    @endpoint(
        description=_('Returns the assembly of received PDF files'),
        perm='can_access',
        methods=['post'],
        post={
            'request_body': {'schema': {'application/json': ASSEMBLE_SCHEMA}},
            'input_example': {
                'filename': 'output.pdf',
                'files/0': {
                    'filename': 'example-1.pdf',
                    'content_type': 'application/pdf',
                    'content': 'JVBERi0xL...(base64 PDF)...',
                },
                'files/1': {
                    'filename': 'example-2.pdf',
                    'content_type': 'application/pdf',
                    'content': '//4lUERGL...(base64 PDF)...',
                },
                'files/2': '//4lUERGL...(base64 PDF)',
            },
        },
    )
    def assemble(self, request, post_data):
        """Catenate the PDF files of ``post_data['files']`` into a single PDF.

        Each entry is either an object with a base64 ``content`` or a bare
        base64 string; null and empty entries are ignored. The decoded files
        are written to a temporary directory, handed to ``pdftk ... cat`` in
        their original order, and the result is returned as an
        ``application/pdf`` attachment named after ``post_data['filename']``.

        Raises APIError (HTTP 400) when an entry is not valid base64 or when
        no usable file is left; errors from run_pdftk are propagated.
        """
        filename = post_data.pop('filename')
        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-assemble-' % self.id) as tmpdir:
            infiles = []
            for i, infile in enumerate(post_data['files']):
                if isinstance(infile, dict) and infile.get('content'):
                    b64content = infile['content']
                elif isinstance(infile, str) and infile:
                    b64content = infile
                else:
                    # null or empty entry: skipped, but its index is still
                    # consumed so file names keep matching payload positions
                    continue
                try:
                    content = base64.b64decode(b64content)
                except ValueError:
                    # binascii.Error is a ValueError: the payload comes from
                    # the client, so report a bad request rather than crash
                    raise APIError("invalid base64 content in 'files/%d'" % i, http_status=400)
                infile_filename = os.path.join(tmpdir, 'pdf-%d.pdf' % i)
                with open(infile_filename, mode='wb') as fd:
                    fd.write(content)
                infiles.append(infile_filename)
            if not infiles:
                raise APIError("no valid file found in 'files' property", http_status=400)
            pdf_content = self.run_pdftk(args=infiles + ['cat'])
        response = HttpResponse(pdf_content, content_type='application/pdf')
        # The file name is client-supplied: escape backslashes and double
        # quotes so it stays a valid quoted-string in the header.
        quoted_filename = filename.replace('\\', '\\\\').replace('"', '\\"')
        response['Content-Disposition'] = 'attachment; filename="%s"' % quoted_filename
        return response

View File

@ -166,6 +166,7 @@ INSTALLED_APPS = (
'passerelle.apps.orange',
'passerelle.apps.ovh',
'passerelle.apps.oxyd',
'passerelle.apps.pdf',
'passerelle.apps.phonecalls',
'passerelle.apps.photon',
'passerelle.apps.plone_restapi',
@ -192,6 +193,10 @@ PASSERELLE_APP_STRASBOURG_EU_ENABLED = False
PASSERELLE_APP_CLICRDV_LEGACY = True
PASSERELLE_APP_SOLIS_APA_LEGACY = True
# passerelle.apps.pdf configuration
# path of the pdftk executable run by the connector
PDFTK_PATH = '/usr/bin/pdftk'
# maximum duration of one pdftk call, in seconds
PDFTK_TIMEOUT = 20
# Authentication settings
try:
import mellon

129
tests/test_pdf.py Normal file
View File

@ -0,0 +1,129 @@
# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2023 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import base64
import os
import subprocess
from unittest import mock
import pytest
from pdfrw import PdfReader
from passerelle.apps.pdf.models import Resource
from tests.utils import generic_endpoint_url, setup_access_rights
# Base64 text of the sample PDF stored in the 'data' directory beside this
# module; used as file content in the request payloads below.
_minimal_pdf_path = os.path.join(os.path.dirname(__file__), 'data', 'minimal.pdf')
with open(_minimal_pdf_path, 'rb') as fd:
    pdf_content = base64.b64encode(fd.read()).decode()
@pytest.fixture
def pdf(db):
    """Create a PDF connector with slug 'test' and return it after setup_access_rights."""
    connector = Resource.objects.create(slug='test')
    return setup_access_rights(connector)
@mock.patch('subprocess.check_output')
def test_pdf_assemble(mocked_check_output, app, pdf):
    """Exercise the 'assemble' endpoint with subprocess.check_output mocked.

    Covers the pdftk command line built for valid payloads, the error
    responses returned when pdftk times out or fails, and the rejection of
    malformed requests.
    """
    endpoint = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
    # explicit fake output, so that the response body can be checked
    fake_pdf = b'%PDF-1.4 fake pdftk output'
    mocked_check_output.return_value = fake_pdf

    # a single file, given as an object
    payload = {'filename': 'foo.pdf', 'files/0': {'content': pdf_content}}
    resp = app.post_json(endpoint, params=payload, status=200)
    assert resp.headers['content-type'] == 'application/pdf'
    assert resp.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
    assert resp.content == fake_pdf
    assert mocked_check_output.call_count == 1
    pdftk_call = mocked_check_output.call_args.args[0]
    assert len(pdftk_call) == 5
    assert pdftk_call[0] == '/usr/bin/pdftk'
    assert pdftk_call[1].endswith('/pdf-0.pdf')
    assert pdftk_call[2] == 'cat'
    assert pdftk_call[3] == 'output'
    assert pdftk_call[4] == '-'
    assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # empty and null entries are skipped, the others keep their payload index
    payload = {
        'filename': 'bar.pdf',
        'files/0': {'content': ''},
        'files/1': {'content': pdf_content},
        'files/2': None,
        'files/3': pdf_content,
        'files/4': '',
    }
    mocked_check_output.reset_mock()
    resp = app.post_json(endpoint, params=payload, status=200)
    assert resp.headers['content-type'] == 'application/pdf'
    assert resp.headers['content-disposition'] == 'attachment; filename="bar.pdf"'
    assert resp.content == fake_pdf
    assert mocked_check_output.call_count == 1
    pdftk_call = mocked_check_output.call_args.args[0]
    assert len(pdftk_call) == 6
    assert pdftk_call[0] == '/usr/bin/pdftk'
    assert pdftk_call[1].endswith('/pdf-1.pdf')  # files/1
    assert pdftk_call[2].endswith('/pdf-3.pdf')  # files/3
    assert pdftk_call[3:] == ['cat', 'output', '-']

    # pdftk errors (faked)
    payload = {'filename': 'out.pdf', 'files/0': {'content': pdf_content}}
    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.TimeoutExpired(cmd=[], timeout=20)
    resp = app.post_json(endpoint, params=payload, status=200)
    assert mocked_check_output.call_count == 1
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('pdftk timed out after 20 seconds')

    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.CalledProcessError(cmd=[], returncode=42, output='ooops')
    resp = app.post_json(endpoint, params=payload, status=200)
    assert mocked_check_output.call_count == 1
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('pdftk returned non-zero exit status 42')
    assert 'ooops' in resp.json['err_desc']

    # bad calls errors
    resp = app.post(endpoint, status=400)
    assert resp.headers['content-type'].startswith('application/json')
    assert resp.json['err'] == 1
    assert resp.json['err_desc'].startswith('could not decode body to json')

    payload = {}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "'filename' is a required property"

    payload = {'filename': 'out.pdf'}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "'files' is a required property"

    payload = {'filename': 'out.pdf', 'files/0': 42}
    resp = app.post_json(endpoint, params=payload, status=400)
    assert resp.json['err'] == 1
    assert resp.json['err_desc'] == "42 is not of type 'object'"

    # only POST is allowed
    app.get(endpoint, status=405)
def test_pdf_real_pdftk_call(app, pdf, settings):
    """Assemble the sample PDF twice with the real pdftk; skipped when pdftk is absent."""
    pdftk_path = settings.PDFTK_PATH
    if not os.path.exists(pdftk_path):
        pytest.skip('pdftk (%s) not found' % pdftk_path)

    url = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
    body = {
        'filename': 'twopages.pdf',
        'files/0': {'content': pdf_content},
        'files/1': {'content': pdf_content},
    }
    resp = app.post_json(url, params=body, status=200)

    headers = resp.headers
    assert headers['content-type'] == 'application/pdf'
    assert headers['content-disposition'] == 'attachment; filename="twopages.pdf"'
    # the body must be a PDF made of the two one-page inputs
    assert resp.content.startswith(b'%PDF-')
    assert PdfReader(fdata=resp.content).numPages == 2