add pdf connector (assemble endpoint) (#73540)
This commit is contained in:
parent
791eedd3f5
commit
95192de819
|
@ -13,7 +13,8 @@ Homepage: https://dev.entrouvert.org/projects/passerelle
|
|||
|
||||
Package: python3-passerelle
|
||||
Architecture: all
|
||||
Depends: python3-cmislib,
|
||||
Depends: pdftk,
|
||||
python3-cmislib,
|
||||
python3-dateutil,
|
||||
python3-distutils,
|
||||
python3-django (>= 2:2.2),
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
# Generated by Django 2.2.26 on 2023-01-20 12:35
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial migration of the PDF connector: creates the ``Resource`` model."""

    initial = True

    dependencies = [('base', '0030_resourcelog_base_resour_appname_298cbc_idx')]

    operations = [
        migrations.CreateModel(
            name='Resource',
            options={'verbose_name': 'PDF'},
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('title', models.CharField(max_length=50, verbose_name='Title')),
                ('slug', models.SlugField(unique=True, verbose_name='Identifier')),
                ('description', models.TextField(verbose_name='Description')),
                (
                    'users',
                    models.ManyToManyField(
                        to='base.ApiUser',
                        blank=True,
                        related_name='_resource_users_+',
                        related_query_name='+',
                    ),
                ),
            ],
        ),
    ]
|
|
@ -0,0 +1,139 @@
|
|||
# passerelle - uniform access to multiple data sources and services
|
||||
# Copyright (C) 2023 Entr'ouvert
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it
|
||||
# under the terms of the GNU Affero General Public License as published
|
||||
# by the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import base64
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
|
||||
from django.conf import settings
|
||||
from django.http.response import HttpResponse
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from passerelle.base.models import BaseResource
|
||||
from passerelle.utils.api import endpoint
|
||||
from passerelle.utils.jsonresponse import APIError
|
||||
|
||||
# JSON schema of one input file given as an object; only the base64
# 'content' property is mandatory.
PDF_FILE_OBJECT = {
    'type': 'object',
    'description': _('PDF file'),
    'required': ['content'],
    'properties': {
        'filename': {'type': 'string', 'description': _('file name')},
        'content_type': {'type': 'string', 'description': _('MIME content-type')},
        'content': {'type': 'string', 'description': _('file content, base64 encoded')},
    },
}
|
||||
|
||||
# JSON schema of the "assemble" endpoint payload: an output file name and
# a list of files, each one being an object, a base64 string or null.
ASSEMBLE_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename', 'files'],
    'unflatten': True,
    'properties': OrderedDict(
        [
            (
                'filename',
                {
                    'description': _('output PDF filename'),
                    'type': 'string',
                },
            ),
            (
                'files',
                {
                    'type': 'array',
                    'description': _('PDF files to catenate'),
                    'items': {
                        'oneOf': [
                            PDF_FILE_OBJECT,
                            {'type': 'string', 'description': _('PDF content, base64 encoded')},
                            {'type': 'null', 'description': _('empty file, do not consider')},
                        ]
                    },
                },
            ),
        ]
    ),
}
|
||||
|
||||
|
||||
class Resource(BaseResource):
    """PDF connector: assembles PDF files with the external ``pdftk`` tool."""

    category = _('Misc')

    class Meta:
        verbose_name = _('PDF')

    def run_pdftk(self, args):
        """Run pdftk and return what it wrote on its standard output.

        ``args`` is the list of pdftk arguments; the executable path
        (``settings.PDFTK_PATH``) is prepended and ``output -`` is appended so
        that the resulting document is written on stdout.

        Raises APIError if pdftk runs longer than ``settings.PDFTK_TIMEOUT``
        seconds or exits with a non-zero status.
        """
        args = [settings.PDFTK_PATH] + args + ['output', '-']
        try:
            # stderr is captured apart from stdout: stdout carries the PDF
            # document itself, so diagnostics must not be merged into it.
            return subprocess.check_output(args, timeout=settings.PDFTK_TIMEOUT, stderr=subprocess.PIPE)
        except subprocess.TimeoutExpired as e:
            raise APIError('pdftk timed out after %s seconds' % e.timeout)
        except subprocess.CalledProcessError as e:
            # prefer the diagnostics from stderr, fall back on stdout
            raise APIError(
                'pdftk returned non-zero exit status %s (%r)' % (e.returncode, e.stderr or e.output)
            )

    @endpoint(
        description=_('Returns the assembly of received PDF files'),
        perm='can_access',
        methods=['post'],
        post={
            'request_body': {'schema': {'application/json': ASSEMBLE_SCHEMA}},
            'input_example': {
                'filename': 'output.pdf',
                'files/0': {
                    'filename': 'example-1.pdf',
                    'content_type': 'application/pdf',
                    'content': 'JVBERi0xL...(base64 PDF)...',
                },
                'files/1': {
                    'filename': 'example-2.pdf',
                    'content_type': 'application/pdf',
                    'content': '//4lUERGL...(base64 PDF)...',
                },
                'files/2': '//4lUERGL...(base64 PDF)',
            },
        },
    )
    def assemble(self, request, post_data):
        """Concatenate the received PDF files and return the result as an attachment.

        ``post_data['files']`` entries are either objects with a base64
        ``content`` property or base64 strings; null and empty entries are
        skipped. Each usable entry is written in a temporary directory and the
        files are passed to ``pdftk ... cat``, in the order they were received.

        Raises APIError (HTTP 400) when an entry is not valid base64 or when
        no usable entry is found, and APIError when pdftk fails.
        """
        filename = post_data.pop('filename')

        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-assemble-' % self.id) as tmpdir:
            infiles = []
            for i, infile in enumerate(post_data['files']):
                if isinstance(infile, dict) and infile.get('content'):
                    b64content = infile['content']
                elif isinstance(infile, str) and infile:
                    b64content = infile
                else:
                    continue
                try:
                    content = base64.b64decode(b64content)
                except ValueError:
                    # binascii.Error (bad padding) is a ValueError subclass;
                    # non-ASCII input raises a plain ValueError
                    raise APIError("invalid base64 content in 'files/%d'" % i, http_status=400)
                # the file name keeps the index of the entry in the payload
                infile_filename = os.path.join(tmpdir, 'pdf-%d.pdf' % i)
                with open(infile_filename, mode='wb') as fd:
                    fd.write(content)
                infiles.append(infile_filename)
            if not infiles:
                raise APIError("no valid file found in 'files' property", http_status=400)
            # pdftk must run while the temporary directory still exists
            pdf_content = self.run_pdftk(args=infiles + ['cat'])

        response = HttpResponse(pdf_content, content_type='application/pdf')
        # escape backslashes and double quotes so the client-chosen name
        # cannot break out of the quoted-string parameter
        quoted_filename = filename.replace('\\', '\\\\').replace('"', '\\"')
        response['Content-Disposition'] = 'attachment; filename="%s"' % quoted_filename
        return response
|
|
@ -166,6 +166,7 @@ INSTALLED_APPS = (
|
|||
'passerelle.apps.orange',
|
||||
'passerelle.apps.ovh',
|
||||
'passerelle.apps.oxyd',
|
||||
'passerelle.apps.pdf',
|
||||
'passerelle.apps.phonecalls',
|
||||
'passerelle.apps.photon',
|
||||
'passerelle.apps.plone_restapi',
|
||||
|
@ -192,6 +193,10 @@ PASSERELLE_APP_STRASBOURG_EU_ENABLED = False
|
|||
PASSERELLE_APP_CLICRDV_LEGACY = True
|
||||
PASSERELLE_APP_SOLIS_APA_LEGACY = True
|
||||
|
||||
# passerelle.apps.pdf configuration
# path of the pdftk executable run by the PDF connector
PDFTK_PATH = '/usr/bin/pdftk'
# maximum duration of a pdftk run, in seconds
PDFTK_TIMEOUT = 20
|
||||
|
||||
# Authentication settings
|
||||
try:
|
||||
import mellon
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
# passerelle - uniform access to multiple data sources and services
|
||||
# Copyright (C) 2023 Entr'ouvert
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it
|
||||
# under the terms of the GNU Affero General Public License as published
|
||||
# by the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import base64
|
||||
import os
|
||||
import subprocess
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from pdfrw import PdfReader
|
||||
|
||||
from passerelle.apps.pdf.models import Resource
|
||||
from tests.utils import generic_endpoint_url, setup_access_rights
|
||||
|
||||
# base64 text of the sample PDF shipped next to the tests, shared by all tests
_MINIMAL_PDF_PATH = os.path.join(os.path.dirname(__file__), 'data', 'minimal.pdf')
with open(_MINIMAL_PDF_PATH, mode='rb') as fd:
    _minimal_pdf_bytes = fd.read()
pdf_content = base64.b64encode(_minimal_pdf_bytes).decode()
|
||||
|
||||
|
||||
@pytest.fixture
def pdf(db):
    """Create a PDF connector with slug 'test' and set up access rights on it."""
    resource = Resource.objects.create(slug='test')
    return setup_access_rights(resource)
|
||||
|
||||
|
||||
@mock.patch('subprocess.check_output')
def test_pdf_assemble(mocked_check_output, app, pdf):
    """Check the assemble endpoint with a faked pdftk (check_output is mocked)."""
    url = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)

    # a single file, sent as an object
    data = {'filename': 'foo.pdf', 'files/0': {'content': pdf_content}}
    response = app.post_json(url, params=data, status=200)
    assert response.headers['content-type'] == 'application/pdf'
    assert response.headers['content-disposition'] == 'attachment; filename="foo.pdf"'
    assert mocked_check_output.call_count == 1
    command = mocked_check_output.call_args.args[0]
    assert len(command) == 5
    assert command[0] == '/usr/bin/pdftk'
    assert command[1].endswith('/pdf-0.pdf')
    assert list(command[2:]) == ['cat', 'output', '-']
    assert mocked_check_output.call_args.kwargs['timeout'] == 20

    # empty and null entries are skipped, objects and plain strings are kept
    mocked_check_output.reset_mock()
    data = {
        'filename': 'bar.pdf',
        'files/0': {'content': ''},
        'files/1': {'content': pdf_content},
        'files/2': None,
        'files/3': pdf_content,
        'files/4': '',
    }
    response = app.post_json(url, params=data, status=200)
    assert response.headers['content-type'] == 'application/pdf'
    assert response.headers['content-disposition'] == 'attachment; filename="bar.pdf"'
    assert mocked_check_output.call_count == 1
    command = mocked_check_output.call_args.args[0]
    assert len(command) == 6
    assert command[0] == '/usr/bin/pdftk'
    assert command[1].endswith('/pdf-1.pdf')  # first usable entry (files/1)
    assert command[2].endswith('/pdf-3.pdf')  # second usable entry (files/3)

    # pdftk errors (faked): timeout
    data = {'filename': 'out.pdf', 'files/0': {'content': pdf_content}}
    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.TimeoutExpired(cmd=[], timeout=20)
    response = app.post_json(url, params=data, status=200)
    assert mocked_check_output.call_count == 1
    assert response.json['err'] == 1
    assert response.json['err_desc'].startswith('pdftk timed out after 20 seconds')

    # pdftk errors (faked): non-zero exit status
    mocked_check_output.reset_mock()
    mocked_check_output.side_effect = subprocess.CalledProcessError(cmd=[], returncode=42, output='ooops')
    response = app.post_json(url, params=data, status=200)
    assert mocked_check_output.call_count == 1
    assert response.json['err'] == 1
    error_description = response.json['err_desc']
    assert error_description.startswith('pdftk returned non-zero exit status 42')
    assert 'ooops' in error_description

    # bad calls: no body at all
    response = app.post(url, status=400)
    assert response.headers['content-type'].startswith('application/json')
    assert response.json['err'] == 1
    assert response.json['err_desc'].startswith('could not decode body to json')

    # bad calls: missing 'filename'
    response = app.post_json(url, params={}, status=400)
    assert response.json['err'] == 1
    assert response.json['err_desc'] == "'filename' is a required property"

    # bad calls: missing 'files'
    response = app.post_json(url, params={'filename': 'out.pdf'}, status=400)
    assert response.json['err'] == 1
    assert response.json['err_desc'] == "'files' is a required property"

    # bad calls: a file of an unsupported type
    response = app.post_json(url, params={'filename': 'out.pdf', 'files/0': 42}, status=400)
    assert response.json['err'] == 1
    assert response.json['err_desc'] == "42 is not of type 'object'"

    # only POST is allowed
    app.get(url, status=405)
|
||||
|
||||
|
||||
def test_pdf_real_pdftk_call(app, pdf, settings):
    """Assemble the sample PDF twice with the real pdftk; skipped if it is not installed."""
    pdftk_path = settings.PDFTK_PATH
    if not os.path.exists(pdftk_path):
        pytest.skip('pdftk (%s) not found' % pdftk_path)

    url = generic_endpoint_url('pdf', 'assemble', slug=pdf.slug)
    single_page = {'content': pdf_content}
    data = {'filename': 'twopages.pdf', 'files/0': single_page, 'files/1': single_page}
    response = app.post_json(url, params=data, status=200)

    headers = response.headers
    assert headers['content-type'] == 'application/pdf'
    assert headers['content-disposition'] == 'attachment; filename="twopages.pdf"'

    document = response.content
    assert document[:5] == b'%PDF-'
    # two one-page inputs must give a two-page document
    assert PdfReader(fdata=document).numPages == 2
|
Loading…
Reference in New Issue