passerelle/passerelle/apps/pdf/models.py

140 lines
4.9 KiB
Python

# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2023 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import base64
import os
import subprocess
import tempfile
from collections import OrderedDict
from django.conf import settings
from django.http.response import HttpResponse
from django.utils.translation import gettext_lazy as _
from passerelle.base.models import BaseResource
from passerelle.utils.api import endpoint
from passerelle.utils.jsonresponse import APIError
# JSON schema fragment describing one PDF passed as an object.
# Only 'content' (the base64 payload) is required; 'filename' and
# 'content_type' are optional metadata.
PDF_FILE_OBJECT = {
    'type': 'object',
    'description': _('PDF file'),
    'required': ['content'],
    'properties': {
        'filename': {'type': 'string', 'description': _('file name')},
        'content_type': {'type': 'string', 'description': _('MIME content-type')},
        'content': {'type': 'string', 'description': _('file content, base64 encoded')},
    },
}
# JSON schema of the payload accepted by the `assemble` endpoint: an output
# filename plus an ordered list of input files. Each entry of 'files' is a
# PDF_FILE_OBJECT, a bare base64 string, or null.
# NOTE(review): 'unflatten' presumably asks passerelle to expand flattened
# keys such as 'files/0' into a list before validation -- confirm.
ASSEMBLE_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename', 'files'],
    'unflatten': True,
    'properties': OrderedDict(
        [
            (
                'filename',
                {
                    'description': _('output PDF filename'),
                    'type': 'string',
                },
            ),
            (
                'files',
                {
                    'type': 'array',
                    'description': _('PDF files to catenate'),
                    'items': {
                        'oneOf': [
                            PDF_FILE_OBJECT,
                            {'type': 'string', 'description': _('PDF content, base64 encoded')},
                            {'type': 'null', 'description': _('empty file, do not consider')},
                        ]
                    },
                },
            ),
        ]
    ),
}
class Resource(BaseResource):
    """Connector exposing PDF operations implemented with the pdftk command-line tool."""

    category = _('Misc')

    class Meta:
        verbose_name = _('PDF')

    def run_pdftk(self, args):
        """Run pdftk and return the bytes it writes on its standard output.

        ``args`` is the list of pdftk arguments (input files and operation);
        the executable path from ``settings.PDFTK_PATH`` is prepended and
        ``output -`` is appended so that the result is written to stdout.

        Raises APIError if pdftk exceeds ``settings.PDFTK_TIMEOUT`` seconds or
        exits with a non-zero status.
        """
        args = [settings.PDFTK_PATH] + args + ['output', '-']
        try:
            # The PDF is produced on stdout, so stderr must be captured on its
            # own pipe: merging it into stdout would mix diagnostics into the
            # returned document.
            return subprocess.check_output(args, timeout=settings.PDFTK_TIMEOUT, stderr=subprocess.PIPE)
        except subprocess.TimeoutExpired as e:
            raise APIError('pdftk timed out after %s seconds' % e.timeout)
        except subprocess.CalledProcessError as e:
            # Prefer the diagnostics from stderr; fall back on stdout in case
            # the tool reported its error there.
            raise APIError(
                'pdftk returned non-zero exit status %s (%r)' % (e.returncode, e.stderr or e.output)
            )

    @endpoint(
        description=_('Returns the assembly of received PDF files'),
        perm='can_access',
        methods=['post'],
        post={
            'request_body': {'schema': {'application/json': ASSEMBLE_SCHEMA}},
            'input_example': {
                'filename': 'output.pdf',
                'files/0': {
                    'filename': 'example-1.pdf',
                    'content_type': 'application/pdf',
                    'content': 'JVBERi0xL...(base64 PDF)...',
                },
                'files/1': {
                    'filename': 'example-2.pdf',
                    'content_type': 'application/pdf',
                    'content': '//4lUERGL...(base64 PDF)...',
                },
                'files/2': '//4lUERGL...(base64 PDF)',
            },
        },
    )
    def assemble(self, request, post_data):
        """Concatenate the received PDF files, in order, into one PDF attachment.

        ``post_data`` follows ASSEMBLE_SCHEMA: ``filename`` is the name given to
        the resulting attachment and ``files`` lists the inputs. Entries that
        are null, empty strings, or objects without content are skipped.

        Returns an HttpResponse of type application/pdf.
        Raises APIError (HTTP 400) if an entry is not valid base64 or if no
        usable file is left after skipping empty entries; errors raised by
        run_pdftk are propagated.
        """
        filename = post_data.pop('filename')
        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-assemble-' % self.id) as tmpdir:
            infiles = []
            for i, infile in enumerate(post_data['files']):
                if isinstance(infile, dict) and infile.get('content'):
                    b64content = infile['content']
                elif isinstance(infile, str) and infile:
                    b64content = infile
                else:
                    continue
                try:
                    content = base64.b64decode(b64content)
                except ValueError:
                    # binascii.Error (bad padding) and non-ASCII input are both
                    # ValueError: report them as a client error, not a crash.
                    raise APIError("invalid base64 content in 'files/%d'" % i, http_status=400)
                # The index keeps the names unique and preserves input order.
                infile_filename = os.path.join(tmpdir, 'pdf-%d.pdf' % i)
                with open(infile_filename, mode='wb') as fd:
                    fd.write(content)
                infiles.append(infile_filename)
            if not infiles:
                raise APIError("no valid file found in 'files' property", http_status=400)
            # pdftk must run while the temporary directory still exists.
            pdf_content = self.run_pdftk(args=infiles + ['cat'])
        response = HttpResponse(pdf_content, content_type='application/pdf')
        # Escape backslashes and double quotes so the filename stays a valid
        # quoted-string in the header.
        quoted_filename = filename.replace('\\', '\\\\').replace('"', '\\"')
        response['Content-Disposition'] = 'attachment; filename="%s"' % quoted_filename
        return response