# Source: passerelle/passerelle/apps/pdf/models.py
#
# 287 lines
# 11 KiB
# Python

# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2023 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import base64
import os
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from collections import OrderedDict
from django.conf import settings
from django.core.exceptions import ValidationError
from django.db import models
from django.http.response import HttpResponse
from django.template.base import VariableDoesNotExist
from django.utils.translation import gettext_lazy as _
from passerelle.base.models import BaseResource
from passerelle.utils.api import endpoint
from passerelle.utils.jsonresponse import APIError
from passerelle.utils.models import resource_file_upload_to
from passerelle.utils.templates import render_to_string, validate_template
# JSON schema fragment for one PDF document embedded in a request payload.
# Only 'content' (the base64 encoded bytes) is mandatory; 'filename' and
# 'content_type' are optional.
PDF_FILE_OBJECT = {
    'type': 'object',
    'description': _('PDF file'),
    'required': ['content'],
    'properties': {
        'filename': {'type': 'string', 'description': _('file name')},
        'content_type': {'type': 'string', 'description': _('MIME content-type')},
        'content': {'type': 'string', 'description': _('file content, base64 encoded')},
    },
}
# JSON schema of the "assemble" endpoint payload: an output file name plus the
# list of PDF documents to concatenate.  Each item of 'files' is either a
# PDF_FILE_OBJECT, a bare base64 string, or null (ignored by the endpoint).
# NOTE(review): 'unflatten' presumably asks the endpoint machinery to expand
# flat keys such as 'files/0' into nested structures — confirm in
# passerelle.utils.api.
ASSEMBLE_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename', 'files'],
    'unflatten': True,
    'properties': OrderedDict(
        [
            (
                'filename',
                {
                    'description': _('output PDF filename'),
                    'type': 'string',
                },
            ),
            (
                'files',
                {
                    'type': 'array',
                    'description': _('PDF files to catenate'),
                    'items': {
                        'oneOf': [
                            PDF_FILE_OBJECT,
                            {'type': 'string', 'description': _('PDF content, base64 encoded')},
                            {'type': 'null', 'description': _('empty file, do not consider')},
                        ]
                    },
                },
            ),
        ]
    ),
}
# JSON schema of the "fill-form" endpoint payload.  Only 'filename' is
# required by the schema; 'input-form' (a PDF_FILE_OBJECT) and 'xfdf' (a
# nested dictionary of field values) are optional here and checked by the
# endpoint itself.
# NOTE(review): 'unflatten' presumably asks the endpoint machinery to expand
# flat keys such as 'xfdf/Page1[0]/FirstName[0]' into nested dictionaries —
# confirm in passerelle.utils.api.
FILL_FORM_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename'],
    'unflatten': True,
    'properties': OrderedDict(
        [
            (
                'filename',
                {
                    'description': _('output PDF filename'),
                    'type': 'string',
                },
            ),
            ('input-form', PDF_FILE_OBJECT),
            (
                'xfdf',
                {
                    'description': _('hierarchical dictionary of fields'),
                    'type': 'object',
                },
            ),
        ]
    ),
}
def validate_pdf(fieldfile):
    """Validator checking that a file starts with the PDF signature.

    The file is opened, its first five bytes are read and compared with
    ``%PDF-``.

    Raises:
        ValidationError: when the first five bytes are not ``%PDF-``.
    """
    fieldfile.open()
    signature = fieldfile.read(5)
    if signature == b'%PDF-':
        return
    raise ValidationError(
        _('%(value)s is not a PDF file'),
        params={'value': fieldfile},
    )
class Resource(BaseResource):
    """Connector exposing PDF operations implemented with the ``pdftk`` command.

    Two endpoints are provided: ``assemble`` concatenates several PDF files
    and ``fill-form`` fills a PDF form from XFDF data.
    """

    category = _('Misc')

    # PDF form used by fill-form when the payload carries no usable 'input-form'.
    fill_form_file = models.FileField(
        _('Fill Form default input file'),
        upload_to=resource_file_upload_to,
        help_text=_('PDF file, used if not input-form in fill-form payload'),
        validators=[validate_pdf],
        null=True,
        blank=True,
    )
    # Django template rendered with the payload to build the XFDF when the
    # fill-form payload has no 'xfdf' property.
    xfdf_template = models.FileField(
        _('XFDF Template'),
        upload_to=resource_file_upload_to,
        help_text=_('Django template, used to create a XFDF for fill-form, rendered with payload'),
        validators=[validate_template],
        null=True,
        blank=True,
    )

    class Meta:
        verbose_name = _('PDF')

    def run_pdftk(self, args):
        """Run pdftk and return what it wrote on its standard output.

        Args:
            args: list of pdftk arguments; the executable path is prepended
                and ``output -`` is appended so the result goes to stdout.

        Returns:
            bytes captured from the process.

        Raises:
            APIError: when pdftk exceeds ``settings.PDFTK_TIMEOUT`` or exits
                with a non-zero status.
        """
        args = [settings.PDFTK_PATH] + args + ['output', '-']
        try:
            # stderr is merged into stdout so that, on failure, the diagnostic
            # text is available in e.output for the error message below.
            # NOTE(review): as a consequence anything pdftk prints on stderr
            # during a successful run is mixed into the returned bytes —
            # confirm pdftk stays silent on success.
            return subprocess.check_output(args, timeout=settings.PDFTK_TIMEOUT, stderr=subprocess.STDOUT)
        except subprocess.TimeoutExpired as e:
            raise APIError('pdftk timed out after %s seconds' % e.timeout) from e
        except subprocess.CalledProcessError as e:
            raise APIError(
                'pdftk returned non-zero exit status %s (%r)' % (e.returncode, e.output)
            ) from e

    @staticmethod
    def _decode_base64(b64content, where):
        """Decode base64 payload content, reporting bad input as a 400 APIError.

        ``where`` names the payload property, for the error message.
        """
        try:
            return base64.b64decode(b64content)
        except ValueError as e:
            # binascii.Error (bad padding) is a ValueError subclass; non-ASCII
            # strings raise a plain ValueError.
            raise APIError('invalid base64 content in %s' % where, http_status=400) from e

    @staticmethod
    def _pdf_attachment(pdf_content, filename):
        """Build the PDF download response used by both endpoints."""
        response = HttpResponse(pdf_content, content_type='application/pdf')
        # filename comes from the request payload: escape the characters that
        # would otherwise terminate or corrupt the quoted header value.
        quoted = filename.replace('\\', '\\\\').replace('"', '\\"')
        response['Content-Disposition'] = 'attachment; filename="%s"' % quoted
        return response

    @staticmethod
    def _build_xfdf(fields):
        """Build the XFDF tree for ``fields``.

        Dictionaries become nested ``<field name="...">`` elements, any other
        value becomes a ``<value>`` element holding ``str(value)``.

        Returns:
            ``(root, f)`` where ``f`` is the ``<f>`` element whose ``href``
            must still be set to the input PDF path.
        """
        xfdf_root = ET.Element('xfdf')
        xfdf_root.attrib['xmlns'] = 'http://ns.adobe.com/xfdf/'
        xfdf_root.attrib['xml:space'] = 'preserve'
        xfdf_f = ET.SubElement(xfdf_root, 'f')
        xfdf_fields = ET.SubElement(xfdf_root, 'fields')

        def add_fields(element, fields):
            if isinstance(fields, dict):
                for key, subfields in fields.items():
                    field = ET.SubElement(element, 'field')
                    field.attrib['name'] = key
                    add_fields(field, subfields)
            else:
                value = ET.SubElement(element, 'value')
                value.text = str(fields)

        add_fields(xfdf_fields, fields)
        return xfdf_root, xfdf_f

    @endpoint(
        description=_('Returns the assembly of received PDF files'),
        perm='can_access',
        methods=['post'],
        post={
            'request_body': {'schema': {'application/json': ASSEMBLE_SCHEMA}},
            'input_example': {
                'filename': 'output.pdf',
                'files/0': {
                    'filename': 'example-1.pdf',
                    'content_type': 'application/pdf',
                    'content': 'JVBERi0xL...(base64 PDF)...',
                },
                'files/1': {
                    'filename': 'example-2.pdf',
                    'content_type': 'application/pdf',
                    'content': '//4lUERGL...(base64 PDF)...',
                },
                'files/2': '//4lUERGL...(base64 PDF)',
            },
        },
    )
    def assemble(self, request, post_data):
        """Concatenate the PDF files of the payload and return the result.

        Items of ``post_data['files']`` that are neither a dict with a
        non-empty ``content`` nor a non-empty string are skipped.

        Returns:
            an ``application/pdf`` attachment named ``post_data['filename']``.

        Raises:
            APIError: (HTTP 400) when no usable file is found or when a file
                is not valid base64; also whatever ``run_pdftk`` raises.
        """
        filename = post_data.pop('filename')
        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-assemble-' % self.id) as tmpdir:
            infiles = []
            for i, infile in enumerate(post_data['files']):
                if isinstance(infile, dict) and infile.get('content'):
                    b64content = infile['content']
                elif isinstance(infile, str) and infile:
                    b64content = infile
                else:
                    continue
                pdf_bytes = self._decode_base64(b64content, "'files/%d'" % i)
                # The index keeps names unique and preserves the payload order.
                infile_filename = os.path.join(tmpdir, 'pdf-%d.pdf' % i)
                with open(infile_filename, mode='wb') as fd:
                    fd.write(pdf_bytes)
                infiles.append(infile_filename)
            if not infiles:
                raise APIError("no valid file found in 'files' property", http_status=400)
            # pdftk must run while the temporary directory still exists.
            pdf_content = self.run_pdftk(args=infiles + ['cat'])
        return self._pdf_attachment(pdf_content, filename)

    @endpoint(
        name='fill-form',
        description=_('Fills the input PDF form with fields'),
        perm='can_access',
        methods=['post'],
        post={
            'request_body': {'schema': {'application/json': FILL_FORM_SCHEMA}},
            'input_example': {
                'filename': 'filled.pdf',
                'xfdf/Page1[0]/FirstName[0]': 'John',
                'xfdf/Page1[0]/LastName[0]': 'Doe',
                'xfdf/Page2[0]/Checkbox[0]': '0',
                'xfdf/Page2[0]/Checkbox[1]': '1',
            },
        },
    )
    def fill_form(self, request, post_data):
        """Fill a PDF form and return the filled document.

        The form is ``post_data['input-form']`` when it has content, else the
        resource's ``fill_form_file``.  Field values come from
        ``post_data['xfdf']`` when present, else from ``xfdf_template``
        rendered with the remaining payload.

        Returns:
            an ``application/pdf`` attachment named ``post_data['filename']``.

        Raises:
            APIError: (HTTP 400) when neither XFDF source or neither input
                form is available, when the input form is not valid base64,
                or when the template cannot be rendered; also whatever
                ``run_pdftk`` raises.
        """
        filename = post_data['filename']
        if 'xfdf' in post_data:
            fields = post_data.pop('xfdf')
        elif self.xfdf_template:
            fields = None
        else:
            raise APIError("missing 'xfdf' property (no XFDF template)", http_status=400)

        if fields is not None:
            xfdf_root, xfdf_f = self._build_xfdf(fields)

        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-fill-form-' % self.id) as tmpdir:
            input_form = post_data.get('input-form')
            if isinstance(input_form, dict) and input_form.get('content'):
                pdf_bytes = self._decode_base64(input_form['content'], "'input-form'")
                input_filename = os.path.join(tmpdir, 'input-form.pdf')
                with open(input_filename, mode='wb') as fd:
                    fd.write(pdf_bytes)
            elif self.fill_form_file:
                input_filename = self.fill_form_file.path
            else:
                raise APIError(
                    "missing or bad 'input-form' property (no default input file)", http_status=400
                )

            # create xfdf
            xfdf_filename = os.path.join(tmpdir, 'fields.xfdf')
            if fields is not None:
                xfdf_f.attrib['href'] = input_filename
                with open(xfdf_filename, mode='wb') as fd:
                    ET.indent(xfdf_root)
                    ET.ElementTree(xfdf_root).write(fd, encoding='UTF-8', xml_declaration=True)
            else:
                self.xfdf_template.seek(0)
                xfdf_template = self.xfdf_template.read().decode()
                try:
                    xfdf_content = render_to_string(xfdf_template, post_data)
                except VariableDoesNotExist as exc:
                    raise APIError("cannot render XFDF template: %s" % exc, http_status=400) from exc
                # The template was decoded as UTF-8 above; write it back the
                # same way instead of depending on the process locale.
                with open(xfdf_filename, mode='w', encoding='utf-8') as fd:
                    fd.write(xfdf_content)

            # call pdftk fill_form
            pdf_content = self.run_pdftk(args=[input_filename, 'fill_form', xfdf_filename])
        return self._pdf_attachment(pdf_content, filename)

    def pdftk_dump_data_fields_utf8(self):
        """Return an HTML fragment listing the fields of ``fill_form_file``.

        Each line of ``pdftk dump_data_fields_utf8`` is prefixed with
        ``<br>``; ``FieldName:`` lines are followed by the matching
        ``xfdf/...`` payload key, obtained by replacing dots with slashes.

        Returns:
            ``None`` when no default form is configured, an ``Error: ...``
            string when pdftk fails, the HTML fragment otherwise.
        """
        if not self.fill_form_file:
            return None
        try:
            dump = self.run_pdftk(args=[self.fill_form_file.path, 'dump_data_fields_utf8']).decode()
        except APIError as apierror:
            return 'Error: %r' % apierror
        # NOTE(review): dump lines are inserted without HTML escaping — confirm
        # how the consuming template treats this string.
        prefix = 'FieldName: '
        parts = []
        for line in dump.splitlines():
            parts.append('<br>%s' % line)
            if line.startswith(prefix):
                parts.append(' → <b>xfdf/%s</b>' % line[len(prefix):].replace('.', '/'))
        return ''.join(parts)