# passerelle/passerelle/apps/pdf/models.py

# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2023 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import base64
import os
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from collections import OrderedDict

from django.conf import settings
from django.core.exceptions import ValidationError
from django.db import models
from django.http.response import HttpResponse
from django.template.base import VariableDoesNotExist
from django.utils.translation import gettext_lazy as _

from passerelle.base.models import BaseResource
from passerelle.utils.api import endpoint
from passerelle.utils.jsonresponse import APIError
from passerelle.utils.models import resource_file_upload_to
from passerelle.utils.templates import render_to_string, validate_template

# JSON schema fragment describing a single PDF file as it appears in request
# payloads: an object whose only mandatory property is 'content' (the file,
# base64 encoded); 'filename' and 'content_type' are optional strings.
# Reused below by ASSEMBLE_SCHEMA (items of 'files') and FILL_FORM_SCHEMA
# ('input-form').
PDF_FILE_OBJECT = {
    'type': 'object',
    'description': _('PDF file'),
    'required': ['content'],
    'properties': {
        'filename': {
            'type': 'string',
            'description': _('file name'),
        },
        'content_type': {
            'type': 'string',
            'description': _('MIME content-type'),
        },
        'content': {
            'type': 'string',
            'description': _('file content, base64 encoded'),
        },
    },
}

# JSON schema (draft-04) of the payload accepted by the 'assemble' endpoint.
# Both 'filename' (name of the produced PDF) and 'files' are required.
# Each item of 'files' may be a PDF file object, a bare base64 string, or
# null (null entries are ignored by the endpoint).
# NOTE(review): 'unflatten' is interpreted outside this file; presumably it
# lets callers send flattened keys such as 'files/0' (as in the endpoint's
# input example) — confirm against the endpoint machinery.
ASSEMBLE_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename', 'files'],
    'unflatten': True,
    'properties': OrderedDict(
        {
            'filename': {
                'description': _('output PDF filename'),
                'type': 'string',
            },
            'files': {
                'type': 'array',
                'description': _('PDF files to catenate'),
                'items': {
                    'oneOf': [
                        PDF_FILE_OBJECT,
                        {'type': 'string', 'description': _('PDF content, base64 encoded')},
                        {'type': 'null', 'description': _('empty file, do not consider')},
                    ]
                },
            },
        }
    ),
}

# JSON schema (draft-04) of the payload accepted by the 'fill-form' endpoint.
# Only 'filename' (name of the produced PDF) is required by the schema:
# 'input-form' (the PDF form to fill) and 'xfdf' (nested dictionary of field
# values) are optional here because the endpoint can fall back on the
# resource's default input file and XFDF template.
# NOTE(review): 'unflatten' is interpreted outside this file; presumably it
# lets callers send flattened keys such as 'xfdf/Page1[0]/FirstName[0]' (as in
# the endpoint's input example) — confirm against the endpoint machinery.
FILL_FORM_SCHEMA = {
    '$schema': 'http://json-schema.org/draft-04/schema#',
    'title': '',
    'description': '',
    'type': 'object',
    'required': ['filename'],
    'unflatten': True,
    'properties': OrderedDict(
        {
            'filename': {
                'description': _('output PDF filename'),
                'type': 'string',
            },
            'input-form': PDF_FILE_OBJECT,
            'xfdf': {
                'description': _('hierarchical dictionary of fields'),
                'type': 'object',
            },
        }
    ),
}


def validate_pdf(fieldfile):
    """Validate that ``fieldfile`` starts with the PDF magic bytes.

    The file is opened, its first five bytes are compared with ``%PDF-`` and
    the stream is rewound, so that whoever reads the same file object next
    starts from the beginning instead of from offset 5.

    Raises ValidationError when the header does not match.
    """
    fieldfile.open()
    header = fieldfile.read(5)
    # do not leave the stream positioned after the sniffed header
    fieldfile.seek(0)
    if header != b'%PDF-':
        raise ValidationError(
            _('%(value)s is not a PDF file'),
            params={'value': fieldfile},
        )


class Resource(BaseResource):
    """Connector exposing pdftk operations (assemble, fill-form) as endpoints."""

    category = _('Misc')

    # Optional default PDF form, used by fill-form when the payload carries no
    # usable 'input-form'.
    fill_form_file = models.FileField(
        _('Fill Form default input file'),
        upload_to=resource_file_upload_to,
        help_text=_('PDF file, used if not input-form in fill-form payload'),
        validators=[validate_pdf],
        null=True,
        blank=True,
    )
    # Optional Django template, rendered with the payload to produce the XFDF
    # when the payload carries no 'xfdf'.
    xfdf_template = models.FileField(
        _('XFDF Template'),
        upload_to=resource_file_upload_to,
        help_text=_('Django template, used to create a XFDF for fill-form, rendered with payload'),
        validators=[validate_template],
        null=True,
        blank=True,
    )

    class Meta:
        verbose_name = _('PDF')

    def run_pdftk(self, args):
        """Run pdftk with ``args`` and return the bytes it writes on stdout.

        The ``settings.PDFTK_PATH`` binary is called with ``args`` followed by
        ``output -`` (result on stdout).

        Raises APIError when pdftk runs longer than ``settings.PDFTK_TIMEOUT``
        seconds or exits with a non-zero status.
        """
        args = [settings.PDFTK_PATH] + args + ['output', '-']
        try:
            # NOTE(review): stderr is merged into stdout, which makes pdftk's
            # messages available in e.output below; anything pdftk prints on
            # stderr during a successful run would also be part of the
            # returned bytes — confirm this is acceptable.
            return subprocess.check_output(args, timeout=settings.PDFTK_TIMEOUT, stderr=subprocess.STDOUT)
        except subprocess.TimeoutExpired as e:
            raise APIError('pdftk timed out after %s seconds' % e.timeout) from e
        except subprocess.CalledProcessError as e:
            raise APIError(
                'pdftk returned non-zero exit status %s (%r)' % (e.returncode, e.output)
            ) from e

    @staticmethod
    def _write_base64_file(path, b64content, source):
        """Decode ``b64content`` and write the result to ``path``.

        ``source`` names the payload property the content comes from; it is
        only used in the error message.

        Raises APIError (HTTP 400) when the content cannot be base64-decoded.
        """
        try:
            content = base64.b64decode(b64content)
        except ValueError as e:
            # binascii.Error (bad padding) is a ValueError subclass; a str with
            # non-ASCII characters raises a plain ValueError
            raise APIError("invalid base64 content in '%s' property" % source, http_status=400) from e
        with open(path, mode='wb') as fd:
            fd.write(content)

    @staticmethod
    def _pdf_attachment(pdf_content, filename):
        """Build the HTTP response returning ``pdf_content`` as an attachment named ``filename``."""
        response = HttpResponse(pdf_content, content_type='application/pdf')
        response['Content-Disposition'] = 'attachment; filename="%s"' % filename
        return response

    @endpoint(
        description=_('Returns the assembly of received PDF files'),
        perm='can_access',
        methods=['post'],
        post={
            'request_body': {'schema': {'application/json': ASSEMBLE_SCHEMA}},
            'input_example': {
                'filename': 'output.pdf',
                'files/0': {
                    'filename': 'example-1.pdf',
                    'content_type': 'application/pdf',
                    'content': 'JVBERi0xL...(base64 PDF)...',
                },
                'files/1': {
                    'filename': 'example-2.pdf',
                    'content_type': 'application/pdf',
                    'content': '//4lUERGL...(base64 PDF)...',
                },
                'files/2': '//4lUERGL...(base64 PDF)',
            },
        },
    )
    def assemble(self, request, post_data):
        """Concatenate the PDF files of the payload, in order, with ``pdftk cat``.

        Each entry of ``post_data['files']`` is either an object with a base64
        'content' or a base64 string; entries that are null, empty or without
        content are skipped.

        Returns an ``application/pdf`` attachment named after
        ``post_data['filename']``.

        Raises APIError (HTTP 400) when no usable file is found or when a
        content is not valid base64, and APIError when pdftk fails.
        """
        filename = post_data.pop('filename')

        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-assemble-' % self.id) as tmpdir:
            infiles = []
            for i, infile in enumerate(post_data['files']):
                if isinstance(infile, dict) and infile.get('content'):
                    b64content = infile['content']
                elif isinstance(infile, str) and infile:
                    b64content = infile
                else:
                    # null, empty string or object without content: ignored
                    continue
                infile_filename = os.path.join(tmpdir, 'pdf-%d.pdf' % i)
                self._write_base64_file(infile_filename, b64content, 'files/%d' % i)
                infiles.append(infile_filename)
            if not infiles:
                raise APIError("no valid file found in 'files' property", http_status=400)
            # pdftk must run while the temporary input files still exist
            pdf_content = self.run_pdftk(args=infiles + ['cat'])

        return self._pdf_attachment(pdf_content, filename)

    @endpoint(
        name='fill-form',
        description=_('Fills the input PDF form with fields'),
        perm='can_access',
        methods=['post'],
        post={
            'request_body': {'schema': {'application/json': FILL_FORM_SCHEMA}},
            'input_example': {
                'filename': 'filled.pdf',
                'xfdf/Page1[0]/FirstName[0]': 'John',
                'xfdf/Page1[0]/LastName[0]': 'Doe',
                'xfdf/Page2[0]/Checkbox[0]': '0',
                'xfdf/Page2[0]/Checkbox[1]': '1',
            },
        },
    )
    def fill_form(self, request, post_data):
        """Fill a PDF form with ``pdftk fill_form`` and return the result.

        Field values come from ``post_data['xfdf']`` (nested dictionary turned
        into an XFDF document) or, when that key is absent, from the
        resource's XFDF template rendered with the payload. The form comes
        from ``post_data['input-form']`` or, failing that, from the resource's
        default input file.

        Returns an ``application/pdf`` attachment named after
        ``post_data['filename']``.

        Raises APIError (HTTP 400) when neither fields nor a template are
        available, when no input form is available, when the input form is
        not valid base64 or when the template cannot be rendered, and
        APIError when pdftk fails.
        """
        filename = post_data['filename']
        if 'xfdf' in post_data:
            fields = post_data.pop('xfdf')
        elif self.xfdf_template:
            # no explicit fields: the XFDF is rendered from the template below
            fields = None
        else:
            raise APIError("missing 'xfdf' property (no XFDF template)", http_status=400)

        if fields is not None:
            xfdf_root = ET.Element('xfdf')
            xfdf_root.attrib['xmlns'] = 'http://ns.adobe.com/xfdf/'
            xfdf_root.attrib['xml:space'] = 'preserve'
            # <f href="..."> is completed once the input file name is known
            xfdf_f = ET.SubElement(xfdf_root, 'f')
            xfdf_fields = ET.SubElement(xfdf_root, 'fields')

            def add_fields(element, fields):
                # dictionaries become nested <field name="..."> elements,
                # anything else becomes the <value> of the current field
                if isinstance(fields, dict):
                    for key in fields:
                        field = ET.SubElement(element, 'field')
                        field.attrib['name'] = key
                        add_fields(field, fields[key])
                else:
                    value = ET.SubElement(element, 'value')
                    value.text = str(fields)

            add_fields(xfdf_fields, fields)

        with tempfile.TemporaryDirectory(prefix='passerelle-pdftk-%s-fill-form-' % self.id) as tmpdir:
            input_form = post_data.get('input-form')
            if isinstance(input_form, dict) and input_form.get('content'):
                input_filename = os.path.join(tmpdir, 'input-form.pdf')
                self._write_base64_file(input_filename, input_form['content'], 'input-form')
            elif self.fill_form_file:
                input_filename = self.fill_form_file.path
            else:
                raise APIError(
                    "missing or bad 'input-form' property (no default input file)", http_status=400
                )
            # create xfdf
            xfdf_filename = os.path.join(tmpdir, 'fields.xfdf')
            if fields is not None:
                xfdf_f.attrib['href'] = input_filename
                with open(xfdf_filename, mode='wb') as fd:
                    ET.indent(xfdf_root)
                    ET.ElementTree(xfdf_root).write(fd, encoding='UTF-8', xml_declaration=True)
            else:
                self.xfdf_template.seek(0)
                xfdf_template = self.xfdf_template.read().decode()
                try:
                    xfdf_content = render_to_string(xfdf_template, post_data)
                except VariableDoesNotExist as exc:
                    raise APIError("cannot render XFDF template: %s" % exc, http_status=400) from exc
                # the template was decoded as UTF-8 above: write the rendered
                # result with the same encoding rather than the locale's one
                with open(xfdf_filename, mode='w', encoding='utf-8') as fd:
                    fd.write(xfdf_content)

            # call pdftk fill_form
            pdf_content = self.run_pdftk(args=[input_filename, 'fill_form', xfdf_filename])

        return self._pdf_attachment(pdf_content, filename)

    def pdftk_dump_data_fields_utf8(self):
        """Return an HTML rendering of ``pdftk dump_data_fields_utf8`` for the default form.

        Each line of the dump is preceded by ``<br>``; ``FieldName:`` lines are
        followed by the matching ``xfdf/...`` payload key (dots of the field
        name replaced by slashes) in bold.

        Returns None when the resource has no default input file, and an
        ``Error: ...`` string when pdftk fails.
        """
        # local import: the module does not import html at top level
        from html import escape

        if not self.fill_form_file:
            return
        try:
            dump = self.run_pdftk(args=[self.fill_form_file.path, 'dump_data_fields_utf8']).decode()
        except APIError as apierror:
            return 'Error: %s' % escape(repr(apierror), quote=False)
        prefix = 'FieldName: '
        parts = []
        for line in dump.splitlines():
            # the result mixes markup with text extracted from the PDF:
            # escape the text so that it cannot inject markup of its own
            line = escape(line, quote=False)
            parts.append('<br>%s' % line)
            if line.startswith(prefix):
                parts.append(' → <b>xfdf/%s</b>' % line[len(prefix) :].replace('.', '/'))
        return ''.join(parts)