debian-weasyprint/weasyprint/pdf.py

# coding: utf-8
"""
    weasyprint.pdf
    --------------

    Post-process the PDF files created by cairo and add metadata such as
    hyperlinks and bookmarks.

    :copyright: Copyright 2011-2014 Simon Sapin and contributors, see AUTHORS.
    :license: BSD, see LICENSE for details.

"""

from __future__ import division, unicode_literals

import hashlib
import io
import mimetypes
import sys
import zlib

import cairocffi as cairo
from pdfrw import PdfArray, PdfDict, PdfName, PdfReader, PdfString, PdfWriter

from . import VERSION_STRING, Attachment
from .compat import izip, unquote
from .html import W3C_DATE_RE
from .logger import LOGGER
from .urls import URLFetchingError, iri_to_uri, urlsplit


def convert_bookmarks_units(bookmarks, matrices):
    """Return a copy of the bookmark tree with transformed target points.

    :param bookmarks:
        an iterable of ``(label, (page, x, y), children)`` tuples, where
        ``children`` has the same structure recursively.
    :param matrices:
        a sequence indexed by page number; each item provides a
        ``transform_point(x, y)`` method returning the new ``(x, y)``.
    :return:
        a new list with the same structure, where each ``x, y`` went
        through the matrix of the target page.

    """
    def convert(bookmark):
        title, (page_index, old_x, old_y), subtree = bookmark
        new_x, new_y = matrices[page_index].transform_point(old_x, old_y)
        return (
            title, (page_index, new_x, new_y),
            convert_bookmarks_units(subtree, matrices))

    return [convert(bookmark) for bookmark in bookmarks]


def prepare_metadata(document, scale, pages):
    """Turn the document metadata into structures close to PDF objects.

    Coordinates are converted from WeasyPrint units (CSS pixels from
    the top-left corner) to PDF units (points from the bottom-left corner.)

    :param document:
        the document providing ``pages``, ``resolve_links()`` and
        ``make_bookmark_tree()``.
    :param scale:
        PDF points per CSS pixels.
        Defaults to 0.75, but is affected by `zoom` in
        :meth:`weasyprint.document.Document.write_pdf`.
    :param pages:
        the PDF page objects; the ``indirect`` attribute of the target page
        is stored in internal link targets.
    :return:
        a ``(bookmarks, links)`` tuple, where ``links`` holds one list of
        ``(link_type, target, (x0, y0, x1, y1))`` per page.

    """
    # One matrix per page.
    # X and width are only scaled;  Y' = page_height - Y;  height' = -height
    matrices = []
    for document_page in document.pages:
        matrices.append(cairo.Matrix(
            xx=scale, yy=-scale, y0=document_page.height * scale))

    links = []
    for page_links, matrix in izip(document.resolve_links(), matrices):
        converted_links = []
        for link_type, target, rectangle in page_links:
            if link_type == 'internal':
                target_page, target_x, target_y = target
                page_reference = pages[target_page].indirect
                target_point = matrices[target_page].transform_point(
                    target_x, target_y)
                target = (page_reference,) + target_point
            left, top, width, height = rectangle
            x0, y0 = matrix.transform_point(left, top)
            delta_x, delta_y = matrix.transform_distance(width, height)
            # x, y, w, h => x0, y0, x1, y1
            converted_links.append(
                (link_type, target, (x0, y0, x0 + delta_x, y0 + delta_y)))
        links.append(converted_links)

    bookmark_tree = document.make_bookmark_tree()
    return convert_bookmarks_units(bookmark_tree, matrices), links


def _create_compressed_file_object(source):
    """
    Create a file like object as ``/EmbeddedFile`` compressing it with deflate.

    :param source:
        a binary file-like object. It is read in chunks of 4096 bytes until
        ``read()`` returns ``b''``.
    :return:
        the object representing the compressed file stream object. Its
        ``Params`` hold the MD5 checksum and the size of the uncompressed
        data.
    """
    md5 = hashlib.md5()
    compress = zlib.compressobj()

    pdf_file_object = PdfDict(
        Type=PdfName('EmbeddedFile'), Filter=PdfName('FlateDecode'))

    # Gather the compressed chunks and join them once at the end: growing
    # the stream with ``+=`` copies everything accumulated so far for each
    # chunk, which is quadratic for big attachments.
    size = 0
    compressed_chunks = []
    for data in iter(lambda: source.read(4096), b''):
        size += len(data)
        md5.update(data)
        compressed_chunks.append(compress.compress(data))
    compressed_chunks.append(compress.flush(zlib.Z_FINISH))

    # pdfrw needs Latin-1-decoded unicode strings in object.stream
    pdf_file_object.stream = b''.join(compressed_chunks).decode('latin-1')
    pdf_file_object.Params = PdfDict(
        CheckSum=PdfString('<{}>'.format(md5.hexdigest())), Size=size)
    return pdf_file_object


def _get_filename_from_result(url, result):
    """
    Derives a filename from a fetched resource. This is either the filename
    returned by the URL fetcher, the last URL path component or a synthetic
    name if the URL has no path
    """

    filename = None

    # A given filename will always take precedence
    if result:
        filename = result.get('filename')
        if filename:
            return filename

    # The URL path likely contains a filename, which is a good second guess
    if url:
        split = urlsplit(url)
        if split.scheme != 'data':
            filename = split.path.split("/")[-1]
            if filename == '':
                filename = None

    if filename is None:
        # The URL lacks a path altogether. Use a synthetic name.

        # Using guess_extension is a great idea, but sadly the extension is
        # probably random, depending on the alignment of the stars, which car
        # you're driving and which software has been installed on your machine.
        #
        # Unfortuneatly this isn't even imdepodent on one machine, because the
        # extension can depend on PYTHONHASHSEED if mimetypes has multiple
        # extensions to offer
        extension = None
        if result:
            mime_type = result.get('mime_type')
            if mime_type == 'text/plain':
                # text/plain has a phletora of extensions - all garbage
                extension = '.txt'
            else:
                extension = mimetypes.guess_extension(mime_type) or '.bin'
        else:
            extension = '.bin'

        filename = 'attachment' + extension
    else:
        if sys.version_info[0] < 3:
            # Python 3 unquotes with UTF-8 per default, here we have to do it
            # manually
            # TODO: this assumes that the filename has been quoted as UTF-8.
            # I'm not sure if this assumption holds, as there is some magic
            # involved with filesystem encoding in other parts of the code
            filename = unquote(filename)
            if not isinstance(filename, bytes):
                filename = filename.encode('latin1')
            filename = filename.decode('utf-8')
        else:
            filename = unquote(filename)

    return filename


def _create_pdf_attachment(attachment, url_fetcher):
    """
    Create an attachment to the PDF stream

    :param attachment:
        an :class:`Attachment`, a ``(url, description)`` tuple, or any other
        value that is passed to :class:`Attachment` as ``guess``.
    :param url_fetcher:
        the URL fetcher given to the :class:`Attachment` objects created
        here.
    :return:
        the object representing the ``/Filespec`` object or :obj:`None` if the
        attachment couldn't be read.
    """
    try:
        if isinstance(attachment, tuple):
            # Attachments from document links like <link> or <a> can only be
            # URLs. They're passed in as tuples
            href, title = attachment
            attachment = Attachment(
                url=href, url_fetcher=url_fetcher, description=title)
        elif not isinstance(attachment, Attachment):
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (_source_type, stream, url, _):
            # Raw bytes are wrapped so that they can be read in chunks
            if isinstance(stream, bytes):
                stream = io.BytesIO(stream)
            pdf_file_object = _create_compressed_file_object(stream)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    file_name = _get_filename_from_result(url, None)
    description = attachment.description or ''
    return PdfDict(
        Type=PdfName('Filespec'),
        F=PdfString.encode(''),
        UF=PdfString.encode(file_name),
        EF=PdfDict(F=pdf_file_object),
        Desc=PdfString.encode(description))


def create_bookmarks(bookmarks, pages, parent=None):
    """Create the PDF objects of a bookmark tree.

    :param bookmarks:
        a sequence of ``(label, target, children)`` tuples, where ``target``
        is indexable as ``(page number, x, y)`` and ``children`` has the same
        structure recursively.
    :param pages:
        the PDF page objects, indexed by page number.
    :param parent:
        the object set as ``Parent`` of the created objects, if not
        :obj:`None`.
    :return:
        a ``(objects, count)`` tuple: the objects created for this level,
        linked to each other with ``Prev`` and ``Next``, and the number of
        bookmarks of this level plus those of all the levels below.

    """
    total = len(bookmarks)
    siblings = []
    for title, target, subtree in bookmarks:
        goto_action = PdfDict(
            Type=PdfName('Action'), S=PdfName('GoTo'),
            D=PdfArray((
                pages[target[0]].indirect,
                PdfName('XYZ'), target[1], target[2], 0)))
        outline_item = PdfDict(Title=PdfString.encode(title), A=goto_action)
        outline_item.indirect = True

        descendants, descendant_count = create_bookmarks(
            subtree, pages, parent=outline_item)
        outline_item.Count = 1 + descendant_count

        # Chain the new item after the previous one of the same level
        if siblings:
            previous = siblings[-1]
            outline_item.Prev = previous
            previous.Next = outline_item
        if descendants:
            outline_item.First = descendants[0]
            outline_item.Last = descendants[-1]
        if parent is not None:
            outline_item.Parent = parent

        total += descendant_count
        siblings.append(outline_item)
    return siblings, total


def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """Rewrite the PDF held by a seekable file-like object, adding metadata.

    The PDF is read from ``fileobj`` and completed with bookmarks, embedded
    files, link annotations, document information and page boxes. It is then
    written back from the start of ``fileobj``, which is finally truncated.

    :param document:
        the rendered document. Its ``pages`` are paired in order with the
        pages of the PDF.
    :param fileobj:
        a seekable file-like object containing the PDF to post-process.
    :param scale:
        PDF points per CSS pixels, used to convert the links, the bookmarks
        and the bleed.
    :param metadata:
        an object providing the ``attachments``, ``title``, ``description``,
        ``generator``, ``authors``, ``keywords``, ``created`` and
        ``modified`` attributes.
    :param attachments:
        a list of additional attachments to embed, or :obj:`None`.
    :param url_fetcher:
        the URL fetcher used to load the attachments.

    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        bookmark_objects, count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(
            Type=PdfName('Outlines'), Count=count,
            First=bookmark_objects[0], Last=bookmark_objects[-1])

    attachments = metadata.attachments + (attachments or [])
    if attachments:
        embedded_files = []
        for attachment in attachments:
            attachment_object = _create_pdf_attachment(attachment, url_fetcher)
            # Attachments that couldn't be loaded are skipped
            if attachment_object is not None:
                embedded_files.append(PdfString.encode('attachment'))
                embedded_files.append(attachment_object)
        if embedded_files:
            trailer.Root.Names = PdfDict(
                EmbeddedFiles=PdfDict(Names=PdfArray(embedded_files)))

    # A single link can be split in multiple regions. We don't want to embed
    # a file multiple times of course, so keep a reference to every embedded
    # URL and reuse the object number.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, rectangle in page_links:
            if link_type == 'attachment' and target not in annot_files:
                # TODO: use the title attribute as description
                annot_files[target] = _create_pdf_attachment(
                    (target, None), url_fetcher)

    # TODO: splitting a link into multiple independent rectangular annotations
    # works well for pure links, but rather mediocre for other annotations and
    # fails completely for transformed (CSS) or complex link shapes (area).
    # It would be better to use /AP for all links and coalesce link shapes that
    # originate from the same HTML link. This would give a feeling similar to
    # what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            # Attachments that couldn't be loaded fall back to plain URI links
            if link_type != 'attachment' or annot_files[target] is None:
                annotation = PdfDict(
                    Type=PdfName('Annot'), Subtype=PdfName('Link'),
                    Rect=PdfArray(rectangle), Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (
                        target[0], PdfName('XYZ'), target[1], target[2], 0)
                    annotation.A = PdfDict(
                        Type=PdfName('Action'), S=PdfName('GoTo'),
                        D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(
                        Type=PdfName('Action'), S=PdfName('URI'),
                        URI=PdfString.encode(iri_to_uri(target)))
            else:
                ap = PdfDict(N=PdfDict(
                    BBox=PdfArray(rectangle), Subtype=PdfName('Form'),
                    Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(
                    Type=PdfName('Annot'), Subtype=PdfName('FileAttachment'),
                    T=PdfString.encode(''), Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)), FS=annot_files[target],
                    AP=ap)
            annotations.append(annotation)

        if annotations:
            page.Annots = annotations

    trailer.Info.Producer = VERSION_STRING
    # Plain string values
    for attr, key in (('title', 'Title'), ('description', 'Subject'),
                      ('generator', 'Creator')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, value)
    # Lists of strings, stored as comma-separated values
    for attr, key in (('authors', 'Author'), ('keywords', 'Keywords')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, ', '.join(value))
    # W3C dates, converted to PDF dates; invalid dates are skipped
    for attr, key in (('created', 'CreationDate'), ('modified', 'ModDate')):
        value = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if value is not None:
            setattr(trailer.Info, key, value)

    for page, document_page in zip(pages, document.pages):
        # NOTE(review): PDF rectangles are usually given as lower-left then
        # upper-right corners, so "top" and "bottom" may be swapped here when
        # the top and bottom bleeds differ - confirm.
        left, top, right, bottom = (float(value) for value in page.MediaBox)
        # Convert pixels into points. ``scale`` is the number of PDF points
        # per CSS pixel, it is only equal to 0.75 when the zoom is 1.
        bleed = {
            key: value * scale for key, value in document_page.bleed.items()}

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray((trim_left, trim_top, trim_right, trim_bottom))

        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox) and
        # CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])
        page.BleedBox = PdfArray(
            (bleed_left, bleed_top, bleed_right, bleed_bottom))

    # Overwrite the original PDF and drop what may remain of it after the
    # end of the new one.
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()


def w3c_date_to_pdf(string, attr_name):
    """
    Convert a W3C date string into a PDF date string.

    YYYYMMDDHHmmSSOHH'mm'

    :param string:
        the W3C date, or :obj:`None`.
    :param attr_name:
        the name of the metadata attribute, only used in the warning logged
        for invalid dates.
    :return:
        the PDF date, or :obj:`None` if ``string`` is :obj:`None` or doesn't
        match ``W3C_DATE_RE``.

    """
    if string is None:
        return None
    match = W3C_DATE_RE.match(string)
    if match is None:
        LOGGER.warning('Invalid %s date: %r', attr_name, string)
        return None

    fields = match.groupdict()
    # The year is always there, the other fields are added when available
    parts = [fields['year']]
    for name in ('month', 'day', 'hour', 'minute', 'second'):
        parts.append(fields[name] or '')

    if fields['hour']:
        assert fields['minute']
        # Seconds and timezone are always written when there is a time
        if not fields['second']:
            parts.append('00')
        tz_hour = fields['tz_hour']
        if tz_hour:
            assert tz_hour.startswith(('+', '-'))
            assert fields['tz_minute']
            parts.append("%s'%s'" % (tz_hour, fields['tz_minute']))
        else:
            parts.append('Z')  # UTC
    return ''.join(parts)