tabellioOOo/legi2pdf/lib/legi2pdf/pdfGenerator.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Tabellio -- software suite for deliberative assemblies
#          -- suite logicielle pour assemblées délibératives
#          -- http://www.tabellio.org/
# Copyright (C) 2006 Parlement de la Communauté française de Belgique

# This file is part of Tabellio.

# Tabellio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# Tabellio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

import sys
import os
import re
import threading
import time
import string
import math
import tempfile
import getopt
import cStringIO
import shutil
import libxml2
import libxslt
import logging
import subprocess

try:
    import elementtree.ElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

# Name of the main XML entry looked up in an unpacked .legi archive; also the
# file name under which plain XML input is stored in the working directory.
NAME_CONTENT_LEGI = 'contents.xml'

# from pythonlib
import xmlutils
from xmlutils import parseStream, applyStylesheet2
from epsutils import eps2pdf
from ziputils import unzipToDirectory
from utf8utils import utf8encode
import magic

try:
    import PIL
    import PIL.Image
except ImportError:
    PIL = None

# Lock taken by makeTempFileName() while it generates a temporary name.
tempfilelock = threading.Lock()
# NOTE(review): looks intended as a time-based prefix for generated temporary
# names -- confirm that tempfile.mktemp() honours this module attribute on the
# Python version in use.
tempfile.template = "F%ld-" % time.time()

# Root logger (empty logger name).
log = logging.getLogger("")

# Directory containing this module; the "xsl" and "extra" directories are
# located relative to it.
abspath = os.path.abspath(os.path.dirname(__file__))
xslpath = os.path.join(abspath,"..","..","xsl")

# Default stylesheet locations.  convertLegi2Tex() rebinds these globals
# (and xslpath) depending on its legacyMode argument.
xslPreprocessingFilterName = os.path.join(xslpath,"pre_proc.xsl")
xslPostprocessingFilterName = os.path.join(xslpath,"post_proc.xsl")
xslMainprocessingFilterName = os.path.join(xslpath,"main.xsl")
xslAnnexPreprocessingFilterName = os.path.join(xslpath,"annex_pre_proc.xsl")
xslCopyProcessingFilterName = os.path.join(xslpath,"copy_proc.xsl")

class PdfGeneratorException(Exception):
    """Error raised when a document cannot be converted to PDF.

    The class derives from Exception so that generic ``except Exception``
    handlers see it and so that it stays raisable on interpreters that
    only accept exception classes derived from BaseException.

    Attributes:
    message -- human readable description of the failure
    """

    def __init__(self, message=""):
        Exception.__init__(self, message)
        # Stored explicitly because __str__ returns it.
        self.message = message

    def __str__(self):
        return self.message


def _writeToFile(filename, data):
    """Write data to filename in binary mode, replacing previous content.

    The file handle is closed even when the write fails.
    """
    handle = open(filename, "wb")
    try:
        handle.write(data)
    finally:
        handle.close()

def makeTempFileName(suffix="",subdir=None):
    """Return a new temporary path name below <system tmp dir>/legi2pdf.

    Arguments:
    suffix -- text appended to the generated name
    subdir -- optional sub-directory of the legi2pdf directory in which
              the name is generated (the sub-directory is not created)

    Only a name is produced: nothing is created at the returned path, so
    the caller has to create the file or directory itself.

    The target directory is passed to tempfile.mktemp() directly instead
    of temporarily reassigning the process-wide tempfile.tempdir, which
    the previous implementation did outside of the lock.
    """
    baseDir = os.path.abspath(os.path.join(tempfile.gettempdir(), "legi2pdf"))
    tempfilelock.acquire()
    try:
        if not os.path.isdir(baseDir):
            try:
                os.mkdir(baseDir)
            except OSError:
                # Another process may have created the directory between
                # the check and the mkdir; only fail if it is still missing.
                if not os.path.isdir(baseDir):
                    raise
        if subdir:
            targetDir = os.path.join(baseDir, subdir)
        else:
            targetDir = baseDir
        return tempfile.mktemp(dir=targetDir) + suffix
    finally:
        tempfilelock.release()

def purgeDirectory(d):
    """Delete directory d together with everything it contains.

    Sub-directories are removed as well; the previous implementation
    called os.remove() on every entry and therefore failed as soon as
    the directory contained a sub-directory.

    Raises OSError when d does not exist or cannot be removed.
    """
    shutil.rmtree(d)

# Matches the first unsigned integer or decimal number in a dimension string.
bboxRE = re.compile(r'\d+(?:\.\d+)?')

def _convertBboxVal(val):
    """Return the first number found in val, rounded half up, as a string.

    Arguments:
    val -- text containing a number, e.g. "12.5pt"

    Raises ValueError when val contains no number.
    """
    match = bboxRE.search(val)
    if match is None:
        raise ValueError("no numeric value found in %r" % (val,))
    # float() replaces the deprecated string.atof(); floor(x + 0.5) rounds
    # halves upwards.
    return "%d" % math.floor(float(match.group(0)) + 0.5)

def unzipAnnexToDir(annexes, tempDir):
    """Make every annex available inside tempDir.

    Arguments:
    annexes -- sequence of annex definitions; item [0] of each one is the
               path of the annex file
    tempDir -- destination directory

    Annexes whose path ends with ".legi" are unpacked with
    unzipToDirectory(), using "<index>_" as third argument; any other
    file is copied into tempDir under its base name.
    """
    for index, annex in enumerate(annexes):
        sourcePath = annex[0]
        if sourcePath.endswith(".legi"):
            unzipToDirectory(sourcePath, tempDir, "%d_" % index)
            continue
        targetPath = os.path.join(tempDir, os.path.basename(sourcePath))
        shutil.copy(sourcePath, targetPath)

def getBboxes(d):
    """Collect the declared size of every image referenced by the document.

    Parses the NAME_CONTENT_LEGI file found in directory d and returns a
    dictionary mapping the "fileref" attribute of each <imagedata> element
    to a (width, depth) tuple; both values are converted with
    _convertBboxVal().  Elements lacking one of the "fileref", "width" or
    "depth" attributes are ignored.
    """
    sizes = {}
    xmlDoc = None
    xpathCtx = None
    try:
        xmlDoc = libxml2.parseFile(os.path.join(d, NAME_CONTENT_LEGI))
        xpathCtx = xmlDoc.xpathNewContext()

        for node in xpathCtx.xpathEval("//imagedata"):
            if node.name != "imagedata":
                continue
            fileref = node.hasProp("fileref")
            width = node.hasProp("width")
            depth = node.hasProp("depth")
            if not (fileref and width and depth):
                continue
            sizes[fileref.getContent()] = (
                    _convertBboxVal(width.getContent()),
                    _convertBboxVal(depth.getContent()))
    finally:
        # Release the libxml2 objects whatever happened above.
        if xmlDoc is not None:
            xmlDoc.freeDoc()
        if xpathCtx is not None:
            xpathCtx.xpathFreeContext()
    return sizes

def convertWmf2Pdf(d):
    """Convert every .wmf file found in directory d to EPS, then to PDF.

    For each "<name>.wmf" file, /usr/bin/wmf2eps is run and its standard
    output is stored in "<name>.eps"; the EPS file is then handed to
    eps2pdf().  When getBboxes() reports a size for the file, it is passed
    to wmf2eps as "--bbox=<width>x<depth>".

    The converter is started with an argument list and without a shell:
    the file names come from an unpacked archive, so they must never be
    interpreted by a shell (spaces, quotes, ";" ...).
    """
    bboxDict = getBboxes(d)
    for wmfFileName in os.listdir(d):
        if not wmfFileName.endswith(".wmf"):
            continue
        wmfPath = os.path.join(d, wmfFileName)
        epsPath = os.path.join(d, wmfFileName[:-4] + ".eps")

        args = ["/usr/bin/wmf2eps"]
        bbox = bboxDict.get(wmfFileName)
        if bbox is not None:
            args.append("--bbox=%sx%s" % (bbox[0], bbox[1]))
        args.append(wmfPath)

        # The EPS file is created before the converter starts, exactly as
        # the former shell redirection did.
        epsFile = open(epsPath, "wb")
        try:
            try:
                subprocess.call(args, stdout=epsFile, cwd=d)
            except OSError:
                # A shell would only have returned a non-zero status when
                # the converter cannot be started; keep going likewise.
                log.warn("unable to run /usr/bin/wmf2eps on %s", wmfPath,
                         exc_info=True)
        finally:
            epsFile.close()
        eps2pdf(epsPath)

def insertAnnex(xmlMainDoc, annexes, d):
    """Append the annexes to the first child node of xmlMainDoc.

    Arguments:
    xmlMainDoc -- libxml2 document receiving the annexes
    annexes -- sequence of annex definitions indexed as [0] file path,
               [1] title and, for PDF annexes, [2] scale in percent
    d -- working directory holding the unpacked "<index>_contents.xml"
         files of the .legi annexes

    A ".legi" annex is parsed, run through the annex preprocessing
    stylesheet and its first child is added to the main document.  A
    ".pdf" annex is represented by an <appendix type="pdf"> element
    holding a <title> and a <pdf-annex> element.  Any other file type is
    only reported through the logger.
    """
    rootNode = xmlMainDoc.children
    for index, annex in enumerate(annexes):
        annexPath = annex[0]
        if annexPath.endswith(".legi"):
            contentName = "%d_%s" % (index, NAME_CONTENT_LEGI)
            annexStream = open(os.path.join(d, contentName), "r")
            try:
                annexDoc = parseStream(annexStream, validate=0)
                params = {'annex-id':'%d_' % (index,), 'annex-title':'%s' % utf8encode(annex[1])}
                annexDoc = applyStylesheet2(annexDoc, xslAnnexPreprocessingFilterName, params)
                rootNode.addChild(annexDoc.children)
            finally:
                # Close the stream whether or not the annex could be merged.
                annexStream.close()
        elif annexPath.endswith(".pdf"):
            appendixNode = libxml2.newNode("appendix")
            appendixNode.newProp("type", "pdf")

            headingNode = libxml2.newNode("title")
            heading = annex[1]
            if heading is not None:
                heading = heading.encode("UTF-8")
            headingNode.addContent(heading)
            appendixNode.addChild(headingNode)

            fileNode = libxml2.newNode("pdf-annex")
            fileNode.newProp("pdf-file", os.path.basename(annexPath))
            percent = annex[2]
            # The scale is given in percent and stored as a ratio.
            fileNode.newProp("scale", "%.2f" % (percent/100.0))
            appendixNode.addChild(fileNode)

            rootNode.addChild(appendixNode)
        else:
            log.warn("format d'annexe non supporté %s" % annexPath)

def _selectXslFilters(legacyMode):
    """Point the module-level XSL file names at the legacy or current stylesheets."""
    global xslpath, xslPreprocessingFilterName, xslPostprocessingFilterName
    global xslMainprocessingFilterName, xslAnnexPreprocessingFilterName
    global xslCopyProcessingFilterName

    if legacyMode:
        xslDirName = "xsl-legacy"
    else:
        xslDirName = "xsl"
    xslpath = os.path.join(abspath, "..", "..", xslDirName)
    xslPreprocessingFilterName = os.path.join(xslpath, "pre_proc.xsl")
    xslPostprocessingFilterName = os.path.join(xslpath, "post_proc.xsl")
    xslMainprocessingFilterName = os.path.join(xslpath, "main.xsl")
    xslAnnexPreprocessingFilterName = os.path.join(xslpath, "annex_pre_proc.xsl")
    xslCopyProcessingFilterName = os.path.join(xslpath, "copy_proc.xsl")


def _postprocessLatex(doc):
    """Apply the textual fix-ups to the LaTeX source and return the result."""
    for before, after in [(u'&#x2019;', u"'"), (u'\u2019', u"'"),
                          (u'&#x201C;', u'\\guillemotleft'), (u'\u201C', u'«'),
                          (u'&#x201D;', u'\\guillemotright'), (u'\u201D', u'»'),
                          (u'&#x2011;', u'-'), (u'\u2011', u'-'),
                          (u'TABELLIO--', u'\\hyp{}'),]:
        doc = doc.replace(before, after)

    # look for end of parts and change the multicol environment not to have
    # balanced columns
    parts = doc.split('\n% end part')
    for i in range(1, len(parts)):
        # first \end{multicols} following the marker
        parts[i] = parts[i].replace(r'\end{multicols}', r'\end{multicols*}', 1)
        # last \begin{multicols} preceding the marker
        head, found, tail = parts[i-1].rpartition(r'\begin{multicols}')
        if found:
            parts[i-1] = head + r'\begin{multicols*}' + tail
    return '\n% adjusted end part'.join(parts)


def convertLegi2Tex(input, outputFileName, annexes, d, draft=0, toc=True,
        style="normal", useFont=None, legacyMode=False):
    """Convert the XML content of a .legi document into a LaTeX file.

    Arguments:
    input -- XML content of the document, as a string
    outputFileName -- name of the LaTeX file written inside d
    annexes -- annex definitions, handed over to insertAnnex()
    d -- working directory
    draft -- when true, the 'with-draft-tag' stylesheet parameter is set
    toc -- when true, the 'with-toc' stylesheet parameter is set
    style -- "parchment" sets the 'parchemin' stylesheet parameter, "bqr"
             sets 'bqr'; an "ooo-" prefix is stripped and forces
             legacyMode to False
    useFont -- overrides the 'latex.document.font' stylesheet parameter
    legacyMode -- use the stylesheets of the "xsl-legacy" directory

    The module-level XSL file names are rebound as a side effect, because
    insertAnnex() reads the annex stylesheet name from there.

    NOTE(review): the libxml2 documents created here are never freed.
    insertAnnex() moves nodes from other documents into the preprocessed
    one, so confirm that freeDoc() is safe before adding it.
    """
    if style.startswith('ooo-'):
        legacyMode = False
        style = style[4:]

    _selectXslFilters(legacyMode)

    xslParam = {}

    # The document class and font depend on the 'keyword' metadata.
    tree = ET.fromstring(input)
    for prop in tree.findall('metadata/property'):
        if prop.attrib.get('name') != 'keyword':
            continue
        # An empty <property/> element has text None.
        if 'PFB' in (prop.text or ''):
            xslParam['latex.document.font'] = 'helvet'
            xslParam['latex.documentclass'] = 'PFBstd'
        else:
            xslParam['latex.document.font'] = 'sabon'
            xslParam['latex.documentclass'] = 'PCFstd'

    if useFont:
        # override selected font
        xslParam['latex.document.font'] = useFont

    outputStream = None
    try:
        # preprocessing
        outputStream = open(os.path.join(d, outputFileName), "w")
        xml_doc = libxml2.parseDoc(input)
        xsl_style = libxslt.parseStylesheetFile(xslPreprocessingFilterName)
        try:
            preprocessed_xml_doc = xsl_style.applyStylesheet(xml_doc, {})
        finally:
            xsl_style.freeStylesheet()
        insertAnnex(preprocessed_xml_doc, annexes, d)

        ctxt = preprocessed_xml_doc.xpathNewContext()
        try:
            # some custom preprocessing of text content:
            #   marks -- as TABELLIO-- so substitution with the proper command
            #   (\hyp{}) can happen in the postprocessing phase.
            for node in ctxt.xpathEval('//text()'):
                if '--' in node.content:
                    node.setContent(node.content.replace('--', 'TABELLIO--'))
        finally:
            ctxt.xpathFreeContext()

        # processing
        if draft:
            xslParam['with-draft-tag'] = '1'

        if toc:
            xslParam['with-toc'] = '1'

        if style == "parchment":
            xslParam['parchemin'] = '1'

        if style == "bqr":
            xslParam['bqr'] = '1'

        for k, v in list(xslParam.items()):
            xslParam[k] = xmlutils.makeparam(v)

        xsl_style = libxslt.parseStylesheetFile(xslMainprocessingFilterName)
        try:
            processed_xml_doc = xsl_style.applyStylesheet(preprocessed_xml_doc, xslParam)
            xml_latex_doc = xsl_style.saveResultToString(processed_xml_doc)
        finally:
            xsl_style.freeStylesheet()

        # postprocessing
        doc = _postprocessLatex(ET.fromstring(xml_latex_doc).text)

        outputStream.write(doc.encode('utf-8'))
    finally:
        if outputStream is not None:
            outputStream.close()


def copy_extra_files(dest, extra_dir=None):
    """Copy every regular file of the extra directory into dest.

    Arguments:
    dest -- destination directory
    extra_dir -- directory to copy from; defaults to the "extra"
                 directory located two levels above this module

    Sub-directories of extra_dir are skipped.  Only file contents are
    copied, byte for byte; shutil.copyfile() also closes both files,
    which the former file(...).write(file(...).read()) never did
    explicitly.
    """
    if extra_dir is None:
        extra_dir = os.path.join(abspath, '..', '..', 'extra')
    for filename in os.listdir(extra_dir):
        src = os.path.join(extra_dir, filename)
        if not os.path.isfile(src):
            continue
        shutil.copyfile(src, os.path.join(dest, filename))


def convertTex2Pdf(d, latexFileName):
    """Compile latexFileName with pdftex inside directory d.

    The extra files are copied into d first, then the same pdftex command
    is run three times in a row (presumably so that the table of contents
    and cross references settle -- not stated in the code).  The exit
    status of pdftex is ignored.
    """
    cmd = "pdftex --fmt=pdflatex --interaction=nonstopmode %s" % (latexFileName)
    copy_extra_files(d)
    for _ in range(3):
        subprocess.call([cmd], cwd=d, shell=True)

def convertLegi2Pdf(inputFileName, pdfFileName, latexFileName, keepLatex=0,
                    debug=0, annexes=None, draft=0, toc=True, style="normal", useFont=None,
                    legacyMode=True, grayscale=True):
    """
    Generate a PDF document from a .legi document.

    Arguments:
    inputFileName -- name of the input .legi file (zip archive or plain XML)
    pdfFileName -- name of the output file (pdf)
    latexFileName -- name of the output file (LaTeX)
    keepLatex -- keep the LaTeX file
    debug -- keep the temporary files
    annexes -- annex definitions (defaults to no annex)
    draft -- add a draft stamp
    toc -- enable the table of contents
    style -- special style (normal, parchment, bqr)
    useFont -- document font, overrides the document class definition
    legacyMode -- use the legacy xsl conversion files
    grayscale -- convert .jpg and .png images to grayscale (needs PIL)

    Raises PdfGeneratorException when the input cannot be read, has an
    unsupported format, or when no PDF file was produced.
    """
    if annexes is None:
        annexes = []
    tempDir = None
    f = None
    try:
        tempDir = makeTempFileName(".legi2pdf")
        os.mkdir(tempDir)

        f = open(inputFileName)
        fmt = magic.fileFormat(f)

        copy_extra_files(tempDir)

        if fmt == "zip":
            allEntries = unzipToDirectory(inputFileName, tempDir)
            if allEntries is None:
                raise PdfGeneratorException('Unable to open or to read the legi file: %s' % (inputFileName))

            contentEntryInfo, contentEntryData = allEntries.get( NAME_CONTENT_LEGI, (None, None))
            if contentEntryInfo is None or contentEntryData is None:
                raise PdfGeneratorException('Corrupted legi file: No %s entry' % (NAME_CONTENT_LEGI))
            if grayscale and PIL:
                for filename in os.listdir(tempDir):
                    if not os.path.splitext(filename)[-1] in ('.jpg', '.png'):
                        continue
                    # image, convert it to grayscale
                    imagePath = os.path.join(tempDir, filename)
                    image = PIL.Image.open(imagePath)
                    grayscaled = image.convert('L')
                    grayscaled.save(imagePath)
        elif fmt == "xml":
            contentEntryData = f.read()
            _writeToFile(os.path.join(tempDir, NAME_CONTENT_LEGI), contentEntryData)
        else:
            raise PdfGeneratorException('Unsupported input format: %s (should be: xml or zip)' % (fmt))

        unzipAnnexToDir(annexes, tempDir)

        convertLegi2Tex(contentEntryData, "temp.tex", annexes, tempDir, draft, toc, style, useFont, legacyMode)
        convertWmf2Pdf(tempDir)
        convertTex2Pdf(tempDir, "temp.tex")
        if os.path.exists(pdfFileName):
            os.remove(pdfFileName)
        if not os.path.exists(os.path.join(tempDir, 'temp.pdf')):
            raise PdfGeneratorException('legi2pdf failed to create a pdf file')
        shutil.move(os.path.join(tempDir, "temp.pdf"), pdfFileName)
    finally:
        # Close the input first so that a failing cleanup cannot leak it.
        if f is not None:
            f.close()
        if keepLatex:
            if os.path.exists(latexFileName):
                os.remove(latexFileName)
            # tempDir is still None when no temporary name could be made.
            if tempDir is not None:
                texPath = os.path.join(tempDir, "temp.tex")
                if os.path.exists(texPath):
                    shutil.move(texPath, latexFileName)
        # The directory may not exist when os.mkdir() itself failed; purging
        # it then would hide the original error behind a new one.
        if tempDir is not None and not debug and os.path.isdir(tempDir):
            purgeDirectory(tempDir)