debian-python-odf/contrib/odf2epub/odf2epub

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2010 Søren Roug, European Environment Agency
#
# This is free software.  You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
#
# Contributor(s):
#
from odf.odf2xhtml import ODF2XHTML
from odf.namespaces import TEXTNS, XLINKNS
from odf.opendocument import load
import sys, getopt, time, zipfile
from StringIO import StringIO
from cgi import escape

# Mode of a regular file with permissions -rw-r--r--, shifted into the upper
# 16 bits of the value assigned to ZipInfo.external_attr for every archive
# member. Written with the 0o prefix and without the long suffix so the
# literal is accepted by Python 2.6+ as well as Python 3.
UNIXPERMS = 0o100644 << 16  # -rw-r--r--

def escaped(string):
    """ Return string as XML-safe, pure ASCII bytes.

        The characters &, < and > are replaced by their entities and every
        non-ASCII character becomes a numeric character reference.

        Byte strings (for instance a file name taken from sys.argv under
        Python 2) are decoded as UTF-8 first, replacing undecodable bytes.
        Without that step the implicit ASCII decode performed by encode()
        raises UnicodeDecodeError on the first non-ASCII byte.
    """
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'replace')
    return escape(string).encode('us-ascii', 'xmlcharrefreplace')

def usage():
    """ Print a one-line synopsis of the command line on standard error """
    program = sys.argv[0]
    synopsis = "Usage: %s [-c cover-image] [-o output-file] [-p] inputfile\n" % program
    sys.stderr.write(synopsis)

class NavpointEntry:
    """ One heading that is to be listed in the navigation map (toc.ncx).

        anchor  -- anchor name generated for the heading
        title   -- text of the heading
        chapter -- number of the chapter file the heading ends up in
        level   -- outline level of the heading
    """
    def __init__(self, anchor, title, chapter, level):
        (self.anchor, self.title,
         self.chapter, self.level) = (anchor, title, chapter, level)

class ODF2EPUB(ODF2XHTML):
    """ ODF to XHTML converter that splits its output into several XHTML
        documents, starting a new one at every level 1 heading, and that
        records each heading so a navigation map can be written afterwards.

        Once a document has been loaded:
        chapters      -- list with the complete XHTML text of each chapter
        navpoint_list -- list with one NavpointEntry per heading
        headerpart    -- the output lines up to and including <body>, which
                         are put in front of every chapter
    """

    in_toc = False
    cur_html_name = 'chapter0.xhtml'

    def __init__(self, generate_css=True, embedable=False):
        """ Both arguments are handed unchanged to ODF2XHTML.__init__() """
        # These lists are created per instance. As class attributes they
        # were shared, so a second converter kept appending to the chapters
        # and navigation points collected by the first one.
        self.navpoint_list = []
        self.chapters = []
        self.headerpart = []
        ODF2XHTML.__init__(self, generate_css, embedable)
        self.elements[(TEXTNS, "table-of-content")] = (self.s_text_table_of_content, self.e_text_table_of_content)

    def s_text_table_of_content(self, tag, attrs):
        """ Remember that we are inside the document's table of contents """
        self.in_toc = True

    def e_text_table_of_content(self, tag, attrs):
        """ The document's table of contents has ended """
        self.in_toc = False

    # Disabled: building the navigation points from the links found in the
    # document's own table of contents. The headings are recorded in
    # e_text_h() instead. NOTE: NavpointEntry takes four arguments, so the
    # call below would have to be updated before this could be re-enabled.
#   def s_text_a(self, tag, attrs):
#       if self.in_toc:
#           href = attrs[(XLINKNS,"href")].split("|")[0]
#           if href[0] == "#":
#               n = NavpointEntry(self.get_anchor(href[1:]), href[1:])
#               self.navpoint_list.append(n)
#       return ODF2XHTML.s_text_a(self, tag, attrs)

#   def e_text_a(self, tag, attrs):
#       pass

    def s_text_h(self, tag, attrs):
        """ Handle a heading
            If the heading is a level 1 heading, then split the HTML file.
            We have to be careful, because the heading can be inside a frame or a table.
        """
        level = int(attrs[(TEXTNS,'outline-level')])
        if level == 1:
            # Work on a copy: closetag() is called while we walk the stack.
            tags_to_keep = self.htmlstack[:]
            # Close everything that is open, innermost element first.
            for htag, hattrs, hblock in reversed(tags_to_keep):
                if htag == 'body':
                    # Footnotes go at the end of the chapter they belong to
                    self.generate_footnotes()
                    self._resetfootnotes()
                self.closetag(htag)
            # I have to do this rather ugly, as the saved header part doesn't
            # go through the self.opentag() method
            self.chapters.append(''.join(self.headerpart + self.lines))
            self.lines = []
            self.htmlstack = tags_to_keep[:2] # Only <html> and <body>
            # Reopen the elements the heading was nested in (frame, table...)
            for htag, hattrs, hblock in tags_to_keep[2:]:
                self.opentag(htag, hattrs, hblock)
        return ODF2XHTML.s_text_h(self, tag, attrs)

    def e_text_h(self, tag, attrs):
        """ Headings end
            Record a NavpointEntry holding the heading's anchor, its text,
            the number of the chapter being written and its level.
        """
        level = int(attrs[(TEXTNS,'outline-level')])
        if level > 6: level = 6 # Heading levels go only to 6 in XHTML
        if level < 1: level = 1
        lev = self.headinglevels[1:level+1]
        outline = '.'.join(map(str, lev))
        heading = ''.join(self.data)
        anchor = self.get_anchor("%s.%s" % (outline, heading))
        # len(self.chapters) is the number of chapters already completed,
        # which is the number of the chapter file this heading goes into.
        entry = NavpointEntry(anchor, heading, len(self.chapters), level)
        self.navpoint_list.append(entry)
        return ODF2XHTML.e_text_h(self, tag, attrs)

    def s_office_text(self, tag, attrs):
        """ Save all the lines up to and including the <body> tag
            so I can split the file into more files
        """
        ODF2XHTML.s_office_text(self, tag, attrs)
        self.headerpart = self.lines
        self.lines = []

    def e_office_document_content(self, tag, attrs):
        """ End of the document: store what is left as the last chapter """
        ODF2XHTML.e_office_document_content(self, tag, attrs)
        self.chapters.append(''.join(self.headerpart + self.lines))
        self.lines = []

class EPublication:
    """ An EPUB publication generated from one OpenDocument text file.

        The document is loaded and converted when the object is created;
        save() writes the resulting EPUB archive.
    """

    mimetype = "application/epub+zip"
    coverimage = None

    coverhtml = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>Cover</title>
    <style type="text/css"> img { max-width: 100%; } </style>
  </head>
  <body>
    <div id="cover-image">
      <img src="Pictures/cover.jpg" alt="Cover image"/>
    </div>
  </body>
</html>"""

    container = """<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    <rootfiles>
        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
    </rootfiles>
</container>"""

    content_opf_head = """<?xml version="1.0"?>
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
      <dc:title>%s</dc:title>
      <dc:language>%s</dc:language>
      <dc:identifier id="BookID" opf:scheme="URI">%s</dc:identifier>
      <dc:creator>%s</dc:creator>
      %s
    </metadata>
    <manifest>
        <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
        <item id="styles-css" href="styles.css" media-type="text/css"/>"""

    toc_ncx_head = """<?xml version="1.0"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
   "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">

<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
    <head>
        <meta name="dtb:uid" content="%s"/>
        <meta name="dtb:depth" content="2"/>
        <meta name="dtb:totalPageCount" content="0"/>
        <meta name="dtb:maxPageNumber" content="0"/>
    </head>
    <docTitle>
        <text>%s</text>
    </docTitle>
    <navMap>
        <navPoint id="navPoint-1" playOrder="1">
            <navLabel>
                <text>Start</text>
            </navLabel>
            <content src="chapter0.xhtml"/>
        </navPoint>"""
    toc_ncx_foot = """    </navMap>
</ncx>"""

    def __init__(self, filename, coverimage):
        """ Load and convert the OpenDocument file.

            filename   -- the OpenDocument file; it is also used as the
                          identifier of the book in content.opf and toc.ncx
            coverimage -- file name of a cover image, or None. The image is
                          stored as Pictures/cover.jpg and declared as
                          image/jpeg whatever its real format is.
        """
        # Kept so _zipwrite() does not depend on the script's global 'args'
        self.filename = filename
        self.doc = load(filename)
        self.coverimage = coverimage
        self.odhandler = ODF2EPUB(True, False)
        self.odhandler.add_style_file("styles.css")
        self.odhandler.load(self.doc)

    def save(self, outputfile):
        """ Save the document under the filename
            If the filename is '-' the archive is written to standard output.
        """
        if outputfile == '-':
            # ZipFile wants a target it can seek in, which standard output
            # is not when it is a pipe. Build the archive in memory first.
            buf = StringIO()
            self._write_archive(buf)
            sys.stdout.write(buf.getvalue())
        else:
            self._write_archive(outputfile)

    def _write_archive(self, target):
        """ Write the EPUB to target, a file name or a seekable file object """
        outputfp = zipfile.ZipFile(target, "w")
        try:
            self._zipwrite(outputfp)
        finally:
            # Close the archive even when writing a member failed
            outputfp.close()

    def _add_member(self, outputfp, arcname, data, now, compress_type=zipfile.ZIP_DEFLATED):
        """ Add data to the archive as arcname, with -rw-r--r-- permissions """
        zinfo = zipfile.ZipInfo(arcname, now)
        zinfo.compress_type = compress_type
        zinfo.external_attr = UNIXPERMS
        outputfp.writestr(zinfo, data)

    def _content_opf(self):
        """ Return the text of content.opf: metadata, manifest, spine, guide """
        handler = self.odhandler
        chapter_numbers = range(len(handler.chapters))
        if self.coverimage:
            covermeta = """<meta name="cover" content="cover-image"/>"""
        else:
            covermeta = ""
        opf = [self.content_opf_head % (escaped(handler.title), escaped(handler.language),
                escaped(self.filename), escaped(handler.creator), covermeta)]
        if self.coverimage:
            opf.append("""        <item id="cover-page"       href="cover.xhtml"    media-type="application/xhtml+xml"/>""")
            opf.append("""        <item id="cover-image" href="Pictures/cover.jpg" media-type="image/jpeg"/>""")
        for chapter in chapter_numbers:
            opf.append("""        <item id="chapter%d.xhtml" href="chapter%d.xhtml" media-type="application/xhtml+xml"/>""" % (chapter, chapter))
        # Write manifest of images.
        for arcname, picturerec in self.doc.Pictures.items():
            what_it_is, fileobj, mediatype = picturerec
            opf.append("""        <item id="%s" href="%s" media-type="%s"/>""" % (arcname.replace('/','_'), arcname, mediatype))
        opf.append("""</manifest>""")
        opf.append("""<spine toc="ncx">""")
        if self.coverimage:
            opf.append("""        <itemref idref="cover-page" linear="no"/>""")
        for chapter in chapter_numbers:
            opf.append("""        <itemref idref="chapter%d.xhtml"/>""" % chapter)
        opf.append("""</spine>""")

        if self.coverimage:
            opf.append("""<guide>""")
            opf.append("""    <reference type="cover" title="Cover" href="cover.xhtml"/>""")
            opf.append("""</guide>""")

        opf.append('</package>')
        return '\n'.join(opf)

    def _toc_ncx(self):
        """ Return the text of toc.ncx, the navigation map of the headings.
            As a side effect the level of each recorded heading is limited
            to 1 or 2.
        """
        close_navpoint = """        </navPoint>"""
        navpoints = self.odhandler.navpoint_list
        ncx = [self.toc_ncx_head % (escaped(self.filename), escaped(self.odhandler.title))]
        # We basically force the first navpoint to be a level 1 heading, no
        # matter what it was in reality. Then all other headings are either level 1 or 2.
        # Numbering starts at 2 because navPoint-1 is the fixed "Start"
        # entry that is part of toc_ncx_head.
        np_level = 1
        for np_inx, np in enumerate(navpoints, 2):
            first = (np_inx == 2)
            if first:
                np.level = 1
            elif np.level > 2:
                np.level = 2
            if not first:
                # Close the previous navPoint unless this one nests in it
                if np.level <= np_level:
                    ncx.append(close_navpoint)
                # Going from level 2 back to 1 also closes the parent
                if np.level < np_level:
                    ncx.append(close_navpoint)
            ncx.append("""        <navPoint id="navPoint-%d" playOrder="%d"> <!-- L%d -->
            <navLabel>
                <text>%s</text>
            </navLabel>
            <content src="chapter%d.xhtml#%s"/>
        """ % (np_inx, np_inx, np.level, escaped(np.title), np.chapter, np.anchor))
            np_level = np.level
        # Only close what was opened: a document without headings has no
        # navPoint left open, and a stray end tag would make the file
        # malformed.
        if navpoints:
            ncx.append(close_navpoint)
            if np_level > 1:
                ncx.append(close_navpoint)
        ncx.append(self.toc_ncx_foot)
        return '\n'.join(ncx)

    def _zipwrite(self, outputfp):
        """ Write the document to an open file pointer
            outputfp is a zipfile.ZipFile opened for writing.
        """
        now = time.localtime()[:6]
        handler = self.odhandler
        # The result is not used. The call is kept in case it has side
        # effects in ODF2XHTML -- TODO confirm and remove.
        handler.xhtml()

        # Write mimetype - uncompressed
        self._add_member(outputfp, 'mimetype', self.mimetype, now, zipfile.ZIP_STORED)

        # Write META-INF/container.xml
        self._add_member(outputfp, 'META-INF/container.xml', self.container, now)

        # Write CSS part
        self._add_member(outputfp, 'OEBPS/styles.css', handler.css(), now)

        # Write HTML parts
        for chapter, text in enumerate(handler.chapters):
            xhtml = text.encode('us-ascii','xmlcharrefreplace')
            self._add_member(outputfp, 'OEBPS/chapter%d.xhtml' % chapter, xhtml, now)

        # Copy images over to output
        for arcname, picturerec in self.doc.Pictures.items():
            what_it_is, fileobj, mediatype = picturerec
            self._add_member(outputfp, "OEBPS/" + str(arcname), fileobj, now, zipfile.ZIP_STORED)

        # Write content.opf
        self._add_member(outputfp, 'OEBPS/content.opf', self._content_opf(), now)

        # Write toc.ncx
        self._add_member(outputfp, 'OEBPS/toc.ncx', self._toc_ncx(), now)

        # Write cover image
        if self.coverimage:
            # Use the image given to this object, not the script's global
            outputfp.write(self.coverimage, 'OEBPS/Pictures/cover.jpg', zipfile.ZIP_STORED)
            self._add_member(outputfp, 'OEBPS/cover.xhtml', self.coverhtml, now)

#        zout = zipfile.ZipInfo('OEBPS/styles.css', now)
#        zout.compress_type = zipfile.ZIP_DEFLATED
#        zout.external_attr = UNIXPERMS
#        css = self.odhandler.css().encode('us-ascii','xmlcharrefreplace')
#        outputfp.writestr(zout, css)

# Command line handling. 'args' and 'coverimage' are deliberately left as
# module-level names.
try:
    # The trailing '=' marks the long options that take an argument; without
    # it "--cover file" and "--output file" are not parsed as intended.
    opts, args = getopt.getopt(sys.argv[1:], "c:po:", ["cover=", "plain", "output="])
except getopt.GetoptError:
    usage()
    sys.exit(2)

# NOTE: generatecss and embedable are set here but are not passed on to
# EPublication below, so -p/--plain currently has no effect on the output.
generatecss = True
embedable = False
outputfilename = "-"
coverimage = None

for o, a in opts:
    if o in ("-c", "--cover"):
        coverimage = a
    elif o in ("-p", "--plain"):
        generatecss = False
    elif o in ("-o", "--output"):
        outputfilename = a

# Exactly one input file is required
if len(args) != 1:
    usage()
    sys.exit(2)

try:
    epub = EPublication(args[0], coverimage)
    epub.save(outputfilename)
except Exception:
    # 'except Exception' lets KeyboardInterrupt and SystemExit through.
    # Report the input file itself; sys.argv[1] may well be an option.
    sys.stderr.write("Unable to open file %s or file is not OpenDocument\n" % args[0])
    sys.exit(1)
sys.exit(0)