debian-python-odf/contrib/html2odt/shtml2odt.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2008-2009 Søren Roug, European Environment Agency
#
# This is free software.  You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
#
# Contributor(s):
#
#
import string, sys, re, getopt
import urllib2, htmlentitydefs, urlparse
from urllib import quote_plus
from HTMLParser import HTMLParser
from cgi import escape,parse_header
from types import StringType

from odf.opendocument import OpenDocumentText, load
from odf import dc, text, table
from odf.style import Style, ParagraphProperties, TextProperties
import htmlstyles


def converturl(url, document=None):
    """Fetch *url* and convert the HTML it returns into ODF content.

    url      -- address of the page; surrounding whitespace is ignored.
    document -- OpenDocument object the content is appended to.  When
                omitted, a new empty text document is created.

    Returns the HTML2ODTParser that consumed the page; the populated
    document is available as its ``doc`` attribute.

    Errors raised by urllib2 while fetching propagate to the caller.
    """
    url = url.strip()
    if document is None:
        # The parser dereferences document.text immediately, so the
        # documented default of None needs a real document behind it.
        document = OpenDocumentText()

    # An explicit empty mapping keeps urllib2 from using proxy settings
    # detected from the environment.
    proxy_support = urllib2.ProxyHandler({})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    req = urllib2.Request(url)
    req.add_header("User-agent", "HTML2ODT: Convert HTML to OpenDocument")
    conn = urllib2.urlopen(req)
    if not conn:
        raise IOError("Failure in open")
    try:
        data = conn.read()
        headers = conn.info()
    finally:
        # Release the socket even when reading the body fails.
        conn.close()

    encoding = 'iso8859-1'  # Standard HTML
    if 'content-type' in headers:
        (ct, parms) = parse_header(headers['content-type'])
        if 'charset' in parms:
            encoding = parms['charset']

    # Hand the parser text rather than raw bytes so that non-ASCII
    # characters reach the document as unicode.
    try:
        data = data.decode(encoding, 'replace')
    except LookupError:
        # The server announced a charset Python does not know.
        encoding = 'iso8859-1'
        data = data.decode(encoding, 'replace')

    mhp = HTML2ODTParser(document, encoding, url)
    mhp.feed(data)
    return mhp

# Patterns used by HTML2ODTParser.handle_attr to pick entities out of
# attribute values.
# Named entity: '&', the name (captured), then one terminating character.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference: '&#', decimal or x-prefixed hex digits, then one
# terminating character.
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
# Start of something that might be an entity or character reference.
incomplete = re.compile(r'&[a-zA-Z#]')
# A bare ampersand.
ampersand = re.compile(r'&')

def listget(list, key, default=None):
    """Look up *key* in a sequence of (key, value, ...) entries.

    When several entries carry the key, the value of the last one is
    returned; when none does, *default* is returned.
    """
    # The parameter name shadows the builtin but is part of the signature.
    hits = [entry[1] for entry in list if entry[0] == key]
    if hits:
        return hits[-1]
    return default

class TagObject:
    """Record of one start tag pushed on the parser's tag stack.

    tag        -- element name
    attrs      -- attribute list as delivered to handle_starttag
    output_loc -- value of last_data_pos() when the tag was opened
    """

    def __init__(self, tag, attrs, output_loc):
        self.tag, self.attrs, self.output_loc = tag, attrs, output_loc

class HTML2ODTParser(HTMLParser):

    def __init__(self, document, encoding, baseurl):
        """Prepare a parser that appends converted HTML to *document*.

        document -- OpenDocument object; content is added below its
                    ``text`` element and missing base styles are created.
        encoding -- character encoding assumed for byte strings.
        baseurl  -- URL the page was fetched from; its directory part and
                    its host part are kept for resolving links.
        """
        HTMLParser.__init__(self)
        self.doc = document
        self.curr = self.doc.text

        # Make sure the paragraph styles the handlers refer to exist.
        if self.doc.getStyleByName("Standard") is None:
            style = Style(name="Standard", family="paragraph", attributes={'class':"text"})
            self.doc.styles.addElement(style)

        if self.doc.getStyleByName("Text_20_body") is None:
            style = Style(name="Text_20_body", displayname="Text body", family="paragraph",
               parentstylename="Standard", attributes={'class':"text"})
            p = ParagraphProperties(margintop="0cm", marginbottom="0.212cm")
            style.addElement(p)
            self.doc.styles.addElement(style)

        if self.doc.getStyleByName("Heading") is None:
            style = Style(name="Heading", family="paragraph", parentstylename="Standard",
                nextstylename="Text_20_body", attributes={'class':"text"})
            p = ParagraphProperties(margintop="0.423cm", marginbottom="0.212cm", keepwithnext="always")
            style.addElement(p)
            p = TextProperties(fontname="Nimbus Sans L", fontsize="14pt",
                 fontnameasian="DejaVu LGC Sans", fontsizeasian="14pt",
                 fontnamecomplex="DejaVu LGC Sans", fontsizecomplex="14pt")
            style.addElement(p)
            self.doc.styles.addElement(style)

        self.encoding = encoding

        # urlsplit yields (scheme, netloc, path, query, fragment); only the
        # first three are kept.  The last path segment is dropped so that
        # baseurl names the directory of the page.
        (scheme, host, path, query, fragment) = urlparse.urlsplit(baseurl)
        lastslash = path.rfind('/')
        if lastslash > -1:
            path = path[:lastslash]
        self.baseurl = urlparse.urlunsplit((scheme, host, path,'',''))
        self.basehost = urlparse.urlunsplit((scheme, host, '','',''))

        self.sectnum = 0          # counter for generated section names
        self.tagstack = []        # TagObject for every open element
        self.pstack = []          # saved (processelem, processcont) pairs
        self.processelem = True   # False while inside an ignored element
        self.processcont = True   # False while character data is ignored
        self.__data = []          # buffer filled by write_odt()

        # tag name -> (start handler, end handler); None means "nothing to do".
        self.elements = {
     'a':    ( self.s_html_a, self.close_tag),
     'base': ( self.output_base, None),
     'b':    ( self.s_html_fontstyle, self.close_tag),
     'big':  ( self.s_html_fontstyle, self.close_tag),
     'br':   ( self.output_br, None),
     'col':  ( self.s_html_col, None),
     'dd':   ( self.s_html_dd, self.close_tag),
     # s_html_dt moves self.curr into a new paragraph, so the end tag has
     # to step back out of it just like 'dd' does.
     'dt':   ( self.s_html_dt, self.close_tag),
     'div':  ( self.s_html_section, self.e_html_section),
     'em':   ( self.s_html_emphasis, self.close_tag),
     'h1':   ( self.s_html_headline, self.close_tag),
     'h2':   ( self.s_html_headline, self.close_tag),
     'h3':   ( self.s_html_headline, self.close_tag),
     'h4':   ( self.s_html_headline, self.close_tag),
     'h5':   ( self.s_html_headline, self.close_tag),
     'h6':   ( self.s_html_headline, self.close_tag),
     'head': ( self.s_ignorexml, None),
     'i':    ( self.s_html_fontstyle, self.close_tag),
     'img':  ( self.output_img, None),
     'li':   ( self.s_html_li, self.e_html_li),
     'meta': ( self.meta_encoding, None),
     'ol':   ( self.output_ol, self.e_html_list),
     'p':    ( self.s_html_block, self.e_html_block),
     's':    ( self.s_html_fontstyle, self.close_tag),
     'small':( self.s_html_fontstyle, self.close_tag),
     'span': ( self.s_html_span, self.close_tag),
     'strike':( self.s_html_fontstyle, self.close_tag),
     'strong':( self.s_html_emphasis, self.close_tag),
     'table':( self.s_html_table, self.e_html_table),
     'td':   ( self.s_html_td, self.close_tag),
     'th':   ( self.s_html_td, self.close_tag),
     'title':( self.s_html_title, self.e_html_title),
     'tr':   ( self.s_html_tr, self.close_tag),
     'tt':   ( self.s_html_fontstyle, self.close_tag),
     'u':    ( self.s_html_fontstyle, self.close_tag),
     'ul':   ( self.output_ul, self.e_html_list),
     'var':  ( self.s_html_emphasis, self.close_tag),
    }

    def result(self):
        """ Return a string
            String must be in UNICODE
        """
        str = string.join(self.__data,'')
        self.__data = []
        return str

    def meta_name(self, attrs):
        """ Look in meta tag for textual info"""
        foundit = 0
        # Is there a name attribute?
        for attr in attrs:
            if attr[0] == 'name' and string.lower(attr[1]) in ('description',
            'keywords','title',
            'dc.description','dc.keywords','dc.title'
            ):
                foundit = 1
        if foundit == 0:
            return 0

        # Is there a content attribute?
        content = self.find_attr(attrs,'content')
        if content:
            self.handle_data(u' ')
            self.handle_attr(content)
            self.handle_data(u' ')
        return 1

    def meta_encoding(self, tag, attrs):
        """ Look in meta tag for page encoding (Content-Type)"""
        foundit = 0
        # Is there a content-type attribute?
        for attr in attrs:
            if attr[0] == 'http-equiv' and string.lower(attr[1]) == 'content-type':
                foundit = 1
        if foundit == 0:
            return 0

        # Is there a content attribute?
        for attr in attrs:
            if attr[0] == 'content':
                (ct, parms) = parse_header(attr[1])
                if parms.has_key('charset'):
                    self.encoding = parms['charset']
        return 1

    def s_ignorexml(self, tag, attrs):
        self.processelem = False

    def output_base(self, tag, attrs):
        """Change the document base if the <base> tag carries an href.

        baseurl becomes the href with its last path segment removed and
        basehost its scheme and host part.  A tag without a usable href
        (for example one that only sets ``target``) leaves both untouched.
        """
        baseurl = listget(attrs, 'href', None)
        if not baseurl:
            # Re-deriving from the current baseurl would strip one more
            # path segment from it every time.
            return
        (scheme, host, path, query, fragment) = urlparse.urlsplit(baseurl)
        lastslash = path.rfind('/')
        if lastslash > -1:
            path = path[:lastslash]
        self.baseurl = urlparse.urlunsplit((scheme, host, path,'',''))
        self.basehost = urlparse.urlunsplit((scheme, host, '','',''))

    def output_br(self, tag, attrs):
        """Add a line break element at the current output position."""
        linebreak = text.LineBreak()
        self.curr.addElement(linebreak)

    def s_html_font(self, tag, attrs):
        """ 15.2.1 Font style elements: the TT, I, B, BIG, SMALL,
            STRIKE, S, and U elements

            Placeholder: builds an empty lookup table and does nothing
            else.
        """
        tagdict = {}

    def s_html_emphasis(self, tag, attrs):
        """ 9.2.1 Phrase elements: EM, STRONG, DFN, CODE, SAMP, KBD,
            VAR, CITE, ABBR, and ACRONYM

            Opens a text span carrying the character style that belongs
            to *tag* and makes it the current output position.  Tags
            without an entry of their own are rendered like EM.  The
            style is added to the document's styles when it is missing.
        """
        italic = {'fontstyle':"italic", 'fontstyleasian':"italic", 'fontstylecomplex':"italic"}
        # tag -> (style name, text properties of that style)
        tagdict = {
           'cite':   ('Citation', italic),
           'code':   ('Source_20_Text', {'fontname':"Courier", 'fontnameasian':"Courier", 'fontnamecomplex':"Courier"}),
           'dfn':    ('Definition', {}),
           'em':     ('Emphasis', italic),
           'strong': ('Strong_20_Emphasis', {'fontweight':"bold", 'fontweightasian':"bold", 'fontweightcomplex':"bold"}),
           'var':    ('Variable', italic),
           }
        stylename, styledecl = tagdict.get(tag, tagdict['em'])

        # Add the style we need to the stylesheet
        if self.doc.getStyleByName(stylename) is None:
            # "_20_" stands for a space in the style name; the display
            # name shows the readable form.
            style = Style(name=stylename, displayname=stylename.replace("_20_", " "),
                 family="text")
            if styledecl:
                # Copy so that styles never share one attribute mapping.
                style.addElement(TextProperties(attributes=dict(styledecl)))
            self.doc.styles.addElement(style)

        e = text.Span(stylename=stylename)
        self.curr.addElement(e)
        self.curr = e

    def s_html_fontstyle(self, tag, attrs):
        """ 15.2.1 Font style elements: the TT, I, B, BIG, SMALL,
            STRIKE, S, and U elements
            ('tt' is not considered an automatic style by OOo)

            Opens a span for *tag*.  Known tags get an automatic text
            style, created on first use; unknown tags get a plain span.
        """
        # tag -> [automatic style name, text properties]
        tagdict = {
           'b':      ['BoldX',{'fontweight':"bold",
                      'fontweightasian':"bold",'fontweightcomplex':"bold" }],
           'big':    ['BigX', {'fontsize':"120%"}],
           'i':      ['ItalicX', {'fontstyle':"italic", 'fontstyleasian':"italic", 'fontstylecomplex':"italic" }],
           'tt':     ['TeletypeX', {'fontname':"Courier", 'fontnameasian':"Courier", 'fontnamecomplex':"Courier" }],
           's':      ['StrikeX', {'textlinethroughstyle':"solid"}],
           'small':  ['SmallX', {'fontsize':"80%"}],
           'strike': ['StrikeX', {'textlinethroughstyle':"solid"}],
           'u':      ['UnderlineX', {'textunderlinestyle':"solid", 'textunderlinewidth':"auto",
                      'textunderlinecolor':"fontcolor"}],
        }
        entry = tagdict.get(tag)
        if entry is None:
            span = text.Span()
        else:
            stylename, styledecl = entry
            if self.doc.getStyleByName(stylename) is None:
                autostyle = Style(name=stylename, family="text")
                autostyle.addElement(TextProperties(attributes=styledecl))
                self.doc.automaticstyles.addElement(autostyle)
            span = text.Span(stylename=stylename)
        self.curr.addElement(span)
        self.curr = span


    def s_html_span(self, tag, attrs):
        """Open an unstyled text span and make it the current element."""
        span = text.Span()
        self.curr.addElement(span)
        self.curr = span

    def s_html_title(self, tag, attrs):
        """Start collecting the page title into a dc:title meta element."""
        e = dc.Title()
        self.doc.meta.addElement(e)
        # The title element hangs off the document's meta element, not off
        # the current output position, so remember where to resume.
        self._title_resume = self.curr
        self.curr = e

    def e_html_title(self, tag):
        """Resume output where it was before the title started."""
        resume = getattr(self, '_title_resume', None)
        if resume is None:
            # No start handler ran; fall back to stepping up one level.
            resume = self.curr.parentNode
        self._title_resume = None
        self.curr = resume

    def output_img(self, tag, attrs):
        """Write image markup to the write_odt buffer.

        The src and alt attributes are looked up but not used yet; the
        markup always refers to the fixed picture name 00000.png.
        """
        src = listget(attrs, 'src', "Illegal IMG tag!")
        alt = listget(attrs, 'alt', src)
        # Must remember name of image and download it.
        markup = u'<draw:image xlink:href="Pictures/%s" xlink:type="simple" xlink:show="embed" xlink:actuate="onLoad"/>' % '00000.png'
        self.write_odt(markup)

    def s_html_a(self, tag, attrs):
        """Open a hyperlink element for an <a> tag.

        The href is adjusted as follows:
          "#"                  -> the document base URL
          contains "://"       -> used unchanged
          starts with "/"      -> prefixed with the base host
          anything else        -> used unchanged
        A tag without href (or with an empty one) produces a link element
        without a target.
        """
        href = listget(attrs, 'href', None)
        if href:
            if href == "#":
                # Assignment, not comparison: point at the document itself.
                href = self.baseurl
            elif href.find("://") >= 0:
                pass
            elif href[0] == '/':
                href = self.basehost + href
            e = text.A(type="simple", href=href)
        else:
            e = text.A()
        self.curr.addElement(e)
        self.curr = e

    def close_tag(self, tag):
        self.curr = self.curr.parentNode

    def s_html_dd(self, tag, attrs):
        """Open a "List Contents" paragraph for a <dd> element.

        The paragraph style is added to the document on first use.
        """
        if self.doc.getStyleByName("List_20_Contents") is None:
            contents = Style(name="List_20_Contents", displayname="List Contents",
                 family="paragraph", parentstylename="Standard",
                 attributes={'class':"html"})
            contents.addElement(ParagraphProperties(marginleft="1cm",
                 marginright="0cm", textindent="0cm", autotextindent="false"))
            self.doc.styles.addElement(contents)
        para = text.P(stylename="List_20_Contents")
        self.curr.addElement(para)
        self.curr = para

    def s_html_dt(self, tag, attrs):
        """Open a "List Heading" paragraph for a <dt> element.

        The paragraph style is added to the document on first use.
        """
        if self.doc.getStyleByName("List_20_Heading") is None:
            heading = Style(name="List_20_Heading", displayname="List Heading",
                 family="paragraph", parentstylename="Standard",
                 nextstylename="List_20_Contents", attributes={'class':"html"})
            heading.addElement(ParagraphProperties(marginleft="0cm",
                 marginright="0cm", textindent="0cm", autotextindent="false"))
            self.doc.styles.addElement(heading)
        para = text.P(stylename="List_20_Heading")
        self.curr.addElement(para)
        self.curr = para

    def output_ul(self, tag, attrs):
        self.write_odt(u'<text:list text:style-name="List_20_1">')

    def output_ol(self, tag, attrs):
        self.write_odt(u'<text:list text:style-name="Numbering_20_1">')

    def e_html_list(self, tag):
        self.write_odt(u'</text:list>')

    def s_html_li(self, tag, attrs):
        self.write_odt(u'<text:list-item><text:p text:style-name="P1">')

    def e_html_li(self, tag):
        self.write_odt(u'</text:p></text:list-item>')

    def s_html_headline(self, tag, attrs):
        """Open a heading element for <h1>..<h6>.

        The digit of the tag name selects both the "Heading_20_<n>" style
        and the outline level.  The styles for levels 1 to 3 are added to
        the document when missing; other levels only refer to their style
        by name.
        """
        level = tag[1]
        stylename = "Heading_20_%s" % level

        # style name -> (display name, default outline level, text properties)
        known = {
            "Heading_20_1": ("Heading 1", 1, dict(
                 fontsize="115%", fontweight="bold", fontsizeasian="115%",
                 fontweightasian="bold", fontsizecomplex="115%", fontweightcomplex="bold")),
            "Heading_20_2": ("Heading 2", 2, dict(
                 fontsize="14pt", fontstyle="italic", fontweight="bold",
                 fontsizeasian="14pt", fontstyleasian="italic", fontweightasian="bold",
                 fontsizecomplex="14pt", fontstylecomplex="italic", fontweightcomplex="bold")),
            "Heading_20_3": ("Heading 3", 3, dict(
                 fontsize="14pt", fontweight="bold", fontsizeasian="14pt",
                 fontweightasian="bold", fontsizecomplex="14pt", fontweightcomplex="bold")),
        }
        if stylename in known and self.doc.getStyleByName(stylename) is None:
            displayname, outline, props = known[stylename]
            headstyle = Style(name=stylename, displayname=displayname,
                 family="paragraph", parentstylename="Heading", nextstylename="Text_20_body",
                 attributes={'class':"text"}, defaultoutlinelevel=outline)
            headstyle.addElement(TextProperties(**props))
            self.doc.styles.addElement(headstyle)

        heading = text.H(stylename=stylename, outlinelevel=level)
        self.curr.addElement(heading)
        self.curr = heading

    def s_html_table(self, tag, attrs):
        """Open a table element and make it the current element."""
        tbl = table.Table()
        self.curr.addElement(tbl)
        self.curr = tbl

    def e_html_table(self, tag):
        self.curr = self.curr.parentNode

    def s_html_td(self, tag, attrs):
        """Open a table cell (used for both <td> and <th>) and make it the
        current element.
        """
        # NOTE(review): character data inside the cell is added to the
        # cell element itself, without a paragraph in between -- confirm
        # the ODF element accepts that.
        cell = table.TableCell()
        self.curr.addElement(cell)
        self.curr = cell

    def s_html_tr(self, tag, attrs):
        """Open a table row and make it the current element."""
        row = table.TableRow()
        self.curr.addElement(row)
        self.curr = row

    def s_html_col(self, tag, attrs):
        """Add a table column element; the current element stays as is."""
        column = table.TableColumn()
        self.curr.addElement(column)

    def s_html_section(self, tag, attrs):
        """Open a text section for a <div>.

        The section is named after the element's id attribute; without
        one, a running "Sect<n>" name is generated.
        """
        name = self.find_attr(attrs, 'id')
        if name is None:
            self.sectnum += 1
            name = "Sect%d" % self.sectnum
        section = text.Section(name=name)
        self.curr.addElement(section)
        self.curr = section

    def e_html_section(self, tag):
        """ Outputs block tag such as <p> and <div> """
        self.curr = self.curr.parentNode

    def s_html_block(self, tag, attrs):
        """Open a "Text body" paragraph and make it the current element."""
        para = text.P(stylename="Text_20_body")
        self.curr.addElement(para)
        self.curr = para

    def e_html_block(self, tag):
        """ Outputs block tag such as <p> and <div> """
        self.curr = self.curr.parentNode
#
# HANDLE STARTTAG
#
    def handle_starttag(self, tag, attrs):
        """Record the opened element and run its start handler.

        The current processing flags and a TagObject are pushed first so
        that handle_endtag can restore them; the handler registered in
        self.elements runs only while element processing is enabled.
        """
        saved_flags = (self.processelem, self.processcont)
        self.pstack.append(saved_flags)
        self.tagstack.append(TagObject(tag, attrs, self.last_data_pos()))

        handlers = self.elements.get(tag, (None, None))
        on_start = handlers[0]
        if self.processelem and on_start:
            on_start(tag, attrs)
#
# HANDLE END
#
    def handle_endtag(self, tag):
        """
        """
        tagobj = self.tagstack.pop()
        method = self.elements.get(tag, (None, None))[1]
        if self.processelem and method:
            method(tag)
        self.processelem, self.processcont = self.pstack.pop()


#
# Data operations
#
    def handle_data(self, data):
        if data.strip() == '': return
        if self.processelem and self.processcont:
            self.curr.addText(data)

    def write_odt(self, data):
        """ Collect the data to show on the webpage """
        if type(data) == StringType:
            data = unicode(data, self.encoding)
        self.__data.append(data)

    def last_data_pos(self):
        return len(self.__data)

    def find_attr(self, attrs, key):
        """ Run through the attributes to find a specific one.
            Returns the value of the first attribute named *key*,
            or None if there is none.
        """
        found = None
        for pair in attrs:
            if pair[0] == key:
                found = pair[1]
                break
        return found

#
# Tagstack operations
#
    def find_tag(self, tag):
        """ Run down the stack to find the last entry with the same tag name
            Not Tested
        """
        for tagitem in range(len(self.tagstack), 0, -1):
            if tagitem.tag == tag:
                return tagitem
        return None

    def handle_charref(self, name):
        """ Handle character reference for UNICODE
        """
        if name[0] in ('x', 'X'):
            try:
                n = int(name[1:],16)
            except ValueError:
                return
        else:
            try:
                n = int(name)
            except ValueError:
                return
        if not 0 <= n <= 65535:
            return
        self.handle_data(unichr(n))

    def handle_entityref(self, name):
        """Handle entity references.
        """
        table = htmlentitydefs.name2codepoint
        if name in table:
            self.handle_data(unichr(table[name]))
        else:
            return

    def handle_attr(self, attrval):
        """ Scan attribute values for entities and resolve them
            Simply calls handle_data

            Plain text runs go to handle_data unchanged, numeric
            references to handle_charref and named entities to
            handle_entityref.  Scanning stops at an incomplete reference;
            whatever is left at that point is passed to handle_data as is.
        """
        i = 0               # index of the next unprocessed character
        n = len(attrval)
        while i < n:
            match = ampersand.search(attrval, i) # next '&' at or after i
            if match:
                j = match.start()
            else:
                j = n
            # Everything before the ampersand is literal text.
            if i < j: self.handle_data(attrval[i:j])
            i = j
            if i == n: break
            startswith = attrval.startswith
            if startswith('&#', i):
                match = charref.match(attrval, i)
                if match:
                    # Strip the leading '&#' and the one terminating
                    # character that the pattern includes.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # A terminator other than ';' belongs to the text that
                    # follows, so step back to leave it unconsumed.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = k
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(attrval, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same terminator handling as for numeric references.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = k
                    continue
                match = incomplete.match(attrval, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if match.group() == attrval[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = i + 1
                else:
                    break
            else:
                # Unreachable as long as the search above only stops on '&'.
                assert 0, "interesting.search() lied"
        # end while
        # Flush whatever was left unprocessed when the loop stopped early.
        if i < n:
            self.handle_data(attrval[i:n])
            i = n

def usage():
    """Print the command line synopsis to standard error."""
    synopsis = "Usage: %s [-a] inputurl outputfile\n" % sys.argv[0]
    sys.stderr.write(synopsis)

if __name__ == "__main__":
    # Command line: [-a|--append] inputurl outputfile
    try:
        opts, args = getopt.getopt(sys.argv[1:], "a", ["append"])

    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Both the input URL and the output file are required.
    if len(args) < 2:
        usage()
        sys.exit(2)

    appendto = False
    for o, a in opts:
        if o in ("-a", "--append"):
            appendto = True

    if appendto:
        # Append to the existing output document.
        doc = load(args[1])
    else:
        doc = OpenDocumentText()

    result = converturl(args[0], doc)
    # Parenthesised single-argument form prints the same text whether
    # print is a statement or a function.
    print(result.doc.xml())
    result.doc.save(args[1])