496 lines
16 KiB
Python
496 lines
16 KiB
Python
#!/usr/bin/python
|
|
# -*- coding: utf-8 -*-
|
|
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
|
#
|
|
# This is free software. You may redistribute it under the terms
|
|
# of the Apache license and the GNU General Public License Version
|
|
# 2 or at your option any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public
|
|
# License along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
#
|
|
# Contributor(s):
|
|
#
|
|
#
|
|
import string, sys, re
|
|
import urllib2, htmlentitydefs, urlparse
|
|
from urllib import quote_plus
|
|
from HTMLParser import HTMLParser
|
|
from cgi import escape,parse_header
|
|
from types import StringType
|
|
import emptycontent
|
|
|
|
def checkurl(url, http_proxy=None):
|
|
""" grab and convert url
|
|
"""
|
|
url = string.strip(url)
|
|
# if url.lower()[:5] != "http:":
|
|
# raise IOError, "Only http is accepted"
|
|
|
|
if http_proxy:
|
|
_proxies = { 'http': http_proxy }
|
|
else:
|
|
_proxies = {}
|
|
proxy_support = urllib2.ProxyHandler(_proxies)
|
|
opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
|
|
|
|
urllib2.install_opener(opener)
|
|
|
|
req = urllib2.Request(url)
|
|
req.add_header("User-agent", "HTML2ODT: Convert HTML to OpenDocument")
|
|
conn = urllib2.urlopen(req)
|
|
|
|
if not conn:
|
|
raise IOError, "Failure in open"
|
|
data = conn.read()
|
|
headers = conn.info()
|
|
conn.close()
|
|
|
|
encoding = 'iso8859-1' #Standard HTML
|
|
if headers.has_key('content-type'):
|
|
(ct, parms) = parse_header(headers['content-type'])
|
|
if parms.has_key('charset'):
|
|
encoding = parms['charset']
|
|
|
|
mhp = HTML2ODTParser(encoding, url)
|
|
failure = ""
|
|
mhp.feed(data)
|
|
text = mhp.result() # Flush the buffer
|
|
return text
|
|
|
|
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
|
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
|
incomplete = re.compile('&[a-zA-Z#]')
|
|
ampersand = re.compile('&')
|
|
|
|
def listget(list, key, default=None):
|
|
for l in list:
|
|
if l[0] == key:
|
|
default = l[1]
|
|
return default
|
|
|
|
class TagObject:
|
|
|
|
def __init__(self, tag, attrs, output_loc):
|
|
self.tag = tag
|
|
self.attrs = attrs
|
|
self.output_loc = output_loc
|
|
|
|
class HTML2ODTParser(HTMLParser):
|
|
|
|
def __init__(self, encoding, baseurl):
|
|
HTMLParser.__init__(self)
|
|
self.encoding = encoding
|
|
(scheme, host, path, params, fragment) = urlparse.urlsplit(baseurl)
|
|
lastslash = path.rfind('/')
|
|
if lastslash > -1:
|
|
path = path[:lastslash]
|
|
self.baseurl = urlparse.urlunsplit((scheme, host, path,'',''))
|
|
self.basehost = urlparse.urlunsplit((scheme, host, '','',''))
|
|
self.sectnum = 0
|
|
self.tagstack = []
|
|
self.pstack = []
|
|
self.processelem = True
|
|
self.processcont = True
|
|
self.__data = []
|
|
self.elements = {
|
|
'a': (self.s_html_a, self.e_html_a),
|
|
'base': ( self.output_base, None),
|
|
'br': ( self.output_br, None),
|
|
'caption': ( self.output_caption, None),
|
|
'col': ( self.s_html_col, None),
|
|
'dd': ( self.s_html_dd, None),
|
|
'dt': ( self.s_html_dt, None),
|
|
'div': ( self.s_html_section, self.e_html_section),
|
|
'em': ( self.s_html_emphasis, self.e_html_emphasis),
|
|
'h1': ( self.s_html_headline, self.e_html_headline),
|
|
'h2': ( self.s_html_headline, self.e_html_headline),
|
|
'h3': ( self.s_html_headline, self.e_html_headline),
|
|
'h4': ( self.s_html_headline, self.e_html_headline),
|
|
'h5': ( self.s_html_headline, self.e_html_headline),
|
|
'h6': ( self.s_html_headline, self.e_html_headline),
|
|
'head': ( self.s_ignorexml, None),
|
|
'img': ( self.output_img, None),
|
|
'li': ( self.s_html_li, self.e_html_li),
|
|
'meta': ( self.meta_encoding, None),
|
|
'ol': ( self.output_ol, self.e_html_list),
|
|
'p': ( self.s_html_block, self.e_html_block),
|
|
'span': ( self.s_html_span, self.e_html_span),
|
|
'strong':( self.s_html_emphasis, self.e_html_emphasis),
|
|
'table':( self.s_html_table, self.e_html_table),
|
|
'td': ( self.s_html_td, self.e_html_td),
|
|
'th': ( self.s_html_td, self.e_html_td),
|
|
'title':( self.s_html_title, self.e_html_title),
|
|
'tr': ( self.s_html_tr, self.e_html_tr),
|
|
'ul': ( self.output_ul, self.e_html_list),
|
|
'input':( self.output_input, None),
|
|
'select':( self.output_select, None),
|
|
'textarea':( self.output_textarea, None),
|
|
}
|
|
|
|
|
|
def result(self):
|
|
""" Return a string
|
|
String must be in UNICODE
|
|
"""
|
|
str = string.join(self.__data,'')
|
|
self.__data = []
|
|
return str
|
|
|
|
def meta_name(self, attrs):
|
|
""" Look in meta tag for textual info"""
|
|
foundit = 0
|
|
# Is there a name attribute?
|
|
for attr in attrs:
|
|
if attr[0] == 'name' and string.lower(attr[1]) in ('description',
|
|
'keywords','title',
|
|
'dc.description','dc.keywords','dc.title'
|
|
):
|
|
foundit = 1
|
|
if foundit == 0:
|
|
return 0
|
|
|
|
# Is there a content attribute?
|
|
content = self.find_attr(attrs,'content')
|
|
if content:
|
|
self.handle_data(u' ')
|
|
self.handle_attr(content)
|
|
self.handle_data(u' ')
|
|
return 1
|
|
|
|
def meta_encoding(self, tag, attrs):
|
|
""" Look in meta tag for page encoding (Content-Type)"""
|
|
foundit = 0
|
|
# Is there a content-type attribute?
|
|
for attr in attrs:
|
|
if attr[0] == 'http-equiv' and string.lower(attr[1]) == 'content-type':
|
|
foundit = 1
|
|
if foundit == 0:
|
|
return 0
|
|
|
|
# Is there a content attribute?
|
|
for attr in attrs:
|
|
if attr[0] == 'content':
|
|
(ct, parms) = parse_header(attr[1])
|
|
if parms.has_key('charset'):
|
|
self.encoding = parms['charset']
|
|
return 1
|
|
|
|
def s_ignorexml(self, tag, attrs):
|
|
self.processelem = False
|
|
|
|
def output_base(self, tag, attrs):
|
|
""" Change the document base if there is a base tag """
|
|
baseurl = listget(attrs, 'href', self.baseurl)
|
|
(scheme, host, path, params, fragment) = urlparse.urlsplit(baseurl)
|
|
lastslash = path.rfind('/')
|
|
if lastslash > -1:
|
|
path = path[:lastslash]
|
|
self.baseurl = urlparse.urlunsplit((scheme, host, path,'',''))
|
|
self.basehost = urlparse.urlunsplit((scheme, host, '','',''))
|
|
|
|
def output_br(self, tag, attrs):
|
|
self.write_odt(u'<text:line-break/>')
|
|
|
|
def s_html_emphasis(self, tag, attrs):
|
|
self.write_odt(u'<%s>' % tag)
|
|
|
|
def e_html_emphasis(self, tag):
|
|
self.write_odt(u'</%s>' % tag)
|
|
|
|
def s_html_span(self, tag, attrs):
|
|
self.write_odt(u'<text:span>')
|
|
|
|
def e_html_span(self, tag):
|
|
self.write_odt(u'</text:span>')
|
|
|
|
def s_html_title(self, tag, attrs):
|
|
# Put in meta.xml
|
|
self.write_odt(u'<dc:title>')
|
|
|
|
def e_html_title(self, tag):
|
|
# Put in meta.xml
|
|
self.write_odt(u'</dc:title>')
|
|
|
|
def output_img(self, tag, attrs):
|
|
src = listget(attrs, 'src', "Illegal IMG tag!")
|
|
alt = listget(attrs, 'alt', src)
|
|
# Must remember name of image and download it.
|
|
self.write_odt(u'<draw:image xlink:href="Pictures/%s" xlink:type="simple" xlink:show="embed" xlink:actuate="onLoad"/>' % '00000.png')
|
|
|
|
def s_html_a(self, tag, attrs):
|
|
href = None
|
|
href = listget(attrs, 'href', None)
|
|
if href:
|
|
if href in ("", "#"):
|
|
href == self.baseurl
|
|
elif href.find("://") >= 0:
|
|
pass
|
|
elif href[0] == '/':
|
|
href = self.basehost + href
|
|
self.write_odt(u' <text:a xlink:type="simple" xlink:href="%s">' % escape(href))
|
|
else:
|
|
self.write_odt(u' <text:a>')
|
|
|
|
def e_html_a(self, tag):
|
|
self.write_odt(u'</text:a>')
|
|
|
|
def s_html_dd(self, tag, attrs):
|
|
self.write_odt(u'<text:p text:style-name="List_20_Contents">')
|
|
|
|
def s_html_dt(self, tag, attrs):
|
|
self.write_odt(u'<text:p text:style-name="List_20_Heading">')
|
|
|
|
def output_ul(self, tag, attrs):
|
|
self.write_odt(u'<text:list text:style-name="List_20_1">')
|
|
|
|
def output_ol(self, tag, attrs):
|
|
self.write_odt(u'<text:list text:style-name="Numbering_20_1">')
|
|
|
|
def e_html_list(self, tag):
|
|
self.write_odt(u'</text:list>')
|
|
|
|
def s_html_li(self, tag, attrs):
|
|
self.write_odt(u'<text:list-item><text:p text:style-name="P1">')
|
|
|
|
def e_html_li(self, tag):
|
|
self.write_odt(u'</text:p></text:list-item>')
|
|
|
|
def output_select(self, tag, attrs):
|
|
return
|
|
self.write_odt(u'<br/>Combo box:')
|
|
|
|
def output_textarea(self, tag, attrs):
|
|
return
|
|
self.write_odt(u'<form:textarea>')
|
|
|
|
def output_input(self, tag, attrs):
|
|
return
|
|
type = listget(attrs, 'type', "text")
|
|
value = listget(attrs, 'value', "")
|
|
if type == "text":
|
|
self.write_odt(u'<br/>Edit:')
|
|
elif type == "submit":
|
|
self.write_odt(u' %s' % value)
|
|
elif type == "checkbox":
|
|
#FIXME - Only works in XHTML
|
|
checked = listget(attrs, 'checked', "not checked")
|
|
self.write_odt(u'<br/>Checkbox:' % checked)
|
|
elif type == "radio":
|
|
checked = listget(attrs, 'checked', "not checked")
|
|
self.write_odt(u'<br/>Radio button:' % checked)
|
|
elif type == "file":
|
|
self.write_odt(u'File upload edit %s' % value)
|
|
self.write_odt(u'<br/>Browse button:')
|
|
|
|
def s_html_headline(self, tag, attrs):
|
|
self.write_odt(u'<text:h text:style-name="Heading_20_%s" text:outline-level="%s">' % (tag[1],tag[1]))
|
|
|
|
def e_html_headline(self, tag):
|
|
self.write_odt(u'</text:h>')
|
|
|
|
def s_html_table(self, tag, attrs):
|
|
self.write_odt(u'<table:table>')
|
|
|
|
def e_html_table(self, tag):
|
|
self.write_odt(u'</table:table>')
|
|
|
|
def s_html_td(self, tag, attrs):
|
|
self.write_odt(u'<table:table-cell>')
|
|
|
|
def e_html_td(self, tag):
|
|
self.write_odt(u'</table:table-cell>')
|
|
|
|
def s_html_tr(self, tag, attrs):
|
|
self.write_odt(u'<table:table-row>')
|
|
|
|
def e_html_tr(self, tag):
|
|
self.write_odt(u'</table:table-row>')
|
|
|
|
def s_html_col(self, tag, attrs):
|
|
self.write_odt(u'<table:table-column/>')
|
|
|
|
def output_caption(self, tag, attrs):
|
|
self.write_odt(u'Caption: ')
|
|
|
|
def s_html_section(self, tag, attrs):
|
|
""" Outputs block tag such as <p> and <div> """
|
|
name = self.find_attr(attrs,'id')
|
|
if name is None:
|
|
self.sectnum = self.sectnum + 1
|
|
name = "Sect%d" % self.sectnum
|
|
self.write_odt(u'<text:section text:name="%s">' % name)
|
|
|
|
def e_html_section(self, tag):
|
|
""" Outputs block tag such as <p> and <div> """
|
|
self.write_odt(u'</text:section>')
|
|
|
|
def s_html_block(self, tag, attrs):
|
|
""" Outputs block tag such as <p> and <div> """
|
|
self.write_odt(u'<text:p text:style-name="Text_20_body">')
|
|
|
|
def e_html_block(self, tag):
|
|
""" Outputs block tag such as <p> and <div> """
|
|
self.write_odt(u'</text:p>')
|
|
#
|
|
# HANDLE STARTTAG
|
|
#
|
|
def handle_starttag(self, tag, attrs):
|
|
self.pstack.append( (self.processelem, self.processcont) )
|
|
tagobj = TagObject(tag, attrs, self.last_data_pos())
|
|
self.tagstack.append(tagobj)
|
|
|
|
method = self.elements.get(tag, (None, None))[0]
|
|
if self.processelem and method:
|
|
method(tag, attrs)
|
|
#
|
|
# HANDLE END
|
|
#
|
|
def handle_endtag(self, tag):
|
|
"""
|
|
"""
|
|
tagobj = self.tagstack.pop()
|
|
method = self.elements.get(tag, (None, None))[1]
|
|
if self.processelem and method:
|
|
method(tag)
|
|
self.processelem, self.processcont = self.pstack.pop()
|
|
|
|
|
|
#
|
|
# Data operations
|
|
#
|
|
def handle_data(self, data):
|
|
if self.processelem and self.processcont:
|
|
self.write_odt(escape(data))
|
|
|
|
def write_odt(self, data):
|
|
""" Collect the data to show on the webpage """
|
|
if type(data) == StringType:
|
|
data = unicode(data, self.encoding)
|
|
self.__data.append(data)
|
|
|
|
def last_data_pos(self):
|
|
return len(self.__data)
|
|
|
|
def find_attr(self, attrs, key):
|
|
""" Run through the attibutes to find a specific one
|
|
return None if not found
|
|
"""
|
|
for attr in attrs:
|
|
if attr[0] == key:
|
|
return attr[1]
|
|
return None
|
|
|
|
#
|
|
# Tagstack operations
|
|
#
|
|
def find_tag(self, tag):
|
|
""" Run down the stack to find the last entry with the same tag name
|
|
Not Tested
|
|
"""
|
|
for tagitem in range(len(self.tagstack), 0, -1):
|
|
if tagitem.tag == tag:
|
|
return tagitem
|
|
return None
|
|
|
|
def handle_charref(self, name):
|
|
""" Handle character reference for UNICODE
|
|
"""
|
|
if name[0] in ('x', 'X'):
|
|
try:
|
|
n = int(name[1:],16)
|
|
except ValueError:
|
|
return
|
|
else:
|
|
try:
|
|
n = int(name)
|
|
except ValueError:
|
|
return
|
|
if not 0 <= n <= 65535:
|
|
return
|
|
self.handle_data(unichr(n))
|
|
|
|
def handle_entityref(self, name):
|
|
"""Handle entity references.
|
|
"""
|
|
table = htmlentitydefs.name2codepoint
|
|
if name in table:
|
|
self.handle_data(unichr(table[name]))
|
|
else:
|
|
return
|
|
|
|
def handle_attr(self, attrval):
|
|
""" Scan attribute values for entities and resolve them
|
|
Simply calls handle_data
|
|
"""
|
|
i = 0
|
|
n = len(attrval)
|
|
while i < n:
|
|
match = ampersand.search(attrval, i) #
|
|
if match:
|
|
j = match.start()
|
|
else:
|
|
j = n
|
|
if i < j: self.handle_data(attrval[i:j])
|
|
i = j
|
|
if i == n: break
|
|
startswith = attrval.startswith
|
|
if startswith('&#', i):
|
|
match = charref.match(attrval, i)
|
|
if match:
|
|
name = match.group()[2:-1]
|
|
self.handle_charref(name)
|
|
k = match.end()
|
|
if not startswith(';', k-1):
|
|
k = k - 1
|
|
i = k
|
|
continue
|
|
else:
|
|
break
|
|
elif startswith('&', i):
|
|
match = entityref.match(attrval, i)
|
|
if match:
|
|
name = match.group(1)
|
|
self.handle_entityref(name)
|
|
k = match.end()
|
|
if not startswith(';', k-1):
|
|
k = k - 1
|
|
i = k
|
|
continue
|
|
match = incomplete.match(attrval, i)
|
|
if match:
|
|
# match.group() will contain at least 2 chars
|
|
if match.group() == attrval[i:]:
|
|
self.error("EOF in middle of entity or char ref")
|
|
# incomplete
|
|
break
|
|
elif (i + 1) < n:
|
|
# not the end of the buffer, and can't be confused
|
|
# with some other construct
|
|
self.handle_data("&")
|
|
i = i + 1
|
|
else:
|
|
break
|
|
else:
|
|
assert 0, "interesting.search() lied"
|
|
# end while
|
|
if i < n:
|
|
self.handle_data(attrval[i:n])
|
|
i = n
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
result = checkurl(sys.argv[1])
|
|
sys.stdout.write('\n'.join(emptycontent.chead))
|
|
sys.stdout.write(result.encode('utf-8'))
|
|
sys.stdout.write('\n'.join(emptycontent.cfoot))
|
|
sys.stdout.write('\n')
|
|
|
|
|