debian-python-odf/contrib/gutenberg/gbtext2odt.py

270 lines
9.5 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2007 Søren Roug, European Environment Agency
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from odf.opendocument import OpenDocumentText
from odf import style, text, dc, meta
import sys, getopt, time
def usage():
    """Write the command-line synopsis of this script to standard error.

    The program name shown in the synopsis is taken from sys.argv[0].
    """
    synopsis = """Usage: %s [-l language] [-e encoding] [-T] [-a author]
\t[-c creation_date] [-d description] [-n etext] [-p publisher] [-t title] inputfile\n"""
    sys.stderr.write(synopsis % sys.argv[0])
def _to_text(value, source_encoding):
    """Return value as text, decoding it first if it is a byte string.

    On Python 2 command-line arguments are byte strings and are decoded with
    source_encoding; on Python 3 they are already text and pass through.
    """
    if isinstance(value, bytes):
        return value.decode(source_encoding)
    return value


# Parse the command line; any unknown option prints the usage and exits.
try:
    opts, args = getopt.getopt(
        sys.argv[1:], "a:n:c:d:e:l:p:t:T",
        ["author=", "date=", "created=", "description=", "number=", "title=",
         "language=", "publisher=", "encoding="])
except getopt.GetoptError:
    usage()
    sys.exit(2)

# Defaults for everything that can be set from the command line.
language = None
description = None
encoding = 'cp1252'  # Codepage 1252 is a superset of ASCII and ISO-8859-1
argencoding = 'utf-8'  # encoding assumed for the command-line arguments
creator = ""
creationdate = None
title = ""
ebooknum = None
publisher = "Project Gutenberg"
copyrights = "http://www.gutenberg.org/license"
fn_is_title = False  # -T: use the title as the output file name

for o, a in opts:
    if o in ("-l", "--language"):
        # Reject values longer than three characters that have no hyphen in
        # the third or fourth position, and anything longer than six.
        if (len(a) > 3 and a[2] != '-' and a[3] != '-') or len(a) > 6:
            sys.stderr.write("""Language must be a two or three letter language code optionally
\tfollowed by a hyphen and a two-letter country code\n""")
            sys.exit(2)
        language = a
    elif o in ("-e", "--encoding"):
        encoding = a
    elif o in ("-a", "--author"):
        creator = _to_text(a, argencoding)
    elif o in ("-d", "--description"):
        description = a
    elif o in ("-c", "--date", "--created"):
        # Anything after the date part must be introduced by "T".
        if len(a) > 10 and a[10] != "T":
            sys.stderr.write("""Date must be in ISO8601 format (YYYY-MM-DDTHH:MM:SS)\n""")
            sys.exit(2)
        # Both separators must be hyphens, for the short and the long form.
        if len(a) < 10 or a[4] != "-" or a[7] != "-":
            sys.stderr.write("""Date must be in ISO8601 format (YYYY-MM-DD)\n""")
            sys.exit(2)
        creationdate = a
    elif o in ("-p", "--publisher"):
        publisher = a
    elif o in ("-n", "--number"):
        ebooknum = _to_text(a, argencoding)
    elif o in ("-t", "--title"):
        title = _to_text(a, argencoding)
    elif o == "-T":
        fn_is_title = True

# Exactly one input file is required.
if len(args) != 1:
    usage()
    sys.exit(2)
# Create the output document and keep a handle on its text body.
doc = OpenDocumentText()
textdoc = doc.text

# The author goes into both the initial-creator and the creator element.
if creator:
    for _author_element in (meta.InitialCreator, dc.Creator):
        doc.meta.addElement(_author_element(text=creator))

# Likewise the creation date is stored as creation-date and as dc:date.
if creationdate is not None:
    for _date_element in (meta.CreationDate, dc.Date):
        doc.meta.addElement(_date_element(text=creationdate))

if description is not None:
    doc.meta.addElement(dc.Description(text=description))

if title:
    doc.meta.addElement(dc.Title(text=title))

if language is not None:
    doc.meta.addElement(dc.Language(text=language))

# Publisher, rights and e-text number are written as user-defined meta
# fields (not as dc:publisher / dc:rights elements); unset values are skipped.
for _field_name, _field_value in (("Publisher", publisher),
                                  ("Rights", copyrights),
                                  ("EText", ebooknum)):
    if _field_value is not None:
        doc.meta.addElement(meta.UserDefined(name=_field_name, text=_field_value))
# Font declaration referenced by the title and subtitle styles.
arial = style.FontFace(
    name="Arial",
    fontfamily="Arial",
    fontfamilygeneric="swiss",
    fontpitch="variable",
)
doc.fontfacedecls.addElement(arial)

# --- Paragraph styles ---

# Base paragraph style without vertical margins.
standardstyle = style.Style(name="Standard", family="paragraph")
standardstyle.addElement(
    style.ParagraphProperties(marginbottom="0cm", margintop="0cm"))

# Level-1 heading.
h1style = style.Style(name="Heading 1", family="paragraph",
                      defaultoutlinelevel="1")
h1style.addElement(
    style.TextProperties(attributes={'fontsize': "20pt",
                                     'fontweight': "bold"}))

# Justified body text, derived from "Standard".
textbodystyle = style.Style(name="Text body", family="paragraph",
                            parentstylename=standardstyle)
textbodystyle.addElement(
    style.ParagraphProperties(attributes={'marginbottom': "0.212cm",
                                          'margintop': "0cm",
                                          'textalign': "justify",
                                          'justifysingleword': "false"}))

# Centered italic subtitle; the following paragraph is "Text body".
subtitlestyle = style.Style(name="Subtitle", family="paragraph",
                            nextstylename=textbodystyle)
subtitlestyle.addElement(style.ParagraphProperties(textalign="center"))
subtitlestyle.addElement(
    style.TextProperties(fontsize="14pt", fontstyle="italic",
                         fontname="Arial"))

# Centered bold title; the following paragraph is "Subtitle".
titlestyle = style.Style(name="Title", family="paragraph",
                         nextstylename=subtitlestyle)
titlestyle.addElement(style.ParagraphProperties(textalign="center"))
titlestyle.addElement(
    style.TextProperties(fontsize="18pt", fontweight="bold",
                         fontname="Arial"))

# --- Text styles ---

# Italic character style used for emphasised spans.
emphasisstyle = style.Style(name="Emphasis", family="text")
emphasisstyle.addElement(style.TextProperties(fontstyle="italic"))

# Register the named styles with the document, in definition order.
for _named_style in (standardstyle, h1style, textbodystyle,
                     subtitlestyle, titlestyle, emphasisstyle):
    doc.styles.addElement(_named_style)

# Automatic section style that gives the Gutenberg sections a grey background.
sectstyle = style.Style(name="Sect1", family="section")
sectstyle.addElement(style.SectionProperties(backgroundcolor="#e6e6e6"))
doc.automaticstyles.addElement(sectstyle)
# Lines with at least this many characters are taken by mainpart() to be in
# the middle of a paragraph.
FULLLINE = 55

# Cleaned lines of the paragraph currently being collected.
paragraph = []
def addparagraph(section):
    """Append the collected paragraph lines to section as one paragraph.

    The lines in the module-level ``paragraph`` list are joined with single
    spaces and added as a "Text body" paragraph. When the joined text splits
    on '_' into an odd number (more than one) of pieces, every second piece
    is emitted as an "Emphasis" span; otherwise the text is added unchanged,
    underscores included.
    """
    joined = ' '.join(paragraph)
    pieces = joined.split('_')
    para = text.P(stylename=textbodystyle)
    section.addElement(para)

    underscores_paired = len(pieces) > 1 and len(pieces) % 2 == 1
    if not underscores_paired:
        para.addText(joined)
        return

    # Odd-indexed pieces are the ones that sat between two underscores.
    for position, piece in enumerate(pieces):
        if len(piece) == 0:
            continue
        if position % 2 == 1:
            para.addElement(text.Span(stylename=emphasisstyle, text=piece))
        else:
            para.addText(piece)
def cleantext(s):
    """Return s with typewriter punctuation replaced by typographic forms.

    - A straight double quote at the start / end of the string, or next to a
      space, becomes a left / right curly double quote.
    - The apostrophe in common English contractions and genitives ('m, 's,
      't, 'S, 'T) and after a French elided l or d becomes a right single
      quotation mark.
    - A double hyphen becomes an em dash, unless the string contains three
      or more hyphens in a row (a drawn line), in which case hyphens are
      left alone.

    The replacement characters are written as escapes so that they cannot be
    lost or confused with their ASCII look-alikes. An empty string is
    returned unchanged.
    """
    if not s:
        # Nothing to convert; also avoids indexing an empty string below.
        return s

    if s[0] == '"' or s[-1] == '"':
        ls = list(s)
        if ls[0] == '"':
            ls[0] = u'\u201c'   # left double quotation mark
        if ls[-1] == '"':
            ls[-1] = u'\u201d'  # right double quotation mark
        s = ''.join(ls)

    s = s.replace('" ', u'\u201d ')  # closing quote before a space
    s = s.replace(' "', u' \u201c')  # opening quote after a space
    s = s.replace("'m", u"\u2019m")  # I'm
    s = s.replace("'s", u"\u2019s")  # genitive case
    s = s.replace("'t", u"\u2019t")  # don't, doesn't, haven't
    s = s.replace("'S", u"\u2019S")  # genitive case
    s = s.replace("'T", u"\u2019T")  # DON'T, etc
    s = s.replace("l'", u"l\u2019")  # French
    s = s.replace("d'", u"d\u2019")  # French
    if s.find('---') < 0:  # Don't replace double dash for lines
        s = s.replace('--', u'\u2014')  # em dash
    return s
def pretext(section, line, linelen):
    """Append line to section as a single "Standard"-style paragraph.

    linelen is not used.
    """
    verbatim = text.P(stylename=standardstyle, text=line)
    section.addElement(verbatim)
def posttext(section, line, linelen):
    """Append line to section as a single "Standard"-style paragraph.

    linelen is not used.
    """
    verbatim = text.P(stylename=standardstyle, text=line)
    section.addElement(verbatim)
def mainpart(section, line, linelen):
    """Process one line of the main text and add any finished element to section.

    section -- element that receives headings and paragraphs
    line    -- the input line, already stripped and decoded
    linelen -- length of line

    Lines are classified as follows, in this order:
    - an all-upper-case line containing at least one cased character, seen
      while no paragraph is being collected, becomes a level-1 heading;
    - a line of at least FULLLINE characters is collected into the current
      paragraph;
    - an empty line ends the current paragraph;
    - a shorter line while a paragraph is being collected is its tail;
    - any other short line becomes a paragraph of its own: title style if it
      equals the title (optionally followed by " by " and the author),
      subtitle style if it is "by" or the author, text body otherwise.

    Collected lines live in the module-level ``paragraph`` list, which this
    function rebinds when a paragraph ends.
    """
    global paragraph

    is_headline = (linelen > 0 and len(paragraph) == 0
                   and line.upper() == line
                   and line.upper() != line.lower())

    if is_headline:
        # Headlines are always upper case
        section.addElement(
            text.H(outlinelevel=1, stylename=h1style, text=cleantext(line)))
    elif linelen >= FULLLINE:
        # In the middle of a paragraph
        paragraph.append(cleantext(line))
    elif linelen == 0:
        # End of paragraph
        if len(paragraph) > 0:
            addparagraph(section)
        paragraph = []
    elif len(paragraph) > 0:
        # Short tail of paragraph
        paragraph.append(cleantext(line))
    elif line == title or line == title + " by " + creator:
        # From here on the paragraph list is necessarily empty (a non-empty
        # one was consumed by the branch above), so nothing needs flushing.
        section.addElement(text.P(stylename=titlestyle, text=cleantext(line)))
    elif line == "by" or line == creator:
        section.addElement(text.P(stylename=subtitlestyle, text=cleantext(line)))
    else:
        # Stand-alone short line
        section.addElement(text.P(stylename=textbodystyle, text=cleantext(line)))
# States of the reader: before the start marker, inside the book text,
# after the end marker.
PRETEXT = 1
MAINPART = 2
POSTTEXT = 3

textpart = PRETEXT

# Start in the preamble
section = text.Section(stylename=sectstyle, name="preamble")  # , display="none")
textdoc.addElement(section)

filename = args[0]

# Output name: the title when -T was given and a title is known, otherwise
# the input name cut at its last dot, otherwise a fixed fallback.
if fn_is_title and title:
    outfn = title
else:
    suffixi = filename.rfind(".")
    if suffixi > 1:
        outfn = filename[:suffixi]
    else:
        outfn = "interimname"

# Read bytes and decode each line explicitly so that the script behaves the
# same on Python 2 and 3; the with-block closes the file when done.
with open(filename, "rb") as f:
    for rawline in f:
        line = rawline.strip().decode(encoding)
        linelen = len(line)

        if line.startswith("*** END OF TH"):
            # Do not lose a paragraph that was still being collected when
            # the end marker arrived without a preceding blank line.
            if paragraph:
                addparagraph(textdoc)
                paragraph = []
            textpart = POSTTEXT
            section = text.Section(stylename=sectstyle, name="license")  # , display="none")
            textdoc.addElement(section)

        if textpart == PRETEXT:
            pretext(section, line, linelen)
            # The marker line itself still belongs to the preamble.
            if line.startswith(("*** START OF TH",
                                "*END THE SMALL PRINT!",
                                "*END*THE SMALL PRINT!")):
                textpart = MAINPART
        elif textpart == MAINPART:
            # The book text goes directly into the document body.
            section = textdoc
            mainpart(section, line, linelen)
        else:
            posttext(section, line, linelen)

# Flush a paragraph left over when the input ends inside the main part.
if paragraph:
    addparagraph(textdoc)
    paragraph = []

# The second argument asks save() to append the file suffix to outfn.
doc.save(outfn, True)