#! /usr/bin/env python
# -*- coding: UTF-8 -*-

# TabellioOOo - OpenDocument to .legi converter
# Copyright (C) 2007-2010 Parlement de la Communauté française de Belgique
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
from cStringIO import StringIO
|
|
from optparse import OptionParser
|
|
try:
|
|
import xml.etree.ElementTree as ET
|
|
except ImportError:
|
|
import elementtree.ElementTree as ET
|
|
|
|
import zipfile
|
|
import sys
|
|
import os
|
|
|
|
debug = False  # set to True to get verbose diagnostics on stderr

# XML namespaces of the OpenDocument Format; the OASIS ones share a common
# URN pattern and only differ by one component.
_OASIS_URN = 'urn:oasis:names:tc:opendocument:xmlns:%s:1.0'

META_NS = _OASIS_URN % 'meta'
DC_NS = 'http://purl.org/dc/elements/1.1/'
OFFICE_NS = _OASIS_URN % 'office'
TEXT_NS = _OASIS_URN % 'text'
TABLE_NS = _OASIS_URN % 'table'
STYLE_NS = _OASIS_URN % 'style'
FO_NS = _OASIS_URN % 'xsl-fo-compatible'
DRAW_NS = _OASIS_URN % 'drawing'
XLINK_NS = 'http://www.w3.org/1999/xlink'
SVG_NS = _OASIS_URN % 'svg-compatible'
|
|
|
|
# Mapping from the name of a title style, as found in the document, to the
# .legi element used to represent that level of title.
TITLE_LEVELS = {
    # document structure
    'Partie': 'part',
    'Heading_20_1': 'part',
    'Chap': 'chapter',
    'Heading_20_2': 'chapter',
    'Sec 1': 'sect1',
    'Sec_20_1': 'sect1',
    'Heading_20_3': 'sect1',
    'Sec 1.1': 'sect2',
    'Sec_20_1.1': 'sect2',
    'Heading_20_4': 'sect2',
    'Sec 1.1.1': 'sect3',
    'Sec_20_1.1.1': 'sect3',
    'Heading_20_5': 'sect3',
    # legistic structure
    'Lchapitre': 'legistic_chapter',
    'Lsection': 'legistic_section',
    'LSous-Section': 'legistic_subsection',
    'Lpart': 'legistic_part',
    'Llivre': 'legistic_book',
    'LLivre': 'legistic_book',
    'Ltitre': 'legistic_title',
}
|
|
# Paragraph styles that actually mark a title, with the .legi element each
# of them maps to.
PARAGRAPH_TITLE_STYLES = {
    'TitrePreface': 'preface',
    'TitreSynthese': 'synthese',
    # legistic structure
    'Lchapitre': 'legistic_chapter',
    'Lpart': 'legistic_part',
    'LLivre': 'legistic_book',
    'Ltitre': 'legistic_title',
    'Lsection': 'legistic_section',
    'LSous-Section': 'legistic_subsection',
}
|
|
|
|
# registry of the styles known for the document being converted; it is
# filled from //office:automatic-styles and read by the handle_* functions
STYLES = dict()

# module-wide counter used to number paragraphs of "Paragraph Numbering"
# lists; it is kept between lists so numbering can be continued
paragraph_numbering = 0
|
|
class SkipElement(Exception):
    '''
    Raised while converting an inline element to tell the caller that the
    element must be left out of the output.
    '''
|
|
# Number of points per unit.  The historical factor for centimetres (28.45)
# matches TeX points (72.27pt per inch), the other factors follow the same
# convention so all widths stay consistent with each other.
_PT_PER_UNIT = (
    ('cm', 28.45),
    ('mm', 2.845),
    ('in', 72.27),
    ('pc', 12.0),
    ('pt', 1.0),
)


def convert_to_pt(x):
    '''
    Convert a length written as a number followed by a unit (for example
    '2.5cm' or '12pt') to a number of points, returned as a float.

    Known units are cm, mm, in, pc and pt; surrounding whitespace is
    ignored.  NotImplementedError is raised for any other unit, and
    ValueError when the numeric part cannot be parsed.
    '''
    length = x.strip()
    for unit, factor in _PT_PER_UNIT:
        if length.endswith(unit):
            return float(length[:-len(unit)]) * factor
    raise NotImplementedError('unknown length unit in %r' % (x,))
|
|
def create_metadata(metadata_element, metadata_tree, content_tree):
    '''
    Create metadata section (/book/metadata), filled with data from
    OpenDocument meta.xml elements and content.xml user fields.

    metadata_element is the element receiving the <property> children,
    metadata_tree and content_tree are the parsed meta.xml and content.xml.
    User fields from content.xml take precedence over user-defined metadata
    of the same name, as each name is only emitted once.
    '''

    def add_property(name, text, custom=False):
        # all metadata are serialized as <property name="..."> elements,
        # custom ones being flagged with type="custom"
        prop = ET.SubElement(metadata_element, 'property')
        prop.set('name', name)
        if custom:
            prop.set('type', 'custom')
        prop.text = text

    titles = metadata_tree.findall('{%s}meta/{%s}title' % (OFFICE_NS, DC_NS))
    if titles:
        add_property('title', titles[0].text)
    elif debug:
        sys.stderr.write('W: missing title\n')

    done_fields = set()

    # a relative './/' path is used, as absolute paths on a tree are
    # deprecated by ElementTree (it rewrites them to this form, with a
    # warning)
    for user_field in content_tree.findall('.//{%s}user-field-decl' % TEXT_NS):
        attr_name = user_field.attrib.get('{%s}name' % TEXT_NS)
        if attr_name in done_fields:
            continue
        done_fields.add(attr_name)
        add_property(attr_name,
                user_field.attrib.get('{%s}string-value' % OFFICE_NS),
                custom=True)

    for user_meta in metadata_tree.findall(
            '{%s}meta/{%s}user-defined' % (OFFICE_NS, META_NS)):
        content = user_meta.text
        if not content:
            continue
        attr_name = user_meta.attrib.get('{%s}name' % META_NS)
        if attr_name in done_fields:
            continue
        done_fields.add(attr_name)
        add_property(attr_name, content, custom=True)

    keywords = metadata_tree.findall('{%s}meta/{%s}keyword' % (OFFICE_NS, META_NS))
    if keywords:
        add_property('keyword', keywords[0].text)
|
|
|
|
def append_remaining_text(para, *texts):
    '''
    Append text at the current end of para.

    The non-empty strings of texts are concatenated (None and empty values
    are ignored); the result goes to the tail of the last child of para
    when there is one, and to the text of para otherwise.  Nothing is
    changed when there is no text to add.
    '''
    text = ''.join([x for x in texts if x])
    if not text:
        return
    if len(para):
        # index the element directly, getchildren() copies the list of
        # children on every call and is gone from recent ElementTree
        last_child = para[-1]
        last_child.tail = (last_child.tail or '') + text
    else:
        para.text = (para.text or '') + text
|
|
|
|
def handle_little_span(para, child, invert_bg):
    '''
    Convert an inline element without children (child) and add the result
    at the end of para.

    The automatic style of child is looked up in STYLES; its properties are
    turned into nested <emphasis role="..."> elements.  When the style is
    not known or has no properties the text is copied as is.

    SkipElement is raised when the content must be left out: either child
    has a background colour, or invert_bg is set (the paragraph itself has
    a background colour) and child does not explicitly unset it.  The text
    following child (its tail) is kept in all cases.
    '''
    style_name = child.attrib.get('{%s}style-name' % TEXT_NS)

    if 'props' not in STYLES.get(style_name, {}):
        if debug:
            sys.stderr.write('W: unknown style name: %s\n' % style_name)
        append_remaining_text(para, child.text, child.tail)
        return

    # known style, pile up emphasis elements as legistic emphasis
    # element are limited to a single role
    props = STYLES.get(style_name).get('props')

    if child.text and child.text.endswith(' '):
        # move the trailing space out of the emphasized text
        child.text = child.text.rstrip()
        child.tail = ' ' + (child.tail or '')

    # elements with a background colour are ignored, they are used to
    # communicate between writers; the same goes for everything in a
    # paragraph with a background colour, unless explicitly unset
    if 'background-color' in props or (
            invert_bg and 'no-background-color' not in props):
        if child.tail:
            append_remaining_text(para, '', child.tail)
        raise SkipElement()

    outer_emph = None
    inner_emph = None
    for role in props:
        if role == 'no-background-color':
            continue
        emph = ET.Element('emphasis')
        emph.attrib['role'] = role
        if outer_emph is None:
            outer_emph = emph
        else:
            inner_emph.append(emph)
        inner_emph = emph

    if outer_emph is not None:
        para.append(outer_emph)
        inner_emph.text = child.text
        outer_emph.tail = child.tail
    else:
        # nothing to emphasize; the tail has to be copied too, or the text
        # following the span would be lost
        append_remaining_text(para, child.text, child.tail)
|
|
|
|
def handle_text_box(para, child):
    '''
    Convert a draw:text-box (child) and add the result to para.

    draw:text-box is used to attach a caption to a figure, it will have a
    text:p as only element; that paragraph holds the draw:frame with the
    image, and a text:sequence whose tail is the text of the caption.  The
    images are added to para, followed by a <caption> element.
    '''
    children = list(child)
    if not children:
        if debug:
            sys.stderr.write('W: text-box without child\n')
        return
    if len(children) > 1 and debug:
        sys.stderr.write('W: text-box with more than one child\n')

    textboxchild = children[0]
    if textboxchild.tag != '{%s}p' % TEXT_NS:
        if debug:
            sys.stderr.write('W: text-box with non <p> child\n')
        return

    for frame in textboxchild.findall('{%s}frame' % DRAW_NS):
        fill_inline(para, frame)
        image = para.find('imageobject/imagedata')
        if image is None:
            # the frame did not produce any image, no size to set
            continue
        width = frame.attrib.get('{%s}width' % SVG_NS)
        height = frame.attrib.get('{%s}height' % SVG_NS)
        if width:
            image.attrib['width'] = width
        if height:
            image.attrib['depth'] = height

    # scan for a text:sequence
    caption = ET.Element('caption')
    para.append(caption)
    for node in list(textboxchild):
        if node.tag != '{%s}sequence' % TEXT_NS:
            continue
        ref_name = node.attrib.get('{%s}ref-name' % TEXT_NS)
        if ref_name:
            caption.attrib['id'] = ref_name
        # the caption text follows the sequence number, after a colon
        caption_text = (node.tail or '').strip()
        if caption_text.startswith(':'):
            caption_text = caption_text[1:].strip()
        paracaption = ET.SubElement(caption, 'para')
        paracaption.text = caption_text
|
|
|
|
def fill_inline(para, elem, invert_bg=False):
    '''
    Fill a block element (para, title, etc.) with its inline elements (mostly
    emphasis, but also footnotes).

    para is the .legi element being filled, elem the OpenDocument element
    whose text and children are converted.  invert_bg is set when the
    enclosing paragraph has a background colour, it is passed down to
    handle_little_span.  The tail of elem itself is not handled here, it is
    up to the caller.
    '''

    if elem.text:
        # text is appended after what is already there, this matters when
        # this function is called again for a nested element
        append_remaining_text(para, elem.text.strip('\n'))

    span_tag = '{%s}span' % TEXT_NS
    note_tag = '{%s}note' % TEXT_NS

    for child in list(elem):
        tag = child.tag

        if tag == span_tag and len(child):
            fill_inline(para, child, invert_bg=invert_bg)
            append_remaining_text(para, child.tail)

        elif tag == span_tag:
            try:
                handle_little_span(para, child, invert_bg=invert_bg)
            except SkipElement:
                continue

        elif tag == '{%s}a' % TEXT_NS:
            link_children = list(child)
            if len(link_children) == 1 and (
                    len(link_children[0]) and
                    link_children[0][0].tag == note_tag):
                # footnote copy/pasted from a Microsoft Word document, all
                # elements get embedded in <text:a><text:span> tags; this calls
                # back fill_inline from those, to get straight to the footnote
                fill_inline(para, link_children[0])
                # keep the text that follows the footnote
                append_remaining_text(para, link_children[0].tail, child.tail)
            elif len(link_children) == 1 and (
                    link_children[0].tag == note_tag):
                # another situation that happens with some copy/pasting from
                # Microsoft Word
                fill_inline(para, child)
                append_remaining_text(para, child.tail)
            else:
                # NOTE(review): only the text and tail of the link are
                # converted, elements nested in a plain link are ignored.
                try:
                    handle_little_span(para, child, False)
                except SkipElement:
                    continue

        elif tag == note_tag:
            footnote = ET.Element('footnote')
            para.append(footnote)
            note_body = child.find('{%s}note-body' % TEXT_NS)
            if note_body is not None:
                for foot_elem in list(note_body):
                    handle_elem(footnote, foot_elem)
            footnote.tail = child.tail

        elif tag in ('{%s}sequence' % TEXT_NS, '{%s}s' % TEXT_NS):
            append_remaining_text(para, child.text, child.tail)

        elif tag == '{%s}sequence-ref' % TEXT_NS:
            xref = ET.Element('xref')
            xref.attrib['linkend'] = child.attrib.get('{%s}ref-name' % TEXT_NS)
            para.append(xref)
            xref.tail = child.tail

        elif tag == '{%s}line-break' % TEXT_NS:
            br = ET.ProcessingInstruction('line-break')
            para.append(br)
            br.tail = child.tail

        elif tag == '{%s}frame' % DRAW_NS:
            mediaobject = ET.Element('mediaobject')
            para.append(mediaobject)
            fill_inline(mediaobject, child)
            image = mediaobject.find('imageobject/imagedata')
            if image is not None:
                # a frame may hold something else than an image, in which
                # case there is nothing to set the size on
                width = child.attrib.get('{%s}width' % SVG_NS)
                height = child.attrib.get('{%s}height' % SVG_NS)
                if width:
                    image.attrib['width'] = width
                if height:
                    image.attrib['depth'] = height

        elif tag == '{%s}image' % DRAW_NS:
            fileref = child.attrib.get('{%s}href' % XLINK_NS)
            if not fileref:
                # no file to refer to
                if debug:
                    sys.stderr.write('W: image without href\n')
                continue
            imageobject = ET.Element('imageobject')
            imagedata = ET.SubElement(imageobject, 'imagedata')
            imagedata.attrib['fileref'] = os.path.basename(fileref)
            if fileref.endswith('.jpg'):
                imagedata.attrib['format'] = 'JPG'
            elif fileref.endswith('.png'):
                imagedata.attrib['format'] = 'PNG'
            para.append(imageobject)

        elif tag == '{%s}text-box' % DRAW_NS:
            handle_text_box(para, child)

        else:
            if debug and tag not in (
                    '{%s}soft-page-break' % TEXT_NS,
                    '{%s}annotation' % OFFICE_NS):
                sys.stderr.write('W: got unknown %s in paragraph\n' % tag)
            append_remaining_text(para, child.tail)
|
|
|
|
def handle_paragraph(parent, elem):
    '''
    Convert a text:p element (elem) and add the result to parent.

    Depending on the style of the paragraph the result is a <subtitle>, a
    <note> holding a <para>, the <title> of the table that comes just
    before, or a simple <para>.  Paragraphs that end up empty are removed.
    Paragraphs with a background colour are ignored, except for the parts
    of them where the background colour is explicitly unset.
    '''

    orig_style = style = elem.attrib.get('{%s}style-name' % TEXT_NS)
    if 'parent' in STYLES.get(style, {}):
        style = STYLES.get(style).get('parent')
    # properties of the style the paragraph refers to, before it got
    # replaced by its parent
    own_style = STYLES.get(orig_style, {})

    align = None
    if style == 'Para_20_Right':
        align = 'right'
    elif style == 'Para_20_Center':
        align = 'center'
    align = own_style.get('align', align)

    invert_bg = False
    if 'background-color' in STYLES.get(style, {}).get('props', []) or \
            'background-color' in own_style.get('props', []):
        # ignore elements with a background colour, they are used to
        # communicate between writers
        if not len(elem):
            return
        # do not abort yet, as the paragraph may contain text:span with the
        # background colour explicitly unset
        invert_bg = True

    table = None
    if style == 'SousTitre':
        para = ET.SubElement(parent, 'subtitle')
    elif style == 'Note':
        note = ET.SubElement(parent, 'note')
        para = ET.SubElement(note, 'para')
    elif style == 'Table':
        # title for previous table
        if len(parent):
            table = parent[-1]
        if table is not None and table.tag == 'table':
            title = ET.Element('title')
            table.insert(0, title)
            para = title
            # scan the children for a text:sequence/text:ref-name, to be
            # used as identifier
            for child in list(elem):
                if child.tag != '{%s}sequence' % TEXT_NS:
                    continue
                title.attrib['id'] = child.attrib['{%s}ref-name' % TEXT_NS]
        else:
            # uh oh, strange, should have been a table
            para = ET.SubElement(parent, 'para')
    else:
        # simple paragraph
        para = ET.SubElement(parent, 'para')
        if align == 'center':
            para.attrib['role'] = 'center'
        if align == 'right':
            para.attrib['role'] = 'right'
        if style == 'Larttitre':
            para.attrib['role'] = 'legistic_manualarticle'
        if style == 'NoteTableDesMatieres':
            para.attrib['role'] = 'note_table_des_matieres'

    if align and parent.tag == 'entry':
        parent.attrib['role'] = align

    fill_inline(para, elem, invert_bg=invert_bg)

    if not para.text:
        para_text = None
    else:
        para_text = para.text.replace(' ', '').replace('\t', '').replace(u'\xa0', '')

    if not (para_text or para.tail or len(para)):
        # remove empty paragraphs
        if para in list(parent):
            parent.remove(para)
        elif style == 'Table':
            # the paragraph is the title that was inserted in the table
            table.remove(para)
        return

    if own_style.get('align'):
        para.attrib['align'] = own_style.get('align')
    if own_style.get('margin-left') == 'true':
        para.attrib['margin-left'] = 'true'
    if style == 'Table':
        # if it's a table, the title will start with "Tableau N:", but
        # it will be renumbered in LaTeX, so we remove that heading.
        # (the text may be missing when the title starts with an element)
        if para.text and ':' in para.text:
            para.text = para.text[para.text.index(':')+1:].strip()

    # the style change italic/bold status: wrap the whole content of the
    # paragraph in an emphasis element for each property
    for prop in own_style.get('props') or []:
        if prop == 'background-color':
            continue
        t_emph = ET.Element('emphasis')
        t_emph.attrib['role'] = prop
        t_emph.text, para.text = para.text, None
        t_emph.tail, para.tail = para.tail, None
        # children are moved with the public API only, the internal list
        # of children is not available in all ElementTree implementations
        children = list(para)
        del para[:]
        for sub in children:
            t_emph.append(sub)
        para.append(t_emph)
|
|
def handle_list(parent, elem):
    '''
    Convert a text:list element (elem) and add the result to parent.

    Lists using the Paragraph_20_Numbering style are converted to normal
    paragraphs prefixed with a number; other lists are converted to a
    <para> holding an <orderedlist> or an <itemizedlist>, according to the
    name of the style or to the numbering format defined for the level.

    A "parent" attribute is set on the list items and their children while
    walking through them, it is used to compute the level of nested lists.
    '''
    global paragraph_numbering

    list_tag = '{%s}list' % TEXT_NS
    item_tag = '{%s}list-item' % TEXT_NS
    style_attr = '{%s}style-name' % TEXT_NS

    style = elem.attrib.get(style_attr)
    level = 1
    style_props = None
    if style is None:
        # means our parent was also a list; walk up to the outermost list
        # to get the style, counting levels on the way
        ancestor = elem
        while True:
            try:
                gdparent = ancestor.parent.parent
            except AttributeError:
                break
            if gdparent.tag != list_tag:
                break
            level += 1
            ancestor = gdparent
        ancestorstyle = ancestor.attrib.get(style_attr)
        if ancestorstyle is not None:
            # (no style at all happens for lists without a style name)
            style_props = STYLES.get('LIST:' + ancestorstyle)
    else:
        if 'parent' in STYLES.get(style, {}):
            style = STYLES.get(style).get('parent')
        style_props = STYLES.get('LIST:' + style)

    level_props = {}
    if style_props:
        level_props = style_props.get('levels', {}).get(level, {})
    num_format = level_props.get('format')
    bullet = level_props.get('bullet')

    if style == 'Paragraph_20_Numbering':
        # paragraph numbering is special, as we want it to get out as normal
        # paragraphs, but numbered
        continue_numbering = elem.attrib.get('{%s}continue-numbering' % TEXT_NS)
        if continue_numbering != 'true':
            paragraph_numbering = 0
        for item in elem.findall(item_tag):
            item.parent = elem
            for child in list(item):
                child.parent = item
                paragraph_numbering += 1
                number = u'%s. ' % paragraph_numbering
                if child.text is None:
                    child.text = number
                else:
                    child.text = number + child.text
                handle_elem(parent, child)
        return

    para = ET.SubElement(parent, 'para')

    if style in ('Liste_20_Alpha', 'AlphaList') or num_format == 'a':
        list_elem = ET.SubElement(para, 'orderedlist')
        list_elem.attrib['continuation'] = 'restarts'
        list_elem.attrib['numeration'] = 'loweralpha'
    elif style in ('Liste_20_Num', 'NumList') or num_format == '1':
        list_elem = ET.SubElement(para, 'orderedlist')
        list_elem.attrib['continuation'] = 'restarts'
        list_elem.attrib['numeration'] = 'arabic'
    else:
        list_elem = ET.SubElement(para, 'itemizedlist')

    for item in elem.findall(item_tag):
        item.parent = elem
        listitem = ET.SubElement(list_elem, 'listitem')
        if bullet and bullet != '-':
            listitem.attrib['bullet'] = bullet
        for child in list(item):
            child.parent = item
            handle_elem(listitem, child)
|
|
|
|
def _signature_para_is_italic(para):
    '''Tell if the automatic style of para sets the italic property.'''
    style_name = para.attrib.get('{%s}style-name' % TEXT_NS)
    if style_name and 'props' in STYLES.get(style_name, {}):
        props = STYLES.get(style_name).get('props')
        return bool(props and 'italic' in props)
    return False


def _signature_para_is_empty(para):
    '''Tell if para is missing, or has neither text nor children.'''
    return para is None or (para.text is None and not len(para))


def _add_signature_colspec(tgroup, number, width):
    '''Add the <colspec> for column number (1 or 2) to tgroup.'''
    colspec = ET.SubElement(tgroup, 'colspec')
    colspec.attrib['colname'] = 'C%s' % number
    colspec.attrib['colnum'] = '%s' % number
    colspec.attrib['colwidth'] = width


def _add_empty_entry(trow):
    '''Add an <entry> with an empty <para> to trow.'''
    tcell = ET.SubElement(trow, 'entry')
    ET.SubElement(tcell, 'para')


def handle_signature_table(parent, elem):
    '''
    Convert a table of signatures (elem) to an <informaltable> with two
    columns, added to parent.

    Cells that are empty are ignored; when a single column remains an
    empty first column is added, to mark indentation.  Cells written in
    italic get their content wrapped in an <emphasis> element.  An empty
    row is added after each row.
    '''
    row_tag = '{%s}table-row' % TABLE_NS
    cell_tag = '{%s}table-cell' % TABLE_NS
    para_tag = '{%s}p' % TEXT_NS

    table_with_titles = False
    number_columns = 0
    for row in elem.findall(row_tag):
        cells = row.findall(cell_tag)
        # number of non empty cells; the value for the last row is the one
        # that is kept
        number_columns = len(cells)
        for cell in cells:
            para = cell.find(para_tag)
            if _signature_para_is_empty(para):
                number_columns -= 1
            if para is not None and _signature_para_is_italic(para):
                table_with_titles = True

    table = ET.SubElement(parent, 'informaltable')
    tgroup = ET.SubElement(table, 'tgroup')
    tgroup.attrib['cols'] = '2'

    if number_columns == 1:
        # create a fake column, to mark indentation
        if table_with_titles:
            _add_signature_colspec(tgroup, 1, '46pt')
        else:
            _add_signature_colspec(tgroup, 1, '22pt')
    else:
        _add_signature_colspec(tgroup, 1, '92.1pt')
    _add_signature_colspec(tgroup, 2, '92.1pt')

    tbody = ET.SubElement(tgroup, 'tbody')
    for row in elem.findall(row_tag):
        trow = ET.SubElement(tbody, 'row')
        if number_columns == 1:
            # empty cell for first (fake) column
            _add_empty_entry(trow)

        for cell in row.findall(cell_tag):
            para = cell.find(para_tag)
            if _signature_para_is_empty(para):
                continue
            tcell = ET.SubElement(trow, 'entry')
            tpara = ET.SubElement(tcell, 'para')
            if _signature_para_is_italic(para):
                tpara = ET.SubElement(tpara, 'emphasis')
            fill_inline(tpara, para)

        # add empty interline
        trow = ET.SubElement(tbody, 'row')
        _add_empty_entry(trow)
        if number_columns == 1:
            _add_empty_entry(trow)
|
|
|
|
def _is_signature_table(elem):
    '''
    Tell if the table is a table of signatures, that is if the first
    paragraph of the first cell of its first row has the Signature style
    (directly, or as parent style).
    '''
    row = elem.find('{%s}table-row' % TABLE_NS)
    if row is None:
        return False
    cell = row.find('{%s}table-cell' % TABLE_NS)
    if cell is None:
        return False
    p = cell.find('{%s}p' % TEXT_NS)
    if p is None:
        return False
    p_style_name = p.attrib.get('{%s}style-name' % TEXT_NS)
    if p_style_name == 'Signature':
        return True
    return bool(STYLES.get(p_style_name) and
                STYLES[p_style_name].get('parent') == 'Signature')


def _fill_table_row(trow, row, strip_align=False):
    '''
    Add an <entry> to trow for each table:table-cell of row.

    Cells spanning several columns get namest/nameend attributes; columns
    are counted taking into account the cells that were spanned earlier in
    the row.  With strip_align, the align attribute of the last element of
    each entry is removed.
    '''
    column = 1
    for cell in row.findall('{%s}table-cell' % TABLE_NS):
        tcell = ET.SubElement(trow, 'entry')
        for child in list(cell):
            handle_elem(tcell, child)
        if strip_align and len(tcell) and tcell[-1].attrib.get('align'):
            # remove align attribute on cell paragraphs
            del tcell[-1].attrib['align']

        span = 1
        columns_spanned = cell.attrib.get('{%s}number-columns-spanned' % TABLE_NS)
        if columns_spanned:
            span = int(columns_spanned)
            tcell.attrib['namest'] = 'col%s' % column
            tcell.attrib['nameend'] = 'col%s' % (column + span - 1)
        column += span


def handle_table(parent, elem):
    '''
    Convert a table:table element (elem) and add the result to parent.

    Tables of signatures are passed to handle_signature_table; other
    tables are converted to a <table> with a <tgroup> holding a <colspec>
    for each column, an optional <thead> and a <tbody>.
    '''
    # the check is done apart from the conversion, so an error happening
    # while converting a table of signatures is not mistaken for a table
    # that has no first paragraph
    if _is_signature_table(elem):
        return handle_signature_table(parent, elem)

    row_path = '{%s}table-row' % TABLE_NS
    cell_path = '{%s}table-cell' % TABLE_NS
    para_path = '{%s}p' % TEXT_NS
    spanned_attr = '{%s}number-columns-spanned' % TABLE_NS

    table = ET.SubElement(parent, 'table')
    cols = elem.findall('{%s}table-columns/{%s}table-column' % (TABLE_NS, TABLE_NS))
    if not cols:
        cols = elem.findall('{%s}table-column' % TABLE_NS)
    nbcols = 0
    tgroup = ET.SubElement(table, 'tgroup')

    # cells of the rows that are direct children of the table, used to
    # look for the alignment of columns
    cells_by_row = [row.findall(cell_path) for row in elem.findall(row_path)]

    # count columns, and generate colspecs
    for c in cols:
        nb_new_cols = int(c.attrib.get('{%s}number-columns-repeated' % TABLE_NS, 1))
        style_name = c.attrib.get('{%s}style-name' % TABLE_NS)
        col_width = None
        if style_name and style_name in STYLES:
            col_width = STYLES.get(style_name).get('column-width')

        for i in range(nb_new_cols):
            # the alignment of the column is the one of the first cell
            # that defines one
            alignment = None
            offset = 0
            for cells in cells_by_row:
                try:
                    cell = cells[nbcols+i+offset]
                except IndexError:
                    # somehow it was impossible to get to that cell, ignore that
                    break
                columns_spanned = cell.attrib.get(spanned_attr)
                if columns_spanned:
                    offset -= int(columns_spanned) + 1
                p = cell.find(para_path)
                if p is None:
                    continue
                p_style_name = p.attrib.get('{%s}style-name' % TEXT_NS)
                if not p_style_name:
                    continue
                p_style = STYLES.get(p_style_name)
                if not p_style:
                    continue
                alignment = p_style.get('align')
                if alignment:
                    break

            colspec = ET.SubElement(tgroup, 'colspec')
            colspec.attrib['colnum'] = str(nbcols + i + 1)
            colspec.attrib['colname'] = 'col%s' % colspec.attrib['colnum']
            if col_width:
                colspec.attrib['colwidth'] = '%spt' % col_width
            if alignment:
                colspec.attrib['align'] = alignment
        nbcols += nb_new_cols
    tgroup.attrib['cols'] = '%s' % nbcols

    # pass over all colspecs to set a width if it was not set before
    for colspec in list(tgroup):
        if 'colwidth' not in colspec.attrib:
            # (A4 width in pts - some margin) / nbcols, as an integer
            colspec.attrib['colwidth'] = '%spt' % (500 // nbcols)

    header_rows = elem.findall(
            '{%s}table-header-rows/{%s}table-row' % (TABLE_NS, TABLE_NS))
    if header_rows:
        thead = ET.SubElement(tgroup, 'thead')
        for row in header_rows:
            _fill_table_row(ET.SubElement(thead, 'row'), row)

    rows = elem.findall('{%s}table-rows/{%s}table-row' % (TABLE_NS, TABLE_NS))
    if not rows:
        rows = elem.findall(row_path)

    if rows:
        first_row = rows[0]
        is_heading = False
        for cell in first_row.findall(cell_path):
            for cell_child in list(cell):
                if cell_child.tag != para_path:
                    continue
                if cell_child.attrib.get('{%s}style-name' % TEXT_NS) == 'Table_20_Heading':
                    is_heading = True
                    break
            if is_heading:
                break

        if is_heading:
            # this is actually a title line
            thead = ET.SubElement(tgroup, 'thead')
            _fill_table_row(ET.SubElement(thead, 'row'), first_row)
            rows = rows[1:]

    tbody = ET.SubElement(tgroup, 'tbody')
    for row in rows:
        _fill_table_row(ET.SubElement(tbody, 'row'), row, strip_align=True)

    return
|
|
|
|
|
|
def handle_elem(parent, elem):
    '''
    Convert a block element (paragraph, list or table) and add the result
    to parent; other elements are ignored, with a warning in debug mode.
    '''
    converters = {
        '{%s}p' % TEXT_NS: handle_paragraph,
        '{%s}list' % TEXT_NS: handle_list,
        '{%s}table' % TABLE_NS: handle_table,
    }

    converter = converters.get(elem.tag)
    if converter is not None:
        return converter(parent, elem)

    if debug:
        print >> sys.stderr, 'W: unhandled element:', elem.tag
|
|
|
|
def look_for_annotation(elem):
    '''
    Look for an annotation, in children and subchildren.

    The children of elem are looked at first, then the search goes on in
    each of them, in document order.  The first office:annotation element
    found is returned, or None when there is none.
    '''
    annotation_tag = '{%s}annotation' % OFFICE_NS
    children = list(elem)

    for child in children:
        if child.tag == annotation_tag:
            return child

    for child in children:
        speaker_annotation = look_for_annotation(child)
        # explicit comparison to None, as an element without children is
        # considered false
        if speaker_annotation is not None:
            return speaker_annotation

    return None
|
|
|
|
def convert(input_filename, output_filename):
    '''
    Convert a file from the OpenDocument Format to the legacy .legi format.

    input_filename is the OpenDocument file to read, output_filename the
    .legi file to create; it is a zip file with the converted document as
    contents.xml, and a copy of the pictures of the document.

    ValueError is raised if the input file has no content.xml.
    '''
    # rw-rw-r-- (octal 664), stored in the upper 16 bits of the external
    # attributes of the zip entries
    file_mode = 0x1B4 << 16

    z = zipfile.ZipFile(input_filename)
    try:
        # get content.xml and meta.xml from file
        names = z.namelist()
        content = None
        metadata = None
        if 'content.xml' in names:
            content = z.read('content.xml')
        if 'meta.xml' in names:
            metadata = z.read('meta.xml')
        if content is None:
            raise ValueError('%s: no content.xml in file' % input_filename)

        legi = convert_to_legi_xml(content, metadata)
        if debug:
            sys.stdout.write('%s\n' % legi)

        # add XML prolog, necessary for some legacy Tabellio tools
        legi = '<?xml version="1.0"?>\n' + legi

        # write down content to the .legi file
        legiz = zipfile.ZipFile(output_filename, 'w')
        try:
            zi = zipfile.ZipInfo('contents.xml')
            zi.external_attr = file_mode
            legiz.writestr(zi, legi)

            # copy pictures to the .legi file
            for zfile in names:
                if not zfile.startswith('Pictures/'):
                    continue
                picture_name = os.path.basename(zfile)
                if not picture_name:
                    # entry for the directory itself
                    continue
                zi = zipfile.ZipInfo(picture_name)
                zi.external_attr = file_mode
                legiz.writestr(zi, z.read(zfile))
        finally:
            # required for the zip directory to be written, and done even
            # in case of error so the file is not left open
            legiz.close()
    finally:
        z.close()
|
|
def _has_left_margin(value):
    '''
    Tell if value, a length such as '0cm' or '1.5in', is not zero.
    Values that cannot be parsed are considered as a margin.
    '''
    number = value.strip().rstrip('abcdefghijklmnopqrstuvwxyz%')
    try:
        return float(number) != 0
    except ValueError:
        return True


def parse_automatic_styles(content_tree):
    '''
    Parse styles created automatically and populate the global styles
    dictionary.

    For each style of office:automatic-styles an entry is added to STYLES,
    with the name of the style as key and a dictionary as value, with those
    optional keys: 'props' (list of inline properties: underline, italic,
    bold, background-color, no-background-color), 'page-break', 'align',
    'margin-left', 'column-width' (in points) and 'parent'.

    List styles are added with 'LIST:' prepended to their name, the value
    has a 'levels' key, a dictionary with the level as key and a dictionary
    with optional 'format' and 'bullet' keys as value.
    '''
    no_background = (None, 'transparent', '#ffffff')

    for elem in content_tree.findall(
            '{%s}automatic-styles/{%s}style' % (OFFICE_NS, STYLE_NS)):
        name = elem.attrib.get('{%s}name' % STYLE_NS)
        family = elem.attrib.get('{%s}family' % STYLE_NS)
        props = []
        style = STYLES[name] = {}

        # get inline italic, bold and underline attributes
        if family in ('paragraph', 'text'):
            for prop in elem.findall('{%s}text-properties' % STYLE_NS):
                if prop.attrib.get('{%s}font-style' % FO_NS) == 'italic':
                    props.append('italic')
                if prop.attrib.get('{%s}font-weight' % FO_NS) == 'bold':
                    props.append('bold')
                if prop.attrib.get('{%s}text-underline-style' % STYLE_NS) == 'solid':
                    props.insert(0, 'underline')
                background = prop.attrib.get('{%s}background-color' % FO_NS)
                if background:
                    if background not in no_background:
                        props.append('background-color')
                    elif family != 'paragraph':
                        # background colour explicitly unset in a span
                        props.append('no-background-color')

            for prop in elem.findall('{%s}paragraph-properties' % STYLE_NS):
                if prop.attrib.get('{%s}break-before' % FO_NS) == 'page':
                    style['page-break'] = True

                text_align = prop.attrib.get('{%s}text-align' % FO_NS)
                if text_align == 'center':
                    style['align'] = 'center'
                elif text_align in ('end', 'right'):
                    style['align'] = 'right'

                # the margin may be written in any unit
                if _has_left_margin(prop.attrib.get('{%s}margin-left' % FO_NS, '0cm')):
                    style['margin-left'] = 'true'

        if family == 'table-column':
            for prop in elem.findall('{%s}table-column-properties' % STYLE_NS):
                column_width = prop.attrib.get('{%s}column-width' % STYLE_NS)
                if not column_width:
                    continue
                try:
                    style['column-width'] = convert_to_pt(column_width)
                except (NotImplementedError, ValueError):
                    # width that cannot be converted, it is ignored
                    # instead of aborting the conversion of the document
                    if debug:
                        sys.stderr.write(
                                'W: unsupported column width: %s\n' % column_width)

        if props:
            style['props'] = props

        parent_style = elem.attrib.get('{%s}parent-style-name' % STYLE_NS)
        if parent_style:
            style['parent'] = parent_style

    # parse automatic list styles
    for elem in content_tree.findall(
            '{%s}automatic-styles/{%s}list-style' % (OFFICE_NS, TEXT_NS)):
        style_name = 'LIST:%s' % elem.attrib.get('{%s}name' % STYLE_NS)
        levels = {}
        STYLES[style_name] = {'levels': levels}
        for level in elem.findall('{%s}list-level-style-number' % TEXT_NS):
            num_level = int(level.attrib.get('{%s}level' % TEXT_NS))
            levels[num_level] = {
                'format': level.attrib.get('{%s}num-format' % STYLE_NS),
            }
        for level in elem.findall('{%s}list-level-style-bullet' % TEXT_NS):
            num_level = int(level.attrib.get('{%s}level' % TEXT_NS))
            if not levels.get(num_level):
                levels[num_level] = {}
            levels[num_level].update({
                'bullet': level.attrib.get('{%s}bullet-char' % TEXT_NS),
            })
|
|
|
|
def _push_title_level(stack, new_level, ranks):
    '''
    Open a <new_level> element at its place in a stack of open elements.

    stack is the list of currently open elements, innermost last, and ranks
    maps element tags to their depth.  When the innermost element is as deep
    as new_level, or deeper, it is closed, along with one more ancestor per
    level of difference; that extra unwinding stops when a single element is
    left on the stack.  The new element is then created as a child of the
    innermost element left, and pushed on the stack.
    '''
    current_rank = ranks[stack[-1].tag]
    new_rank = ranks[new_level]
    if current_rank >= new_rank:
        stack.pop()
        for i in range(current_rank - new_rank):
            if len(stack) == 1:
                # some levels were skipped on the way down, there are fewer
                # open elements than the difference between levels
                if debug:
                    print >> sys.stderr, 'W: would go too low'
                break
            stack.pop()
    stack.append(ET.SubElement(stack[-1], new_level))


def convert_to_legi_xml(content, metadata = None):
    '''
    Convert a content.xml/metadata.xml pair from an odt file
    to the legi XML format.

    content is the serialized content document, metadata the serialized
    metadata document; when metadata is empty or None no <metadata> element
    is created.  Returns the <book> document, as serialized by ElementTree.
    '''

    global STYLES

    # create top <book> element
    legi = ET.Element('book')

    # parse content
    content_tree = ET.ElementTree(ET.fromstring(content))

    if metadata:
        metadata_tree = ET.ElementTree(ET.fromstring(metadata))
        # create child <metadata> element
        metadata_element = ET.SubElement(legi, 'metadata')
        create_metadata(metadata_element, metadata_tree, content_tree)

    # stack of the open structural elements, innermost last
    current_top = [legi]
    # stack of the open legistic elements; when it is not empty its first
    # item is the structural element the legistic part was attached to
    current_legi = []
    speech = None
    offstructure = None
    # level of the last title; it is not reset between elements, so a heading
    # with an unknown style is filed at the level of the previous title
    new_level = None

    levels = {'book':0, 'part':1, 'chapter':2, 'preface':2, 'synthese':2, 'sect1':3, 'sect2':4, 'sect3':5}
    legistic_levels = {'legistic_part': 0,
                       'legistic_book': 1,
                       'legistic_title': 2,
                       'legistic_chapter':3,
                       'legistic_section':4,
                       'legistic_subsection':5}

    parse_automatic_styles(content_tree)

    p_tag = '{%s}p' % TEXT_NS
    style_name_attr = '{%s}style-name' % TEXT_NS

    # convert content
    for elem in list(content_tree.find('{%s}body/{%s}text' % (OFFICE_NS, OFFICE_NS))):

        if elem.tag == '{%s}user-field-decls' % TEXT_NS:
            # user fields are handled as part of the metadata
            continue

        if elem.tag == '{%s}list' % TEXT_NS:
            # OOo tends to consider titles as lists, so its numbering works,
            # so look for a single paragraph item in the list, and get it out
            # of it if it exists with a title style name.
            real_elem = elem.findall('{%s}list-item/{%s}p' % (TEXT_NS, TEXT_NS))
            if len(real_elem) == 1:
                real_elem = real_elem[0]
                style = real_elem.attrib.get(style_name_attr)
                if 'parent' in STYLES.get(style, {}):
                    style = STYLES.get(style).get('parent')
                if style in ('Lchapitre', 'Lsection', 'Lpart', 'LLivre', 'Ltitre'):
                    elem = real_elem

        # some paragraphs are out of title hierarchy but should nevertheless
        # be handled as title in the final output
        is_p_title = False
        if elem.tag == p_tag:
            style = elem.attrib.get(style_name_attr)

            if STYLES.get(style, {}).get('page-break') is True:
                pb = ET.ProcessingInstruction('page-break')
                current_top[-1].append(pb)

            if 'parent' in STYLES.get(style, {}):
                style = STYLES.get(style).get('parent')

            if style in PARAGRAPH_TITLE_STYLES:
                new_level = PARAGRAPH_TITLE_STYLES.get(style)
                is_p_title = True

            # NOTE(review): annotations are only looked up on paragraphs
            # here; confirm this nesting against the upstream file.
            speaker_annotation = look_for_annotation(elem)
            if speaker_annotation is not None:
                if speech is not None: # there was a speech, pop it
                    current_top.pop()
                speech = ET.SubElement(current_top[-1], 'speech')
                current_top.append(speech)
                ref = ET.SubElement(speech, 'ref')
                for param in list(speaker_annotation):
                    if param.tag != p_tag:
                        continue
                    try:
                        # split on the first colon only, so values with
                        # colons of their own (URLs, times) are kept whole
                        arg, value = [x.strip() for x in param.text.split(':', 1)]
                    except (AttributeError, ValueError):
                        # annotation paragraph was not "key: value"; consider
                        # it as a comment and ignore it.
                        continue
                    if arg == 'type':
                        ref.attrib['type'] = value
                    else:
                        legi_param = ET.SubElement(ref, 'param')
                        legi_param.attrib['name'] = arg
                        legi_param.text = value

        # handle all titles
        if elem.tag == '{%s}h' % TEXT_NS or is_p_title:
            style = elem.attrib.get('{%s}style' % TEXT_NS)
            if not style:
                style = elem.attrib.get(style_name_attr)

            if 'parent' in STYLES.get(style, {}):
                style = STYLES.get(style).get('parent')

            if style in TITLE_LEVELS:
                new_level = TITLE_LEVELS.get(style)
            elif style == 'Sous-Titre':
                # Subtitles are out-of-hierarchy, just adding a <subtitle> node
                # under an existing <title> node; they have no content
                if current_legi:
                    subtitle = ET.SubElement(current_legi[-1], 'subtitle')
                else:
                    subtitle = ET.SubElement(current_top[-1], 'subtitle')
                fill_inline(subtitle, elem)
                continue
            elif debug:
                # paragraph titles also end here, with the level that was
                # taken from PARAGRAPH_TITLE_STYLES above
                print >> sys.stderr, 'E: unknown heading style:', style

            if new_level is not None:
                if new_level.startswith('legistic_'):
                    # title in a legistic part
                    if current_legi:
                        # existing legistic part
                        _push_title_level(current_legi, new_level, legistic_levels)
                    else:
                        # new legistic part
                        current_legi.append(current_top[-1])
                        current_legi.append(ET.SubElement(current_top[-1], new_level))
                else:
                    # title in a "normal" part

                    # must first close current legistic part, if any
                    current_legi = []

                    if speech is not None: # there was a speech, pop it
                        current_top.pop()
                        speech = None

                    _push_title_level(current_top, new_level, levels)

                if current_legi:
                    title = ET.SubElement(current_legi[-1], 'title')
                else:
                    title = ET.SubElement(current_top[-1], 'title')

                # fill title with content
                fill_inline(title, elem)
                continue

            # heading with an unknown style and no previous title to take
            # a level from: it goes on as ordinary content

        # handle other content
        if current_legi:
            handle_elem(current_legi[-1], elem)
        elif len(current_top) > 1:
            handle_elem(current_top[-1], elem)
        else:
            # this is out of hierarchy, before any title, this should not be
            # authorized but people got used to do that for prefaces
            if len(current_top[0]) == 1 and offstructure is None:
                # the element that triggers the creation of <nosection> is
                # not itself passed to handle_elem
                offstructure = ET.SubElement(current_top[-1], 'nosection')
            elif len(current_top[0]) == 2 and offstructure is not None:
                handle_elem(offstructure, elem)
                if len(offstructure) > 0:
                    first = offstructure[0]
                    if len(first) == 1 and first.text is None and \
                            first[0].tag == 'footnote':
                        # This is a special situation, title page with a footnote
                        # (such as "Voir Doc. n°161 (2010-2011)."), the footnote
                        # would be considered part of the content, and on next
                        # conversion we would end with two footnotes, the one
                        # created from the metadata, and the one created from this
                        # <nosection> element. Therefore we detect the situation
                        # where the first item of a <nosection> is a footnote, and
                        # clear it.
                        offstructure.remove(first)

    # get content as an XML tree
    out = StringIO()
    ET.ElementTree(legi).write(out)

    return out.getvalue()
|
|
|
|
|
|
def main():
    '''
    Command line entry point.

    Takes the path of the file to convert and, optionally, the path of the
    file to write; when the latter is missing it is derived from the input
    path: a trailing .odt extension is swapped for .legi, any other name
    simply gets .legi appended, so the result never is the input file.
    '''
    global debug

    parser = OptionParser(usage = '%prog [options] INPUT [OUTPUT]')
    parser.add_option('--debug',
            action = 'store_true', dest = 'debug', default = False,
            help = 'display some output useful for debugging')
    options, args = parser.parse_args()

    if not args:
        # prints the usage and exits, instead of failing on args[0]
        parser.error('missing input file')

    debug = options.debug

    source = args[0]
    if len(args) == 2:
        target = args[1]
    else:
        base, extension = os.path.splitext(source)
        if extension.lower() == '.odt':
            target = base + '.legi'
        else:
            target = source + '.legi'

    convert(source, target)
|
|
|
|
|
|
# run the command line interface when the file is executed as a script, and
# not when it is imported as a module
if __name__ == '__main__':
    main()
|
|
|