This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
tabellioOOo/odf2legi/odf2legi.py

1120 lines
43 KiB
Python

#! /usr/bin/env python
# -*- coding: UTF-8 -*-
# TabellioOOo - OpenDocument to .legi converter
# Copyright (C) 2007-2010 Parlement de la Communauté française de Belgique
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
from cStringIO import StringIO
from optparse import OptionParser
try:
import xml.etree.ElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
import zipfile
import sys
import os
debug = False # when true, warnings are written to stderr during conversion

# OpenDocument Format namespaces, used to build '{namespace}tag' names
META_NS = 'urn:oasis:names:tc:opendocument:xmlns:meta:1.0'
DC_NS = 'http://purl.org/dc/elements/1.1/'
OFFICE_NS = 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'
TEXT_NS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
TABLE_NS = 'urn:oasis:names:tc:opendocument:xmlns:table:1.0'
STYLE_NS = 'urn:oasis:names:tc:opendocument:xmlns:style:1.0'
FO_NS = 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0'
DRAW_NS = 'urn:oasis:names:tc:opendocument:xmlns:drawing:1.0'
XLINK_NS = 'http://www.w3.org/1999/xlink'
SVG_NS = 'urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0'

# title styles and their respective .legi element; several style names
# map to the same element
TITLE_LEVELS = {
    'Partie': 'part',
    'Heading_20_1': 'part',
    'Chap': 'chapter',
    'Heading_20_2': 'chapter',
    'Sec 1': 'sect1',
    'Sec_20_1': 'sect1',
    'Heading_20_3': 'sect1',
    'Sec 1.1': 'sect2',
    'Sec_20_1.1': 'sect2',
    'Heading_20_4': 'sect2',
    'Sec 1.1.1': 'sect3',
    'Sec_20_1.1.1': 'sect3',
    'Heading_20_5': 'sect3',
    'Sec 1.1.1.1': 'sect4',
    'Sec_20_1.1.1.1': 'sect4',
    'Heading_20_6': 'sect4',
    'Lchapitre': 'legistic_chapter',
    'Lsection': 'legistic_section',
    'LSous-Section': 'legistic_subsection',
    'Lpart': 'legistic_part',
    'Llivre': 'legistic_book',
    'LLivre': 'legistic_book',
    'Ltitre': 'legistic_title',
}

# titles disguised as paragraphs (text:p instead of text:h) and their
# respective .legi elements
PARAGRAPH_TITLE_STYLES = {
    'TitrePreface': 'preface',
    'TitreSynthese': 'synthese',
    'Lchapitre': 'legistic_chapter',
    'Lpart': 'legistic_part',
    'LLivre': 'legistic_book',
    'Ltitre': 'legistic_title',
    'Lsection': 'legistic_section',
    'LSous-Section': 'legistic_subsection',
}

# dictionary with known styles, parsed from //office:automatic-styles;
# list styles are stored under a 'LIST:<name>' key
STYLES = {}

# global counter for paragraph numbering, shared between successive lists
# using the Paragraph_20_Numbering style
paragraph_numbering = 0
class SkipElement(Exception):
    '''
    Signal raised while converting an inline element to tell the caller
    that this element must be left out of the output.
    '''
def convert_to_pt(x):
    '''
    Convert an OpenDocument length string (e.g. '2.5cm', '12pt') to points.

    Supported units are cm, mm, in and pt. Centimetres keep the historical
    28.45 pt/cm factor; millimetres and inches are derived from that same
    factor so that all units stay consistent with each other.

    Returns the length as a float.

    Raises NotImplementedError when the unit is not supported, and
    ValueError when the numeric part cannot be parsed.
    '''
    points_per_cm = 28.45
    factors = {
        'cm': points_per_cm,
        'mm': points_per_cm / 10,
        'in': points_per_cm * 2.54,
        'pt': 1.0,
    }
    x = x.strip()
    factor = factors.get(x[-2:])
    if factor is None:
        raise NotImplementedError('unknown length unit in %r' % x)
    return float(x[:-2]) * factor
def create_metadata(metadata_element, metadata_tree, content_tree):
    '''
    Create metadata section (/book/metadata), filled with data from
    OpenDocument meta.xml elements and content.xml user fields.

    metadata_element -- the <metadata> element receiving <property> children
    metadata_tree -- ElementTree parsed from meta.xml
    content_tree -- ElementTree parsed from content.xml

    Properties are emitted in this order: the document title, the user
    fields declared in content.xml, the non-empty user-defined entries of
    meta.xml, and finally the keyword. A custom property name is only
    emitted once; the first occurrence wins.
    '''
    def add_property(name, text, custom=False):
        # one <property name="..." [type="custom"]>text</property> element
        prop = ET.SubElement(metadata_element, 'property')
        prop.set('name', name)
        if custom:
            prop.set('type', 'custom')
        prop.text = text

    title = metadata_tree.find('{%s}meta/{%s}title' % (OFFICE_NS, DC_NS))
    if title is None:
        if debug:
            sys.stderr.write('W: missing title\n')
    else:
        add_property('title', title.text)

    done_fields = set()

    # './/' is what ElementTree turns a leading '//' into anyway, but
    # spelling it out avoids the FutureWarning it emits for absolute paths
    for user_field in content_tree.findall('.//{%s}user-field-decl' % TEXT_NS):
        attr_name = user_field.attrib.get('{%s}name' % TEXT_NS)
        if attr_name in done_fields:
            continue
        done_fields.add(attr_name)
        add_property(attr_name,
                user_field.attrib.get('{%s}string-value' % OFFICE_NS),
                custom=True)

    for user_meta in metadata_tree.findall(
            '{%s}meta/{%s}user-defined' % (OFFICE_NS, META_NS)):
        content = user_meta.text
        if not content:
            continue
        attr_name = user_meta.attrib.get('{%s}name' % META_NS)
        if attr_name in done_fields:
            continue
        done_fields.add(attr_name)
        add_property(attr_name, content, custom=True)

    keyword = metadata_tree.find('{%s}meta/{%s}keyword' % (OFFICE_NS, META_NS))
    if keyword is not None:
        add_property('keyword', keyword.text)
def append_remaining_text(para, *texts):
    '''
    Append text at the current end of the para element.

    All non-empty values of texts are concatenated (None and '' are
    ignored). The result goes to the tail of the last child of para when
    it has children, and to para.text otherwise. Nothing is changed when
    there is no text to add.
    '''
    text = ''.join([x for x in texts if x])
    if not text:
        return
    # len() and indexing are used instead of getchildren(), which is
    # deprecated and no longer exists in recent ElementTree versions
    if len(para):
        last_child = para[-1]
        last_child.tail = (last_child.tail or '') + text
    else:
        para.text = (para.text or '') + text
def handle_little_span(para, child, invert_bg):
    '''
    Convert an inline element without children (text:span or text:a) and
    add the result at the end of para.

    para -- output element being filled
    child -- source inline element; its text and tail may be modified
    invert_bg -- true when the enclosing paragraph has a background colour,
                 in which case only spans that explicitly unset the
                 background colour are kept

    Text with an unknown style is copied as plain text. Text with a known
    style is wrapped in nested <emphasis role="..."> elements, one per
    style property.

    Raises SkipElement when the element is to be ignored; its tail text
    has already been appended to para in that case.
    '''
    style_name = child.attrib.get('{%s}style-name' % TEXT_NS)
    if 'props' not in STYLES.get(style_name, {}):
        if debug:
            sys.stderr.write('W: unknown style name: %s\n' % (style_name,))
        append_remaining_text(para, child.text, child.tail)
        return

    # known style, pile up emphasis elements as legistic emphasis
    # element are limited to a single role
    props = STYLES.get(style_name).get('props')

    # move a trailing space out of the styled text and into the tail
    if child.text and child.text.endswith(' '):
        child.text = child.text.rstrip()
        child.tail = ' ' + (child.tail or '')

    # elements with a background colour are ignored, they are used to
    # communicate between writers; inside a paragraph that has a
    # background colour, only spans that unset it are kept
    if 'background-color' in props or (
            invert_bg and 'no-background-color' not in props):
        if child.tail:
            append_remaining_text(para, '', child.tail)
        raise SkipElement()

    outer_emph = None
    inner_emph = None
    for p in props:
        if p == 'no-background-color':
            continue
        t_emph = ET.Element('emphasis')
        t_emph.attrib['role'] = p
        if outer_emph is None:
            outer_emph = t_emph
        if inner_emph is not None:
            inner_emph.append(t_emph)
        inner_emph = t_emph

    if outer_emph is not None:
        para.append(outer_emph)
        inner_emph.text = child.text
        outer_emph.tail = child.tail
    else:
        # no emphasis to create: keep the text and also the tail, which
        # would otherwise be lost as no caller appends it afterwards
        append_remaining_text(para, child.text, child.tail)
def handle_text_box(para, child):
    '''
    Convert a draw:text-box, used to attach a caption to a figure.

    para -- output element receiving the image object(s) and the caption
    child -- the draw:text-box element

    The text-box is expected to hold a single text:p, which itself holds
    the draw:frame with the image and a text:sequence whose tail text is
    the caption. Anything else is ignored (with a warning in debug mode).
    '''
    boxchildren = list(child)
    if not boxchildren:
        if debug:
            sys.stderr.write('W: text-box without child\n')
        return
    if len(boxchildren) > 1:
        if debug:
            sys.stderr.write('W: text-box with more than one child\n')
    textboxchild = boxchildren[0]
    if textboxchild.tag != '{%s}p' % TEXT_NS:
        if debug:
            sys.stderr.write('W: text-box with non <p> child\n')
        return

    # the <p> will have a <draw:frame> with the image, and tail
    # text with the caption
    for frame in textboxchild.findall('{%s}frame' % DRAW_NS):
        fill_inline(para, frame)
        width = frame.attrib.get('{%s}width' % SVG_NS)
        height = frame.attrib.get('{%s}height' % SVG_NS)
        image = para.find('imageobject/imagedata')
        if image is None:
            # frame without a converted image, nothing to size
            continue
        if width:
            image.attrib['width'] = width
        if height:
            image.attrib['depth'] = height

    # scan for a text:sequence
    caption = ET.Element('caption')
    para.append(caption)
    for sequence in textboxchild:
        if sequence.tag != '{%s}sequence' % TEXT_NS:
            continue
        ref_name = sequence.attrib.get('{%s}ref-name' % TEXT_NS)
        if ref_name is not None:
            caption.attrib['id'] = ref_name
        paracaption = ET.SubElement(caption, 'para')
        caption_text = (sequence.tail or '').strip()
        # the caption follows the figure number and a colon, drop the colon
        if caption_text.startswith(':'):
            caption_text = caption_text[1:].strip()
        paracaption.text = caption_text
def fill_inline(para, elem, invert_bg=False):
    '''
    Fill a block element (para, title, etc.) with its inline elements (mostly
    emphasis, but also footnotes)

    para -- output element being filled
    elem -- source OpenDocument element whose text and children are converted
    invert_bg -- passed on to handle_little_span for text:span children; true
                 when the enclosing paragraph has a background colour
    '''
    span_tag = '{%s}span' % TEXT_NS
    note_tag = '{%s}note' % TEXT_NS

    if elem.text:
        leading_text = elem.text.strip('\n')
        if len(para):
            last_child = para[-1]
            last_child.tail = (last_child.tail or '') + leading_text
        else:
            para.text = (para.text or '') + leading_text

    for child in elem:
        if child.tag == span_tag and len(child):
            fill_inline(para, child, invert_bg=invert_bg)
            append_remaining_text(para, child.tail)
        elif child.tag == span_tag:
            try:
                handle_little_span(para, child, invert_bg=invert_bg)
            except SkipElement:
                continue
        elif child.tag == '{%s}a' % TEXT_NS:
            link_children = list(child)
            if len(link_children) == 1 and (
                    len(link_children[0]) and
                    link_children[0][0].tag == note_tag):
                # footnote copy/pasted from a Microsoft Word document, all
                # elements get embedded in <text:a><text:span> tags; this calls
                # back fill_inline from those, to get straight to the footnote
                fill_inline(para, link_children[0])
                # keep the text that follows the span and the link
                append_remaining_text(para, link_children[0].tail, child.tail)
            elif link_children:
                # covers the other situation seen with copy/pasting from
                # Microsoft Word (<text:a><text:note>) and any other link
                # with children
                fill_inline(para, child)
                # keep the text that follows the link
                append_remaining_text(para, child.tail)
            else:
                try:
                    handle_little_span(para, child, False)
                except SkipElement:
                    continue
        elif child.tag == note_tag:
            footnote = ET.Element('footnote')
            para.append(footnote)
            note_body = child.find('{%s}note-body' % TEXT_NS)
            if note_body is not None:
                for foot_elem in note_body:
                    handle_elem(footnote, foot_elem)
            footnote.tail = child.tail
        elif child.tag == '{%s}sequence' % TEXT_NS:
            append_remaining_text(para, child.text, child.tail)
        elif child.tag == '{%s}s' % TEXT_NS:
            append_remaining_text(para, child.text, child.tail)
        elif child.tag == '{%s}sequence-ref' % TEXT_NS:
            xref = ET.Element('xref')
            xref.attrib['linkend'] = child.attrib.get('{%s}ref-name' % TEXT_NS)
            para.append(xref)
            xref.tail = child.tail
        elif child.tag == '{%s}line-break' % TEXT_NS:
            br = ET.ProcessingInstruction('line-break')
            para.append(br)
            br.tail = child.tail
        elif child.tag == '{%s}frame' % DRAW_NS:
            mediaobject = ET.Element('mediaobject')
            para.append(mediaobject)
            fill_inline(mediaobject, child)
            width = child.attrib.get('{%s}width' % SVG_NS)
            height = child.attrib.get('{%s}height' % SVG_NS)
            image = mediaobject.find('imageobject/imagedata')
            # a frame may hold something else than an image
            if image is not None:
                if width:
                    image.attrib['width'] = width
                if height:
                    image.attrib['depth'] = height
        elif child.tag == '{%s}image' % DRAW_NS:
            imageobject = ET.Element('imageobject')
            imagedata = ET.SubElement(imageobject, 'imagedata')
            fileref = child.attrib.get('{%s}href' % XLINK_NS)
            # the image may have no link to a file, leave fileref out then
            if fileref:
                imagedata.attrib['fileref'] = os.path.basename(fileref)
                if fileref.endswith('.jpg'):
                    imagedata.attrib['format'] = 'JPG'
                elif fileref.endswith('.png'):
                    imagedata.attrib['format'] = 'PNG'
            para.append(imageobject)
        elif child.tag == '{%s}text-box' % DRAW_NS:
            handle_text_box(para, child)
        else:
            if debug and child.tag not in (
                    '{%s}soft-page-break' % TEXT_NS,
                    '{%s}annotation' % OFFICE_NS):
                sys.stderr.write(
                        'W: got unknown %s in paragraph\n' % (child.tag,))
            append_remaining_text(para, child.tail)
def handle_paragraph(parent, elem):
    '''
    Convert a text:p element and add the result to parent.

    parent -- output element receiving the converted paragraph
    elem -- the text:p element

    Depending on the paragraph style the result is a <subtitle>, a
    <note><para>, the <title> of the preceding <table>, or a plain <para>.
    Paragraphs that end up empty are removed again. Properties of the
    automatic style (bold, italic...) are applied by wrapping the whole
    content in <emphasis> elements.
    '''
    orig_style = style = elem.attrib.get('{%s}style-name' % TEXT_NS)
    if 'parent' in STYLES.get(style, {}):
        style = STYLES.get(style).get('parent')
    orig_style_info = STYLES.get(orig_style, {})

    align = None
    if style == 'Para_20_Right':
        align = 'right'
    elif style == 'Para_20_Center':
        align = 'center'
    align = orig_style_info.get('align', align)

    invert_bg = False
    if 'background-color' in STYLES.get(style, {}).get('props', []) or \
            'background-color' in orig_style_info.get('props', []):
        # ignore elements with a background colour, they are used to
        # communicate between writers
        if not len(elem):
            return
        # do not abort yet, as the paragraph may contain text:span with the
        # background colour explicitely unset
        invert_bg = True

    table = None
    if style == 'SousTitre':
        para = ET.SubElement(parent, 'subtitle')
    elif style == 'Note':
        note = ET.SubElement(parent, 'note')
        para = ET.SubElement(note, 'para')
    elif style == 'Table':
        # title for previous table
        if len(parent):
            table = parent[-1]
        if table is not None and table.tag == 'table':
            title = ET.Element('title')
            table.insert(0, title)
            para = title
            # scan the children for a text:sequence/text:ref-name, to be
            # used as identifier
            for child in elem:
                if child.tag != '{%s}sequence' % TEXT_NS:
                    continue
                title.attrib['id'] = child.attrib['{%s}ref-name' % TEXT_NS]
        else:
            # uh oh, strange, should have been a table
            table = None
            para = ET.SubElement(parent, 'para')
    else:
        # simple paragraph
        para = ET.SubElement(parent, 'para')

    # NOTE(review): the copy this was restored from had lost its
    # indentation; the role assignments below are assumed to apply to every
    # kind of paragraph, not only to simple ones -- confirm.
    if align == 'center':
        para.attrib['role'] = 'center'
    if align == 'right':
        para.attrib['role'] = 'right'
    if style == 'Larttitre':
        para.attrib['role'] = 'legistic_manualarticle'
    if style == 'NoteTableDesMatieres':
        para.attrib['role'] = 'note_table_des_matieres'
    if align and parent.tag == 'entry':
        parent.attrib['role'] = align

    fill_inline(para, elem, invert_bg=invert_bg)

    if not para.text:
        para_text = None
    else:
        para_text = para.text.replace(' ', '').replace('\t', '').replace(u'\xa0', '')

    if not (para_text or para.tail or len(para)):
        # remove empty paragraphs
        if para in parent:
            parent.remove(para)
        elif style == 'Table' and table is not None:
            table.remove(para)
        return

    if orig_style_info.get('align'):
        para.attrib['align'] = orig_style_info.get('align')
    if orig_style_info.get('margin-left') == 'true':
        para.attrib['margin-left'] = 'true'
    if style == 'Table':
        # if it's a table, the title will start with "Tableau N:", but
        # it will be renumbered in LaTeX, so we remove that heading.
        # (the text is missing when the title starts with an inline element)
        if para.text and ':' in para.text:
            para.text = para.text[para.text.index(':')+1:].strip()
    if orig_style_info.get('props'):
        # the style change italic/bold status
        for p in orig_style_info.get('props'):
            if p == 'background-color':
                continue
            # move the whole content of the paragraph into a new emphasis
            # element, using the public API only
            t_emph = ET.Element('emphasis')
            t_emph.attrib['role'] = p
            t_emph.text, para.text = para.text, None
            t_emph.tail, para.tail = para.tail, None
            moved_children = list(para)
            for moved in moved_children:
                para.remove(moved)
            for moved in moved_children:
                t_emph.append(moved)
            para.append(t_emph)
def handle_list(parent, elem):
    '''
    Convert a text:list element and add the result to parent.

    parent -- output element receiving the converted list
    elem -- the text:list element

    Lists using the Paragraph_20_Numbering style are turned into normal
    paragraphs prefixed with a number taken from the global
    paragraph_numbering counter. Any other list becomes a <para> holding an
    <orderedlist> or an <itemizedlist>.

    A "parent" attribute is set on the list items and on their children
    before they are converted, so that a nested list (which carries no style
    name of its own) can walk back to the enclosing lists.
    '''
    global paragraph_numbering

    style_attr = '{%s}style-name' % TEXT_NS
    style = elem.attrib.get(style_attr)
    level = 1
    style_props = None
    if style is None:
        # means our parent was also a list; walk up the enclosing lists to
        # compute the nesting level and to find the list style
        ancestor = elem
        while True:
            try:
                gdparent = ancestor.parent.parent
            except AttributeError:
                # no parent recorded, this was the outermost list
                break
            if gdparent.tag != '{%s}list' % TEXT_NS:
                break
            level += 1
            ancestor = gdparent
            ancestorstyle = ancestor.attrib.get(style_attr)
            # an enclosing list may have no style name either; building the
            # lookup key from None would raise a TypeError
            if ancestorstyle is not None:
                style_props = STYLES.get('LIST:' + ancestorstyle)
    else:
        if 'parent' in STYLES.get(style, {}):
            style = STYLES.get(style).get('parent')
        style_props = STYLES.get('LIST:' + style)

    num_format = None
    bullet = None
    if style_props:
        level_props = style_props.get('levels', {}).get(level, {})
        num_format = level_props.get('format')
        bullet = level_props.get('bullet')

    if style == 'Paragraph_20_Numbering':
        # paragraph numbering is special, as we want it to get out as normal
        # paragraphs, but numbered
        continue_numbering = elem.attrib.get('{%s}continue-numbering' % TEXT_NS)
        if continue_numbering != 'true':
            paragraph_numbering = 0
        for item in elem.findall('{%s}list-item' % TEXT_NS):
            item.parent = elem
            for child in item:
                child.parent = item
                paragraph_numbering += 1
                child.text = (u'%s. ' % paragraph_numbering) + (child.text or u'')
                handle_elem(parent, child)
        return

    para = ET.SubElement(parent, 'para')
    if style in ('Liste_20_Alpha', 'AlphaList') or num_format == 'a':
        list_elem = ET.SubElement(para, 'orderedlist')
        list_elem.attrib['continuation'] = 'restarts'
        list_elem.attrib['numeration'] = 'loweralpha'
    elif style in ('Liste_20_Num', 'NumList') or num_format == '1':
        list_elem = ET.SubElement(para, 'orderedlist')
        list_elem.attrib['continuation'] = 'restarts'
        list_elem.attrib['numeration'] = 'arabic'
    else:
        list_elem = ET.SubElement(para, 'itemizedlist')

    for item in elem.findall('{%s}list-item' % TEXT_NS):
        item.parent = elem
        listitem = ET.SubElement(list_elem, 'listitem')
        if bullet and bullet != '-':
            listitem.attrib['bullet'] = bullet
        for child in item:
            child.parent = item
            handle_elem(listitem, child)
def handle_signature_table(parent, elem):
    '''
    Convert a table used to lay out signatures into an <informaltable>
    with two columns, added to parent.

    parent -- output element receiving the <informaltable>
    elem -- the table:table element

    Only the first text:p of each cell is considered; a cell without any
    text:p raises IndexError. Cells whose paragraph has neither text nor
    children are left out. When a single column remains, an empty first
    column is added to mark indentation.
    '''
    table_with_titles = False
    number_columns = 0
    # first pass: count the non-empty columns and look for italic cells
    for row in elem.findall('{%s}table-row' % TABLE_NS):
        cells = row.findall('{%s}table-cell' % TABLE_NS)
        # reset for every row, the count of the last row is the one kept
        number_columns = len(cells)
        for cell in cells:
            para = cell.findall('{%s}p' % TEXT_NS)[0]
            if para.text is None and not para.getchildren():
                number_columns -= 1
            style_name = para.attrib.get('{%s}style-name' % TEXT_NS)
            if style_name and 'props' in STYLES.get(style_name, {}):
                props = STYLES.get(style_name).get('props')
                if props and 'italic' in props:
                    table_with_titles = True
    table = ET.SubElement(parent, 'informaltable')
    tgroup = ET.SubElement(table, 'tgroup')
    tgroup.attrib['cols'] = '2'
    if number_columns == 1:
        # create a fake column, to mark indentation
        colspec = ET.SubElement(tgroup, 'colspec')
        colspec.attrib['colname'] = 'C1'
        colspec.attrib['colnum'] = '1'
        if table_with_titles:
            colspec.attrib['colwidth'] = '46pt'
        else:
            colspec.attrib['colwidth'] = '22pt'
        colspec = ET.SubElement(tgroup, 'colspec')
        colspec.attrib['colname'] = 'C2'
        colspec.attrib['colnum'] = '2'
        colspec.attrib['colwidth'] = '92.1pt'
    else:
        colspec = ET.SubElement(tgroup, 'colspec')
        colspec.attrib['colname'] = 'C1'
        colspec.attrib['colnum'] = '1'
        colspec.attrib['colwidth'] = '92.1pt'
        colspec = ET.SubElement(tgroup, 'colspec')
        colspec.attrib['colname'] = 'C2'
        colspec.attrib['colnum'] = '2'
        colspec.attrib['colwidth'] = '92.1pt'
    tbody = ET.SubElement(tgroup, 'tbody')
    # second pass: convert the rows
    for row in elem.findall('{%s}table-row' % TABLE_NS):
        trow = ET.SubElement(tbody, 'row')
        if number_columns == 1:
            # empty cell for first (fake) column
            tcell = ET.SubElement(trow, 'entry')
            tpara = ET.SubElement(tcell, 'para')
        for cell in row.findall('{%s}table-cell' % TABLE_NS):
            para = cell.findall('{%s}p' % TEXT_NS)[0]
            if para.text is None and not para.getchildren():
                continue
            tcell = ET.SubElement(trow, 'entry')
            tpara = ET.SubElement(tcell, 'para')
            style_name = para.attrib.get('{%s}style-name' % TEXT_NS)
            if style_name and 'props' in STYLES.get(style_name, {}):
                props = STYLES.get(style_name).get('props')
                if props and 'italic' in props:
                    # italic cells get their content wrapped in <emphasis>
                    t_emph = ET.SubElement(tpara, 'emphasis')
                    tpara = t_emph
            fill_inline(tpara, para)
    # add empty interline
    # NOTE(review): the copy this was restored from had lost its
    # indentation; the empty row is assumed to be added once, after all
    # rows, and not after each row -- confirm.
    trow = ET.SubElement(tbody, 'row')
    tcell = ET.SubElement(trow, 'entry')
    tpara = ET.SubElement(tcell, 'para')
    if number_columns == 1:
        tcell = ET.SubElement(trow, 'entry')
        tpara = ET.SubElement(tcell, 'para')
def _table_row_has_heading_style(row):
    '''Tell if a paragraph directly in a cell of row uses Table_20_Heading.'''
    for cell in row.findall('{%s}table-cell' % TABLE_NS):
        for cell_child in cell:
            if cell_child.tag != '{%s}p' % TEXT_NS:
                continue
            if cell_child.attrib.get('{%s}style-name' % TEXT_NS) == 'Table_20_Heading':
                return True
    return False


def _append_table_row(section, row, drop_align=False):
    '''Convert a table:table-row into a <row> added to section.'''
    trow = ET.SubElement(section, 'row')
    for i, cell in enumerate(row.findall('{%s}table-cell' % TABLE_NS)):
        tcell = ET.SubElement(trow, 'entry')
        for child in cell:
            handle_elem(tcell, child)
        if drop_align and len(tcell) and tcell[-1].attrib.get('align'):
            # remove align attribute on cell paragraphs
            del tcell[-1].attrib['align']
        columns_spanned = cell.attrib.get('{%s}number-columns-spanned' % TABLE_NS)
        if columns_spanned:
            tcell.attrib['namest'] = 'col%s' % (i+1)
            tcell.attrib['nameend'] = 'col%s' % (i+int(columns_spanned))


def handle_table(parent, elem):
    '''
    Convert a table:table element and add the result to parent.

    parent -- output element receiving the converted table
    elem -- the table:table element

    A table whose first cell starts with a paragraph using the Signature
    style (directly or as parent style) is handed over to
    handle_signature_table. Any other table becomes a <table> with a
    <tgroup> holding one <colspec> per column, an optional <thead> and a
    <tbody>.
    '''
    try:
        row = elem.findall('{%s}table-row' % TABLE_NS)[0]
        cell = row.findall('{%s}table-cell' % TABLE_NS)[0]
        p = cell.findall('{%s}p' % TEXT_NS)[0]
        p_style_name = p.attrib.get('{%s}style-name' % TEXT_NS)
        if p_style_name == 'Signature':
            return handle_signature_table(parent, elem)
        if STYLES.get(p_style_name) and STYLES[p_style_name].get('parent') == 'Signature':
            return handle_signature_table(parent, elem)
    except IndexError:
        # no first row, cell or paragraph; this also catches an IndexError
        # raised from handle_signature_table itself, the table is then
        # converted as a normal table
        pass

    table = ET.SubElement(parent, 'table')
    cols = elem.findall('{%s}table-columns/{%s}table-column' % (TABLE_NS, TABLE_NS))
    if not cols:
        cols = elem.findall('{%s}table-column' % TABLE_NS)
    nbcols = 0
    tgroup = ET.SubElement(table, 'tgroup')

    # count columns, and generate colspecs
    for c in cols:
        nb_new_cols = int(c.attrib.get('{%s}number-columns-repeated' % TABLE_NS, 1))
        style_name = c.attrib.get('{%s}style-name' % TABLE_NS)
        col_width = None
        if style_name and style_name in STYLES:
            col_width = STYLES.get(style_name).get('column-width')
        for i in range(nb_new_cols):
            # the alignment of the column is the one of the first aligned
            # paragraph found in its cells
            alignment = None
            offset = 0
            for row in elem.findall('{%s}table-row' % TABLE_NS):
                try:
                    cell = row.findall('{%s}table-cell' % TABLE_NS)[nbcols+i+offset]
                except IndexError:
                    # somehow it was impossible to get to that cell, ignore that
                    break
                columns_spanned = cell.attrib.get('{%s}number-columns-spanned' % TABLE_NS)
                if columns_spanned:
                    offset -= int(columns_spanned) + 1
                if not cell.findall('{%s}p' % TEXT_NS):
                    continue
                p = cell.findall('{%s}p' % TEXT_NS)[0]
                p_style_name = p.attrib.get('{%s}style-name' % TEXT_NS)
                if not p_style_name:
                    continue
                p_style = STYLES.get(p_style_name)
                if not p_style:
                    continue
                alignment = p_style.get('align')
                if alignment:
                    break
            colspec = ET.SubElement(tgroup, 'colspec')
            colspec.attrib['colnum'] = str(nbcols + i + 1)
            colspec.attrib['colname'] = 'col%s' % colspec.attrib['colnum']
            if col_width:
                colspec.attrib['colwidth'] = '%spt' % col_width
            if alignment:
                colspec.attrib['align'] = alignment
        nbcols += nb_new_cols
    tgroup.attrib['cols'] = '%s' % nbcols

    # pass over all colspecs to set a width if it was not set before
    for colspec in tgroup:
        if 'colwidth' not in colspec.attrib:
            # (A4 width in pts - some margin) / nbcols, as a whole number
            colspec.attrib['colwidth'] = '%spt' % (500 // nbcols)

    header_rows = elem.findall(
            '{%s}table-header-rows/{%s}table-row' % (TABLE_NS, TABLE_NS))
    if header_rows:
        thead = ET.SubElement(tgroup, 'thead')
        for row in header_rows:
            _append_table_row(thead, row)

    rows = elem.findall('{%s}table-rows/{%s}table-row' % (TABLE_NS, TABLE_NS))
    if not rows:
        rows = elem.findall('{%s}table-row' % TABLE_NS)
    if rows and _table_row_has_heading_style(rows[0]):
        # this is actually a title line
        thead = ET.SubElement(tgroup, 'thead')
        _append_table_row(thead, rows[0])
        rows = rows[1:]

    tbody = ET.SubElement(tgroup, 'tbody')
    for row in rows:
        _append_table_row(tbody, row, drop_align=True)
def handle_elem(parent, elem):
    '''
    Convert a block element (paragraph, list or table) by handing it over
    to the matching handler; other elements are ignored, with a warning
    when debugging is enabled.
    '''
    handlers = (
        ('{%s}p' % TEXT_NS, handle_paragraph),
        ('{%s}list' % TEXT_NS, handle_list),
        ('{%s}table' % TABLE_NS, handle_table),
    )
    for tag, handler in handlers:
        if elem.tag == tag:
            return handler(parent, elem)
    if debug:
        print >> sys.stderr, 'W: unhandled element:', elem.tag
def look_for_annotation(elem):
    '''
    Look for an annotation, in children and subchildren

    Direct children are checked first, then each child is searched in
    turn. Returns the first office:annotation element found, or None.
    '''
    annotation_tag = '{%s}annotation' % OFFICE_NS
    for child in elem:
        if child.tag == annotation_tag:
            return child
    for child in elem:
        speaker_annotation = look_for_annotation(child)
        # compare to None: an element without children is false, a plain
        # truth test would ignore an annotation that has no content
        if speaker_annotation is not None:
            return speaker_annotation
    return None
def convert(input_filename, output_filename):
    '''
    Convert a file from the OpenDocument Format to the legacy .legi format

    input_filename -- path of the OpenDocument file to read
    output_filename -- path of the .legi file to write

    The .legi file is a zip archive with the converted document as
    contents.xml and a copy of every file found under Pictures/ in the
    OpenDocument file, stored without their directory.

    Raises ValueError when the OpenDocument file has no content.xml.
    '''
    # rw-rw-r--, in the upper 16 bits where zip archives keep Unix modes
    file_attr = int('664', 8) << 16

    z = zipfile.ZipFile(input_filename)
    try:
        # get content.xml and meta.xml from file
        names = z.namelist()
        content = None
        metadata = None
        if 'content.xml' in names:
            content = z.read('content.xml')
        if 'meta.xml' in names:
            metadata = z.read('meta.xml')
        if content is None:
            raise ValueError('%s has no content.xml' % input_filename)

        legi = convert_to_legi_xml(content, metadata)
        if debug:
            sys.stdout.write(legi + '\n')

        # add XML prolog, necessary for some legacy Tabellio tools
        legi = '<?xml version="1.0"?>\n' + legi

        # write down content to the .legi file
        legiz = zipfile.ZipFile(output_filename, 'w')
        try:
            zi = zipfile.ZipInfo('contents.xml')
            zi.external_attr = file_attr
            legiz.writestr(zi, legi)

            # copy pictures to the .legi file
            for zfile in names:
                if not zfile.startswith('Pictures/'):
                    continue
                picture_name = os.path.basename(zfile)
                if not picture_name:
                    # entry for the directory itself, nothing to copy
                    continue
                zi = zipfile.ZipInfo(picture_name)
                zi.external_attr = file_attr
                legiz.writestr(zi, z.read(zfile))
        finally:
            legiz.close()
    finally:
        z.close()
def parse_automatic_styles(content_tree):
    '''
    Read the automatic styles of the document and record what the
    converter needs to know about them in the global STYLES dictionary.

    Styles are stored under their name, list styles under 'LIST:<name>'.
    '''
    neutral_backgrounds = (None, 'transparent', '#ffffff')

    style_path = '{%s}automatic-styles/{%s}style' % (OFFICE_NS, STYLE_NS)
    for style_elem in content_tree.findall(style_path):
        name = style_elem.attrib.get('{%s}name' % STYLE_NS)
        family = style_elem.attrib.get('{%s}family' % STYLE_NS)
        props = []
        entry = STYLES[name] = {}

        if family in ('paragraph', 'text'):
            # inline italic, bold, underline and background attributes
            for prop in style_elem.findall('{%s}text-properties' % STYLE_NS):
                attrs = prop.attrib
                if attrs.get('{%s}font-style' % FO_NS) == 'italic':
                    props.append('italic')
                if attrs.get('{%s}font-weight' % FO_NS) == 'bold':
                    props.append('bold')
                if attrs.get('{%s}text-underline-style' % STYLE_NS) == 'solid':
                    props.insert(0, 'underline')
                background = attrs.get('{%s}background-color' % FO_NS)
                if background:
                    if family == 'paragraph':
                        if background not in neutral_backgrounds:
                            props.append('background-color')
                    elif background in neutral_backgrounds:
                        props.append('no-background-color')
                    else:
                        props.append('background-color')
            # page break, alignment and left margin
            for prop in style_elem.findall('{%s}paragraph-properties' % STYLE_NS):
                attrs = prop.attrib
                if attrs.get('{%s}break-before' % FO_NS) == 'page':
                    entry['page-break'] = True
                text_align = attrs.get('{%s}text-align' % FO_NS)
                if text_align == 'center':
                    entry['align'] = 'center'
                elif text_align == 'end':
                    entry['align'] = 'right'
                if attrs.get('{%s}margin-left' % FO_NS, '0cm') != '0cm':
                    entry['margin-left'] = 'true'

        if family == 'table-column':
            for prop in style_elem.findall('{%s}table-column-properties' % STYLE_NS):
                column_width = prop.attrib.get('{%s}column-width' % STYLE_NS)
                if column_width:
                    entry['column-width'] = convert_to_pt(column_width)

        if props:
            entry['props'] = props
        parent_name = style_elem.attrib.get('{%s}parent-style-name' % STYLE_NS)
        if parent_name:
            entry['parent'] = parent_name

    # automatic list styles: numbering format and bullet character, by level
    list_path = '{%s}automatic-styles/{%s}list-style' % (OFFICE_NS, TEXT_NS)
    for list_style in content_tree.findall(list_path):
        style_name = 'LIST:%s' % list_style.attrib.get('{%s}name' % STYLE_NS)
        levels = {}
        STYLES[style_name] = {'levels': levels}
        for level in list_style.findall('{%s}list-level-style-number' % TEXT_NS):
            num_level = level.attrib.get('{%s}level' % TEXT_NS)
            num_format = level.attrib.get('{%s}num-format' % STYLE_NS)
            levels[int(num_level)] = {'format': num_format}
        for level in list_style.findall('{%s}list-level-style-bullet' % TEXT_NS):
            num_level = level.attrib.get('{%s}level' % TEXT_NS)
            bullet_char = level.attrib.get('{%s}bullet-char' % TEXT_NS)
            if not levels.get(int(num_level)):
                levels[int(num_level)] = {}
            levels[int(num_level)]['bullet'] = bullet_char
def convert_to_legi_xml(content, metadata = None):
    '''
    Convert a content.xml/metadata.xml pair from an odt file
    to the legi XML format.

    content -- content.xml, as a string
    metadata -- meta.xml, as a string, or None

    Returns the serialized <book> document, without XML prolog.

    The children of office:body/office:text are processed in order.
    Titles open new sections; current_top is the stack of open "normal"
    sections (book, part, chapter, sect1...) and current_legi the stack of
    open legistic sections. Other elements are converted into the
    innermost open section.
    '''
    global STYLES
    # create top <book> element
    legi = ET.Element('book')
    # parse content
    content_tree = ET.ElementTree(ET.fromstring(content))
    if metadata:
        metadata_tree = ET.ElementTree(ET.fromstring(metadata))
        # create child <metadata> element
        metadata_element = ET.SubElement(legi, 'metadata')
        create_metadata(metadata_element, metadata_tree, content_tree)
    current_top = [legi]    # stack of open normal sections, <book> first
    current_legi = []       # stack of open legistic sections, empty if none
    speech = None           # current <speech> element, if any
    offstructure = None     # <nosection> element for content before any title
    # depth of the normal sections
    levels = {'book':0, 'part':1, 'chapter':2, 'preface':2, 'synthese':2,
              'sect1':3, 'sect2':4, 'sect3':5, 'sect4':6}
    # depth of the legistic sections
    legistic_levels = {'legistic_part': 0,
                       'legistic_book': 1,
                       'legistic_title': 2,
                       'legistic_chapter':3,
                       'legistic_section':4,
                       'legistic_subsection':5}
    parse_automatic_styles(content_tree)
    # convert content
    for elem in content_tree.find('{%s}body/{%s}text' % (OFFICE_NS, OFFICE_NS)).getchildren():
        if elem.tag == '{%s}user-field-decls' % TEXT_NS:
            # user fields are handled as part of the metadata
            continue
        if elem.tag == '{%s}list' % TEXT_NS:
            # OOo tends to consider titles as lists, so its numbering works,
            # so look for a single paragraph item in the list, and get it out
            # of it if it exists with a title style name.
            style = elem.attrib.get('{%s}style-name' % TEXT_NS)
            if 'parent' in STYLES.get(style, {}):
                style = STYLES.get(style).get('parent')
            real_elem = elem.findall('{%s}list-item/{%s}p' % (TEXT_NS, TEXT_NS))
            if len(real_elem) == 1:
                real_elem = real_elem[0]
                style = real_elem.attrib.get('{%s}style-name' % TEXT_NS)
                if 'parent' in STYLES.get(style, {}):
                    style = STYLES.get(style).get('parent')
                if style in ('Lchapitre', 'Lsection', 'Lpart', 'LLivre', 'Ltitre'):
                    elem = real_elem
        # some paragraphs are out of title hierarchy but should nevertheless
        # be handled as title in the final output
        is_p_title = False
        if elem.tag == '{%s}p' % TEXT_NS:
            style = elem.attrib.get('{%s}style-name' % TEXT_NS)
            if STYLES.get(style, {}).get('page-break') is True:
                pb = ET.ProcessingInstruction('page-break')
                current_top[-1].append(pb)
            if 'parent' in STYLES.get(style, {}):
                style = STYLES.get(style).get('parent')
            if style in PARAGRAPH_TITLE_STYLES:
                new_level = PARAGRAPH_TITLE_STYLES.get(style)
                is_p_title = True
            # an annotation in the paragraph marks the start of a speech
            # NOTE(review): the copy this was restored from had lost its
            # indentation; the annotation lookup is assumed to be done for
            # paragraphs only -- confirm.
            speaker_annotation = look_for_annotation(elem)
            if speaker_annotation is not None:
                if speech: # there was a speech, pop it
                    current_top.pop()
                speech = ET.SubElement(current_top[-1], 'speech')
                current_top.append(speech)
                ref = ET.SubElement(speech, 'ref')
                # each "key: value" paragraph of the annotation becomes the
                # type attribute or a <param> child of <ref>
                for param in speaker_annotation.getchildren():
                    if param.tag != '{%s}p' % TEXT_NS:
                        continue
                    try:
                        # NOTE(review): with a maxsplit of 2 a value that
                        # itself contains ':' gives three parts, raises
                        # ValueError and is ignored below; a maxsplit of 1
                        # may have been intended -- confirm.
                        arg, value = tuple([x.strip() for x in param.text.split(':', 2)])
                        if arg == 'type':
                            ref.attrib['type'] = value
                        else:
                            legi_param = ET.SubElement(ref, 'param')
                            legi_param.attrib['name'] = arg
                            legi_param.text = value
                    except (AttributeError, ValueError):
                        # annotation paragraph was not "key: value"; consider
                        # it as a comment and ignore it.
                        pass
        # handle all titles
        if elem.tag == '{%s}h' % TEXT_NS or is_p_title:
            style = elem.attrib.get('{%s}style' % TEXT_NS)
            if not style:
                style = elem.attrib.get('{%s}style-name' % TEXT_NS)
            if 'parent' in STYLES.get(style, {}):
                style = STYLES.get(style).get('parent')
            if style in TITLE_LEVELS:
                new_level = TITLE_LEVELS.get(style)
            elif style == 'Sous-Titre':
                # Subtitles are out-of-hierarchy, just adding a <subtitle> node
                # under an existing <title> node; they have no content
                if current_legi:
                    subtitle = ET.SubElement(current_legi[-1], 'subtitle')
                else:
                    subtitle = ET.SubElement(current_top[-1], 'subtitle')
                fill_inline(subtitle, elem)
                continue
            else:
                # paragraph titles whose style is only known from
                # PARAGRAPH_TITLE_STYLES get here too, with new_level
                # already set above.
                # NOTE(review): for a text:h with an unknown style,
                # new_level keeps the value of a previous title, or is not
                # defined at all if there was none -- confirm this is wanted.
                if debug:
                    print >> sys.stderr, 'E: unknown heading style:', style
            if new_level.startswith('legistic_'):
                # title in a legistic part
                if current_legi:
                    # existing legistic part
                    current_level = current_legi[-1].tag
                    if legistic_levels[current_level] == legistic_levels[new_level]:
                        # same level: close the current section, open a sibling
                        current_legi.pop()
                        current_legi.append(ET.SubElement(current_legi[-1], new_level))
                    elif legistic_levels[current_level] > legistic_levels[new_level]:
                        # higher level: close one section per level of
                        # difference, plus the current one
                        current_legi.pop()
                        for i in range(legistic_levels[current_level] - legistic_levels[new_level]):
                            current_legi.pop()
                        current_legi.append(ET.SubElement(current_legi[-1], new_level))
                    elif legistic_levels[current_level] < legistic_levels[new_level]:
                        # deeper level: open a subsection
                        current_legi.append(ET.SubElement(current_legi[-1], new_level))
                else:
                    # new legistic part; the stack starts with the normal
                    # section it is attached to
                    current_legi.append(current_top[-1])
                    current_legi.append(ET.SubElement(current_top[-1], new_level))
            else:
                # title in a "normal" part
                # must first close current legistic part, if any
                if current_legi:
                    current_legi = []
                if speech: # there was a speech, pop it
                    current_top.pop()
                    speech = None
                current_level = current_top[-1].tag
                if levels[current_level] == levels[new_level]:
                    # same level: close the current section, open a sibling
                    current_top.pop()
                    current_top.append(ET.SubElement(current_top[-1], new_level))
                elif levels[current_level] > levels[new_level]:
                    # higher level: close one section per level of
                    # difference, plus the current one, never closing <book>
                    current_top.pop()
                    for i in range(levels[current_level] - levels[new_level]):
                        if len(current_top) == 1:
                            if debug:
                                print >> sys.stderr, 'W: would go too low'
                            break
                        current_top.pop()
                    current_top.append(ET.SubElement(current_top[-1], new_level))
                elif levels[current_level] < levels[new_level]:
                    # deeper level: open a subsection
                    current_top.append(ET.SubElement(current_top[-1], new_level))
            if current_legi:
                title = ET.SubElement(current_legi[-1], 'title')
            else:
                title = ET.SubElement(current_top[-1], 'title')
            # fill title with content
            fill_inline(title, elem)
            continue
        # handle other content
        if len(current_top) > 1 or current_legi:
            if current_legi:
                handle_elem(current_legi[-1], elem)
            else:
                handle_elem(current_top[-1], elem)
        else:
            # this is out of hierarchy, before any title, this should not be
            # authorized but people got used to do that for prefaces
            if len(current_top[0].getchildren()) == 1 and offstructure is None:
                offstructure = ET.SubElement(current_top[-1], 'nosection')
                handle_elem(offstructure, elem)
            elif len(current_top[0].getchildren()) == 2 and offstructure is not None:
                handle_elem(offstructure, elem)
            # NOTE(review): the copy this was restored from had lost its
            # indentation; this check is assumed to run for every element
            # handled here. offstructure is still None at this point when
            # <book> did not have exactly one child before the first
            # element (e.g. no metadata) -- confirm.
            if len(offstructure.getchildren()) > 0 and \
                    len(offstructure.getchildren()[0].getchildren()) == 1 and \
                    offstructure.getchildren()[0].text is None and \
                    offstructure.getchildren()[0].getchildren()[0].tag == 'footnote':
                # This is a special situation, title page with a footnote
                # (such as "Voir Doc. n°161 (2010-2011)."), the footnote
                # would be considered part of the content, and on next
                # conversion we would end with two footnotes, the one
                # created from the metadata, and the one created from this
                # <nosection> element.  Therefore we detect the situation
                # where the first item of a <nosection> is a footnote, and
                # clear it.
                offstructure.remove(offstructure.getchildren()[0])
    # get content as an XML tree
    out = StringIO()
    ET.ElementTree(legi).write(out)
    return out.getvalue()
def main():
    '''
    Command line entry point: odf2legi.py [--debug] INPUT [OUTPUT]

    When OUTPUT is not given, it is the name of the input file with its
    extension replaced by .legi. Exits with a usage error when no input
    file is given, or when the output file would be the input file.
    '''
    global debug
    parser = OptionParser(usage='%prog [--debug] INPUT [OUTPUT]')
    parser.add_option('--debug',
            action = 'store_true', dest = 'debug', default = False,
            help = 'display some output useful for debugging')
    options, args = parser.parse_args()
    debug = options.debug
    if not args:
        parser.error('missing input file')
    input_filename = args[0]
    if len(args) == 2:
        output_filename = args[1]
    else:
        # only the extension of the file is replaced; the directories of
        # the path are left alone, and a file with another extension than
        # .odt still gets a different output name
        output_filename = os.path.splitext(input_filename)[0] + '.legi'
    if output_filename == input_filename:
        parser.error('output file would overwrite input file')
    convert(input_filename, output_filename)
# run the command line interface when executed as a script
if __name__ == '__main__':
    main()