tabellioOOo/odf2legi/odf2legi.py

#! /usr/bin/env python
# -*- coding: UTF-8 -*-

# TabellioOOo - OpenDocument to .legi converter
# Copyright (C) 2007-2010  Parlement de la Communauté française de Belgique
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA


from cStringIO import StringIO
from optparse import OptionParser
try:
    import xml.etree.ElementTree as ET
except ImportError:
    import elementtree.ElementTree as ET

import zipfile
import sys
import os

debug = False # when true, the handlers below print 'W: ...' warnings on stderr

# OpenDocument Format XML namespaces, used to build '{namespace}tag' names
# as expected by ElementTree
META_NS = 'urn:oasis:names:tc:opendocument:xmlns:meta:1.0'
DC_NS = 'http://purl.org/dc/elements/1.1/'
OFFICE_NS = 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'
TEXT_NS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
TABLE_NS = 'urn:oasis:names:tc:opendocument:xmlns:table:1.0'
STYLE_NS = 'urn:oasis:names:tc:opendocument:xmlns:style:1.0'
FO_NS = 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0'
DRAW_NS = 'urn:oasis:names:tc:opendocument:xmlns:drawing:1.0'
XLINK_NS = 'http://www.w3.org/1999/xlink'
SVG_NS = 'urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0'


# title (heading) style names and their respective .legi element; several
# spellings of a same style map to the same element (presumably '_20_' is
# the encoded form of a space in the style name -- TODO confirm)
TITLE_LEVELS = {
    'Partie': 'part',
    'Heading_20_1': 'part',
    'Chap': 'chapter',
    'Heading_20_2': 'chapter',
    'Sec 1': 'sect1',
    'Sec_20_1': 'sect1',
    'Heading_20_3': 'sect1',
    'Sec 1.1': 'sect2',
    'Sec_20_1.1': 'sect2',
    'Heading_20_4': 'sect2',
    'Sec 1.1.1': 'sect3',
    'Sec_20_1.1.1': 'sect3',
    'Heading_20_5': 'sect3',
    'Lchapitre': 'legistic_chapter',
    'Lsection': 'legistic_section',
    'LSous-Section': 'legistic_subsection',
    'Lpart': 'legistic_part',
    'Llivre': 'legistic_book',
    'LLivre': 'legistic_book',
    'Ltitre': 'legistic_title',
}

# titles disguised as paragraphs (paragraph style names) and their
# respective .legi elements
PARAGRAPH_TITLE_STYLES = {
    'TitrePreface': 'preface',
    'TitreSynthese': 'synthese',
    'Lchapitre': 'legistic_chapter',
    'Lpart': 'legistic_part',
    'LLivre': 'legistic_book',
    'Ltitre': 'legistic_title',
    'Lsection': 'legistic_section',
    'LSous-Section': 'legistic_subsection',
}


# dictionary with known styles, parsed from //office:automatic-styles; it maps
# a style name to a dictionary of properties; the handlers in this file read
# the 'props', 'parent', 'align', 'margin-left' and 'column-width' keys, and
# look up list styles under the 'LIST:' + name key (with a 'levels' entry)
STYLES = {}

# global counter for paragraph numbering, updated by handle_list() for lists
# using the 'Paragraph_20_Numbering' style
paragraph_numbering = 0

class SkipElement(Exception):
    '''
    Raised by an inline handler to tell its caller that the element being
    processed must be left out of the converted output.
    '''
    pass

def convert_to_pt(x):
    '''
    Convert a length string (ex: '2.5cm', '12pt') to a number of points.

    Supported units are 'cm', 'mm', 'in' and 'pt'; the millimetre and inch
    factors are derived from the centimetre factor historically used by this
    converter (28.45 points per centimetre), so all units stay consistent.

    Returns a float; raises NotImplementedError for any other unit and
    ValueError if the numeric part cannot be parsed.
    '''
    units = (
        ('cm', 28.45),
        ('mm', 2.845),          # 0.1 cm
        ('in', 2.54 * 28.45),   # 2.54 cm
        ('pt', 1),
    )
    value = x.strip()
    for suffix, factor in units:
        if value.endswith(suffix):
            return float(value[:-len(suffix)]) * factor
    raise NotImplementedError('unknown length unit in %r' % (x,))

def create_metadata(metadata_element, metadata_tree, content_tree):
    '''
    Create metadata section (/book/metadata), filled with data from
    OpenDocument meta.xml elements and content.xml user fields.

    One <property> child is appended to metadata_element for, in order: the
    document title, every text:user-field-decl found in content_tree, every
    non-empty meta:user-defined entry of metadata_tree, and the first
    keyword.  User fields and user-defined entries get type="custom"; a name
    that was already emitted is not emitted a second time.
    '''
    def add_property(name, text, custom=False):
        elem = ET.SubElement(metadata_element, 'property')
        elem.set('name', name)
        if custom:
            elem.set('type', 'custom')
        elem.text = text

    titles = metadata_tree.findall('{%s}meta/{%s}title' % (OFFICE_NS, DC_NS))
    if not titles:
        if debug:
            print >> sys.stderr, 'W: missing title'
    else:
        add_property('title', titles[0].text)

    done_fields = set()

    # './/' (descendants of the root) instead of the absolute '//' form,
    # which ElementTree deprecates on trees and refuses on elements
    for user_field in content_tree.findall('.//{%s}user-field-decl' % TEXT_NS):
        attr_name = user_field.attrib.get('{%s}name' % TEXT_NS)
        if attr_name in done_fields:
            continue
        done_fields.add(attr_name)
        add_property(attr_name,
                user_field.attrib.get('{%s}string-value' % OFFICE_NS),
                custom=True)

    for user_meta in metadata_tree.findall(
            '{%s}meta/{%s}user-defined' % (OFFICE_NS, META_NS)):
        content = user_meta.text
        if not content:
            continue
        attr_name = user_meta.attrib.get('{%s}name' % META_NS)
        if attr_name in done_fields:
            continue
        done_fields.add(attr_name)
        add_property(attr_name, content, custom=True)

    keywords = metadata_tree.findall('{%s}meta/{%s}keyword' % (OFFICE_NS, META_NS))
    if keywords:
        add_property('keyword', keywords[0].text)


def append_remaining_text(para, *texts):
    '''
    Append text at the current end of para.

    The non-empty strings of texts are concatenated (None and '' are
    ignored); the result is added to the tail of the last child of para if
    it has children, to para.text otherwise.  Nothing happens when there is
    no text to add.
    '''
    text = ''.join([x for x in texts if x])
    if not text:
        return
    # len() and indexing instead of the deprecated Element.getchildren()
    if len(para):
        last = para[-1]
        last.tail = (last.tail or '') + text
    else:
        para.text = (para.text or '') + text


def handle_little_span(para, child, invert_bg):
    '''
    Convert an inline element without children (text:span, text:a) and
    append the result to para.

    - unknown style (no 'props' entry in STYLES): text and tail are appended
      as plain text;
    - style with a background colour, or (when invert_bg is set) style that
      does not explicitly unset the background colour: the text is dropped,
      the tail is kept, and SkipElement is raised;
    - otherwise one <emphasis> element is created per style property, nested
      into each other, the innermost one holding the text.

    Note that a trailing space of child.text is moved to child.tail (the
    child element is modified in place).
    '''
    style_name = child.attrib.get('{%s}style-name' % TEXT_NS)
    style = STYLES.get(style_name, {})

    if 'props' not in style:
        if debug:
            print >> sys.stderr, 'W: unknown style name:', style_name
        append_remaining_text(para, child.text, child.tail)
        return

    # known style, pile up emphasis elements as legistic emphasis
    # element are limited to a single role
    props = style.get('props')
    if child.text and child.text.endswith(' '):
        # keep trailing whitespace out of the emphasis
        child.text = child.text.rstrip()
        if not child.tail:
            child.tail = ' '
        else:
            child.tail = ' ' + child.tail

    # elements with a background colour are ignored, they are used to
    # communicate between writers; in an "inverted" paragraph only the spans
    # that explicitly unset the background colour are kept
    if 'background-color' in props or (
            invert_bg and 'no-background-color' not in props):
        if child.tail:
            append_remaining_text(para, '', child.tail)
        raise SkipElement()

    outer_emph = None
    inner_emph = None
    for p in props:
        if p == 'no-background-color':
            continue
        t_emph = ET.Element('emphasis')
        t_emph.attrib['role'] = p
        if outer_emph is None:
            outer_emph = t_emph
        if inner_emph is not None:
            inner_emph.append(t_emph)
        inner_emph = t_emph

    if outer_emph is not None:
        para.append(outer_emph)
        inner_emph.text = child.text
        outer_emph.tail = child.tail
    else:
        # no emphasis to create; keep the tail too, it used to be lost here
        # while every other branch preserves it
        append_remaining_text(para, child.text, child.tail)


def handle_text_box(para, child):
    '''
    Convert a draw:text-box, used to attach a caption to a figure.

    The text-box is expected to have a text:p as only element; the frames
    of that paragraph are converted into para with fill_inline() and a
    <caption> element is appended to para, filled from the text following
    the text:sequence element(s) of the paragraph.

    An empty text-box is ignored.
    '''
    children = list(child)
    if len(children) > 1:
        if debug:
            print >> sys.stderr, 'W: text-box with more than one child'
    if not children:
        if debug:
            print >> sys.stderr, 'W: text-box without child'
        return

    textboxchild = children[0]
    if textboxchild.tag != '{%s}p' % TEXT_NS:
        if debug:
            print >> sys.stderr, 'W: text-box with non <p> child'
    else:
        # the <p> will have a <draw:frame> with the image, and tail
        # text with the caption
        for frame in textboxchild.findall('{%s}frame' % DRAW_NS):
            fill_inline(para, frame)
            image = para.find('imageobject/imagedata')
            if image is None:
                # frame without image, there is nothing to size
                continue
            width = frame.attrib.get('{%s}width' % SVG_NS)
            height = frame.attrib.get('{%s}height' % SVG_NS)
            if width:
                image.attrib['width'] = width
            if height:
                image.attrib['depth'] = height

    # scan for a text:sequence
    caption = ET.Element('caption')
    para.append(caption)
    for sequence in textboxchild:
        if sequence.tag != '{%s}sequence' % TEXT_NS:
            continue
        ref_name = sequence.attrib.get('{%s}ref-name' % TEXT_NS)
        if ref_name is not None:
            caption.attrib['id'] = ref_name
        # the caption text follows the sequence number, separated by a
        # colon that is not kept
        text = (sequence.tail or '').strip()
        if text.startswith(':'):
            text = text[1:].strip()
        paracaption = ET.SubElement(caption, 'para')
        paracaption.text = text


def fill_inline(para, elem, invert_bg=False):
    '''
    Fill a block element (para, title, etc.) with its inline elements (mostly
    emphasis, but also footnotes)

    para is the output element, elem the OpenDocument element whose text and
    children are converted; the tail of elem itself is not handled here.
    invert_bg is passed to handle_little_span() for text:span elements.
    '''
    span_tag = '{%s}span' % TEXT_NS
    note_tag = '{%s}note' % TEXT_NS

    if elem.text:
        # append (and not assign), as para may already hold text when this
        # function is called back for a nested element
        append_remaining_text(para, elem.text.strip('\n'))

    for child in elem:
        tag = child.tag
        children = list(child)

        if tag == span_tag and children:
            fill_inline(para, child, invert_bg=invert_bg)
            append_remaining_text(para, child.tail)

        elif tag == span_tag:
            try:
                handle_little_span(para, child, invert_bg=invert_bg)
            except SkipElement:
                continue

        elif tag == '{%s}a' % TEXT_NS:
            if len(children) == 1 and (
                    len(children[0]) and children[0][0].tag == note_tag):
                # footnote copy/pasted from a Microsoft Word document, all
                # elements get embedded in <text:a><text:span> tags; this calls
                # back fill_inline from those, to get straight to the footnote
                fill_inline(para, children[0])
                # text following the wrappers belongs to the paragraph
                append_remaining_text(para, children[0].tail, child.tail)
            elif len(children) == 1 and children[0].tag == note_tag:
                # another situation that happens with some copy/pasting from
                # Microsoft Word
                fill_inline(para, child)
                append_remaining_text(para, child.tail)
            else:
                try:
                    handle_little_span(para, child, False)
                except SkipElement:
                    # link with a background colour, already dealt with
                    continue

        elif tag == note_tag:
            footnote = ET.Element('footnote')
            para.append(footnote)
            note_body = child.find('{%s}note-body' % TEXT_NS)
            if note_body is not None:
                for foot_elem in note_body:
                    handle_elem(footnote, foot_elem)
            footnote.tail = child.tail

        elif tag == '{%s}sequence' % TEXT_NS:
            append_remaining_text(para, child.text, child.tail)

        elif tag == '{%s}s' % TEXT_NS:
            append_remaining_text(para, child.text, child.tail)

        elif tag == '{%s}sequence-ref' % TEXT_NS:
            xref = ET.Element('xref')
            xref.attrib['linkend'] = child.attrib.get('{%s}ref-name' % TEXT_NS)
            para.append(xref)
            xref.tail = child.tail

        elif tag == '{%s}line-break' % TEXT_NS:
            br = ET.ProcessingInstruction('line-break')
            para.append(br)
            br.tail = child.tail

        elif tag == '{%s}frame' % DRAW_NS:
            # NOTE(review): the text following the frame (child.tail) is not
            # copied to the output -- confirm this is intended
            mediaobject = ET.Element('mediaobject')
            para.append(mediaobject)
            fill_inline(mediaobject, child)
            image = mediaobject.find('imageobject/imagedata')
            if image is not None:
                # a frame may hold something else than an image
                width = child.attrib.get('{%s}width' % SVG_NS)
                height = child.attrib.get('{%s}height' % SVG_NS)
                if width:
                    image.attrib['width'] = width
                if height:
                    image.attrib['depth'] = height

        elif tag == '{%s}image' % DRAW_NS:
            fileref = child.attrib.get('{%s}href' % XLINK_NS)
            if fileref is None:
                # no file to reference
                if debug:
                    print >> sys.stderr, 'W: image without href'
                continue
            imageobject = ET.Element('imageobject')
            imagedata = ET.SubElement(imageobject, 'imagedata')
            imagedata.attrib['fileref'] = os.path.basename(fileref)
            if fileref.endswith('.jpg'):
                imagedata.attrib['format'] = 'JPG'
            elif fileref.endswith('.png'):
                imagedata.attrib['format'] = 'PNG'
            para.append(imageobject)

        elif tag == '{%s}text-box' % DRAW_NS:
            handle_text_box(para, child)

        else:
            if debug and tag not in (
                    '{%s}soft-page-break' % TEXT_NS,
                    '{%s}annotation' % OFFICE_NS):
                print >> sys.stderr, 'W: got unknown %s in paragraph' % tag
            append_remaining_text(para, child.tail)


def handle_paragraph(parent, elem):
    '''
    Convert a text:p element and add the result to parent.

    Depending on the paragraph style (or the parent of its automatic style)
    the output is a <subtitle>, a <note><para>, the <title> of the previous
    <table>, or a plain <para> with an optional role attribute.  Paragraphs
    that end up empty are removed again; otherwise alignment, left margin
    and style emphasis are applied.
    '''
    orig_style = style = elem.attrib.get('{%s}style-name' % TEXT_NS)
    if 'parent' in STYLES.get(style, {}):
        style = STYLES.get(style).get('parent')
    orig_info = STYLES.get(orig_style, {})

    align = None
    if style == 'Para_20_Right':
        align = 'right'
    elif style == 'Para_20_Center':
        align = 'center'
    align = orig_info.get('align', align)

    invert_bg = False
    if 'background-color' in STYLES.get(style, {}).get('props', []) or \
            'background-color' in orig_info.get('props', []):
        # ignore elements with a background colour, they are used to
        # communicate between writers
        if not len(elem):
            return
        # do not abort yet, as the paragraph may contain text:span with the
        # background colour explicitly unset
        invert_bg = True

    table = None # set when the paragraph is used as title of a table
    if style == 'SousTitre':
        para = ET.SubElement(parent, 'subtitle')
    elif style == 'Note':
        note = ET.SubElement(parent, 'note')
        para = ET.SubElement(note, 'para')
    elif style == 'Table':
        # title for previous table
        previous = None
        if len(parent):
            previous = parent[-1]
        if previous is not None and previous.tag == 'table':
            table = previous
            para = ET.Element('title')
            table.insert(0, para)
            # scan the children for a text:sequence/text:ref-name, to be
            # used as identifier
            for child in elem:
                if child.tag != '{%s}sequence' % TEXT_NS:
                    continue
                ref_name = child.attrib.get('{%s}ref-name' % TEXT_NS)
                if ref_name is not None:
                    para.attrib['id'] = ref_name
        else:
            # uh oh, strange, should have been a table
            para = ET.SubElement(parent, 'para')
    else:
        # simple paragraph
        para = ET.SubElement(parent, 'para')
        if align == 'center':
            para.attrib['role'] = 'center'
        if align == 'right':
            para.attrib['role'] = 'right'
        if style == 'Larttitre':
            para.attrib['role'] = 'legistic_manualarticle'
        if style == 'NoteTableDesMatieres':
            para.attrib['role'] = 'note_table_des_matieres'

        if align and parent.tag == 'entry':
            parent.attrib['role'] = align

    fill_inline(para, elem, invert_bg=invert_bg)
    if not para.text:
        para_text = None
    else:
        para_text = para.text.replace(' ', '').replace('\t', '').replace(u'\xa0', '')

    if not (para_text or para.tail or len(para)):
        # remove empty paragraphs
        if para in list(parent):
            parent.remove(para)
        elif table is not None:
            table.remove(para)
        return

    if orig_info.get('align'):
        para.attrib['align'] = orig_info.get('align')
    if orig_info.get('margin-left') == 'true':
        para.attrib['margin-left'] = 'true'
    if style == 'Table':
        # if it's a table, the title will start with "Tableau N:", but
        # it will be renumbered in LaTeX, so we remove that heading.
        # (the title may have no leading text at all)
        if para.text and ':' in para.text:
            para.text = para.text[para.text.index(':')+1:].strip()

    if orig_info.get('props'):
        # the style change italic/bold status: wrap the whole content of the
        # paragraph in one emphasis element per property
        for p in orig_info.get('props'):
            if p == 'background-color':
                continue
            t_emph = ET.Element('emphasis')
            t_emph.attrib['role'] = p
            t_emph.text, para.text = para.text, None
            t_emph.tail, para.tail = para.tail, None
            # move the children with the public API, the private _children
            # attribute only exists in the pure Python implementation
            for moved in list(para):
                para.remove(moved)
                t_emph.append(moved)
            para.append(t_emph)

def handle_list(parent, elem):
    '''
    Convert a text:list element and add the result to parent.

    Lists using the 'Paragraph_20_Numbering' style are output as normal
    paragraphs prefixed with a running number (kept in the global
    paragraph_numbering counter); other lists are output as a <para>
    holding an <orderedlist> or <itemizedlist>.

    A 'parent' attribute is set on the list items and their children, it is
    how a nested list (that carries no style name) finds its ancestor lists.
    NOTE(review): setting that attribute requires an Element implementation
    accepting arbitrary attributes.
    '''
    global paragraph_numbering

    style_attr = '{%s}style-name' % TEXT_NS
    item_tag = '{%s}list-item' % TEXT_NS

    style = elem.attrib.get(style_attr)
    level = 1
    style_props = None
    if style is None:
        # means our parent was also a list; walk up to the outermost list,
        # counting the nesting level
        ancestor = elem
        while True:
            try:
                gdparent = ancestor.parent.parent
            except AttributeError:
                break
            if gdparent.tag != '{%s}list' % TEXT_NS:
                break
            level += 1
            ancestor = gdparent
        ancestorstyle = ancestor.attrib.get(style_attr)
        if ancestorstyle is not None:
            # (no styled ancestor was found otherwise, go without properties)
            style_props = STYLES.get('LIST:' + ancestorstyle)
    else:
        if 'parent' in STYLES.get(style, {}):
            style = STYLES.get(style).get('parent')
        style_props = STYLES.get('LIST:' + style)

    num_format = None
    bullet = None
    if style_props:
        level_props = style_props.get('levels', {}).get(level, {})
        num_format = level_props.get('format')
        bullet = level_props.get('bullet')

    if style == 'Paragraph_20_Numbering':
        # paragraph numbering is special, as we want it to get out as normal
        # paragraphs, but numbered
        continue_numbering = elem.attrib.get('{%s}continue-numbering' % TEXT_NS)
        if continue_numbering != 'true':
            paragraph_numbering = 0
        for item in elem.findall(item_tag):
            item.parent = elem
            for child in item:
                child.parent = item
                paragraph_numbering += 1
                if child.text is None:
                    child.text = u'%s. ' % paragraph_numbering
                else:
                    child.text = (u'%s. ' % paragraph_numbering) + child.text
                handle_elem(parent, child)
        return

    para = ET.SubElement(parent, 'para')

    if style in ('Liste_20_Alpha', 'AlphaList') or num_format == 'a':
        list_elem = ET.SubElement(para, 'orderedlist')
        list_elem.attrib['continuation'] = 'restarts'
        list_elem.attrib['numeration'] = 'loweralpha'
    elif style in ('Liste_20_Num', 'NumList') or num_format == '1':
        list_elem = ET.SubElement(para, 'orderedlist')
        list_elem.attrib['continuation'] = 'restarts'
        list_elem.attrib['numeration'] = 'arabic'
    else:
        list_elem = ET.SubElement(para, 'itemizedlist')

    for item in elem.findall(item_tag):
        item.parent = elem
        listitem = ET.SubElement(list_elem, 'listitem')
        if bullet and bullet != '-':
            listitem.attrib['bullet'] = bullet
        for child in item:
            child.parent = item
            handle_elem(listitem, child)


def handle_signature_table(parent, elem):
    '''
    Convert a table of signatures into a two-column <informaltable>.

    Only the first paragraph of each cell is considered; cells whose first
    paragraph is empty (or that have no paragraph at all) are left out.
    When the (last) row holds a single non-empty cell, a narrow empty first
    column is added to mark indentation.  An empty row is added after each
    row, as interline.
    '''
    row_tag = '{%s}table-row' % TABLE_NS
    cell_tag = '{%s}table-cell' % TABLE_NS

    def first_paragraph(cell):
        paragraphs = cell.findall('{%s}p' % TEXT_NS)
        if paragraphs:
            return paragraphs[0]
        return None

    def is_empty(para):
        return para is None or (para.text is None and not len(para))

    def is_italic(para):
        style_name = para.attrib.get('{%s}style-name' % TEXT_NS)
        if style_name and 'props' in STYLES.get(style_name, {}):
            props = STYLES.get(style_name).get('props')
            if props and 'italic' in props:
                return True
        return False

    def add_colspec(tgroup, number, width):
        colspec = ET.SubElement(tgroup, 'colspec')
        colspec.attrib['colname'] = 'C%s' % number
        colspec.attrib['colnum'] = '%s' % number
        colspec.attrib['colwidth'] = width

    def add_empty_entry(trow):
        tcell = ET.SubElement(trow, 'entry')
        ET.SubElement(tcell, 'para')

    table_with_titles = False
    number_columns = 0
    for row in elem.findall(row_tag):
        cells = row.findall(cell_tag)
        # number of non-empty cells; the value for the last row is the one
        # that is used below
        number_columns = len(cells)
        for cell in cells:
            para = first_paragraph(cell)
            if is_empty(para):
                number_columns -= 1
            if para is not None and is_italic(para):
                table_with_titles = True

    table = ET.SubElement(parent, 'informaltable')
    tgroup = ET.SubElement(table, 'tgroup')
    tgroup.attrib['cols'] = '2'

    if number_columns == 1:
        # create a fake column, to mark indentation
        if table_with_titles:
            add_colspec(tgroup, 1, '46pt')
        else:
            add_colspec(tgroup, 1, '22pt')
    else:
        add_colspec(tgroup, 1, '92.1pt')
    add_colspec(tgroup, 2, '92.1pt')

    tbody = ET.SubElement(tgroup, 'tbody')
    for row in elem.findall(row_tag):
        trow = ET.SubElement(tbody, 'row')
        if number_columns == 1:
            # empty cell for first (fake) column
            add_empty_entry(trow)

        for cell in row.findall(cell_tag):
            para = first_paragraph(cell)
            if is_empty(para):
                continue
            tcell = ET.SubElement(trow, 'entry')
            tpara = ET.SubElement(tcell, 'para')
            if is_italic(para):
                tpara = ET.SubElement(tpara, 'emphasis')
            fill_inline(tpara, para)

        # add empty interline
        trow = ET.SubElement(tbody, 'row')
        add_empty_entry(trow)
        if number_columns == 1:
            add_empty_entry(trow)


def _table_convert_row(section, row, strip_align=False):
    '''Add a <row> to section (thead or tbody), with an <entry> per table-cell.'''
    cell_tag = '{%s}table-cell' % TABLE_NS
    covered_tag = '{%s}covered-table-cell' % TABLE_NS

    trow = ET.SubElement(section, 'row')
    column = 0 # number of columns already used on this row
    for cell in row:
        if cell.tag == covered_tag:
            # position covered by a spanning cell: no entry, but it does
            # use a column
            column += int(cell.attrib.get(
                '{%s}number-columns-repeated' % TABLE_NS, 1))
            continue
        if cell.tag != cell_tag:
            continue

        tcell = ET.SubElement(trow, 'entry')
        for child in cell:
            handle_elem(tcell, child)
            if strip_align and len(tcell) and tcell[-1].attrib.get('align'):
                # remove align attribute on cell paragraphs
                del tcell[-1].attrib['align']

        columns_spanned = cell.attrib.get('{%s}number-columns-spanned' % TABLE_NS)
        if columns_spanned:
            tcell.attrib['namest'] = 'col%s' % (column + 1)
            tcell.attrib['nameend'] = 'col%s' % (column + int(columns_spanned))
        # the other columns of a span are counted with the covered cells
        column += 1


def _table_is_heading_row(row):
    '''Tell if a cell of row has a paragraph using the Table_20_Heading style.'''
    for cell in row.findall('{%s}table-cell' % TABLE_NS):
        for block in cell:
            if block.tag != '{%s}p' % TEXT_NS:
                continue
            if block.attrib.get('{%s}style-name' % TEXT_NS) == 'Table_20_Heading':
                return True
    return False


def handle_table(parent, elem):
    '''
    Convert a table:table element and add the result to parent.

    Tables whose first paragraph uses the 'Signature' style (directly or as
    parent style) are handed over to handle_signature_table().  Other tables
    are output as <table><tgroup> with one <colspec> per column, an optional
    <thead> (header rows, or a first row using the 'Table_20_Heading'
    style) and a <tbody>.
    '''
    row_tag = '{%s}table-row' % TABLE_NS
    cell_tag = '{%s}table-cell' % TABLE_NS
    para_tag = '{%s}p' % TEXT_NS
    para_style_attr = '{%s}style-name' % TEXT_NS
    spanned_attr = '{%s}number-columns-spanned' % TABLE_NS

    try:
        row = elem.findall(row_tag)[0]
        cell = row.findall(cell_tag)[0]
        p = cell.findall(para_tag)[0]
        p_style_name = p.attrib.get(para_style_attr)
        if p_style_name == 'Signature':
            return handle_signature_table(parent, elem)
        if STYLES.get(p_style_name) and STYLES[p_style_name].get('parent') == 'Signature':
            return handle_signature_table(parent, elem)
    except IndexError:
        pass

    table = ET.SubElement(parent, 'table')
    cols = elem.findall('{%s}table-columns/{%s}table-column' % (TABLE_NS, TABLE_NS))
    if not cols:
        cols = elem.findall('{%s}table-column' % TABLE_NS)
    nbcols = 0
    tgroup = ET.SubElement(table, 'tgroup')

    # cells of the rows that are direct children of the table, they are
    # scanned to guess the alignment of each column
    rows_cells = [x.findall(cell_tag) for x in elem.findall(row_tag)]

    # count columns, and generate colspecs
    for c in cols:
        nb_new_cols = int(c.attrib.get('{%s}number-columns-repeated' % TABLE_NS, 1))
        style_name = c.attrib.get('{%s}style-name' % TABLE_NS)
        col_width = None
        if style_name and style_name in STYLES:
            col_width = STYLES.get(style_name).get('column-width')

        for i in range(nb_new_cols):
            # alignment of the column is the alignment of the first aligned
            # paragraph found going down the column
            alignment = None
            offset = 0
            for cells in rows_cells:
                try:
                    cell = cells[nbcols+i+offset]
                except IndexError:
                    # somehow it was impossible to get to that cell, ignore that
                    break
                columns_spanned = cell.attrib.get(spanned_attr)
                if columns_spanned:
                    offset -= int(columns_spanned) + 1
                paragraphs = cell.findall(para_tag)
                if not paragraphs:
                    continue
                p_style_name = paragraphs[0].attrib.get(para_style_attr)
                if not p_style_name:
                    continue
                p_style = STYLES.get(p_style_name)
                if not p_style:
                    continue
                alignment = p_style.get('align')
                if alignment:
                    break

            colspec = ET.SubElement(tgroup, 'colspec')
            colspec.attrib['colnum'] = str(nbcols + i + 1)
            colspec.attrib['colname'] = 'col%s' % colspec.attrib['colnum']
            if col_width:
                colspec.attrib['colwidth'] = '%spt' % col_width
            if alignment:
                colspec.attrib['align'] = alignment
        nbcols += nb_new_cols
    tgroup.attrib['cols'] = '%s' % nbcols

    # pass over all colspecs to set a width if it was not set before
    for colspec in list(tgroup):
        if 'colwidth' not in colspec.attrib:
            # (A4 width in pts - some margin) / nbcols
            colspec.attrib['colwidth'] = '%spt' % (500 // nbcols)

    header_rows = elem.findall(
            '{%s}table-header-rows/{%s}table-row' % (TABLE_NS, TABLE_NS))
    if header_rows:
        thead = ET.SubElement(tgroup, 'thead')
        for row in header_rows:
            _table_convert_row(thead, row)

    rows = elem.findall('{%s}table-rows/{%s}table-row' % (TABLE_NS, TABLE_NS))
    if not rows:
        rows = elem.findall(row_tag)

    if rows and _table_is_heading_row(rows[0]):
        # this is actually a title line
        thead = ET.SubElement(tgroup, 'thead')
        _table_convert_row(thead, rows[0])
        rows = rows[1:]

    tbody = ET.SubElement(tgroup, 'tbody')
    for row in rows:
        _table_convert_row(tbody, row, strip_align=True)

    return


def handle_elem(parent, elem):
    '''
    Convert a block element by calling the handler matching its tag:
    text:p, text:list or table:table.  Other elements are ignored (with a
    warning on stderr in debug mode).
    '''
    tag = elem.tag

    if tag == '{%s}p' % TEXT_NS:
        handler = handle_paragraph
    elif tag == '{%s}list' % TEXT_NS:
        handler = handle_list
    elif tag == '{%s}table' % TABLE_NS:
        handler = handle_table
    else:
        handler = None

    if handler is not None:
        return handler(parent, elem)

    if debug:
        print >> sys.stderr, 'W: unhandled element:', elem.tag


def look_for_annotation(elem):
    '''
    Look for an annotation, in children and subchildren

    Returns the first office:annotation element found, looking at the direct
    children of elem first, then at the subtree of each child in turn;
    returns None if there is no annotation.
    '''
    annotation_tag = '{%s}annotation' % OFFICE_NS

    for child in elem:
        if child.tag == annotation_tag:
            return child

    for child in elem:
        speaker_annotation = look_for_annotation(child)
        # explicit None test: an element without children is false in a
        # boolean context and would be missed
        if speaker_annotation is not None:
            return speaker_annotation

    return None


def convert(input_filename, output_filename):
    '''
    Convert a file from the OpenDocument Format to the legacy .legi format
    '''

    # get content.xml and meta.xml from file
    z = zipfile.ZipFile(input_filename)
    content = None
    metadata = None
    for zfile in z.namelist():
        if zfile == 'content.xml':
            content = z.read(zfile)
        elif zfile == 'meta.xml':
            metadata = z.read(zfile)

        if content and metadata:
            break

    legi = convert_to_legi_xml(content, metadata)
    if debug:
        print legi

    # add XML prolog, necessary for some legacy Tabellio tools
    legi = '<?xml version="1.0"?>\n' + legi

    # write down content to the .legi file
    legiz = zipfile.ZipFile(output_filename, 'w')
    zi = zipfile.ZipInfo('contents.xml')
    zi.external_attr = 0664 << 16L
    legiz.writestr(zi, legi)

    # copy pictures to the .legi file
    for zfile in z.namelist():
        if not zfile.startswith('Pictures/'):
            continue
        zi = zipfile.ZipInfo(os.path.basename(zfile))
        zi.external_attr = 0664 << 16L
        legiz.writestr(zi, z.read(zfile))
    legiz.close()

def parse_automatic_styles(content_tree):
    '''
    Record the automatic styles of a document in the global STYLES
    dictionary.

    Every <style:style> found under <office:automatic-styles> gets an
    entry keyed by its name; the entry may hold:

     - 'props': list of inline properties ('underline', 'italic', 'bold',
       'background-color', 'no-background-color'), for paragraph and text
       styles
     - 'page-break', 'align', 'margin-left': paragraph layout hints
     - 'column-width': width converted by convert_to_pt(), for table
       column styles
     - 'parent': name of the parent style

    Every <text:list-style> gets an entry keyed by 'LIST:<name>', with a
    'levels' dictionary mapping the level number to its number format
    and/or bullet character.
    '''
    style_ns = '{%s}' % STYLE_NS
    fo_ns = '{%s}' % FO_NS
    text_ns = '{%s}' % TEXT_NS
    automatic_styles = '{%s}automatic-styles/' % OFFICE_NS

    # background values that stand for "no visible background"
    blank_backgrounds = ('transparent', '#ffffff')
    # fo:text-align values that are recorded, and the name they get
    alignments = {'center': 'center', 'end': 'right'}

    for style_elem in content_tree.findall(automatic_styles + style_ns + 'style'):
        family = style_elem.get(style_ns + 'family')
        entry = STYLES[style_elem.get(style_ns + 'name')] = {}
        flags = []

        if family in ('paragraph', 'text'):
            # inline italic, bold, underline and background attributes
            for text_props in style_elem.findall(style_ns + 'text-properties'):
                if text_props.get(fo_ns + 'font-style') == 'italic':
                    flags.append('italic')
                if text_props.get(fo_ns + 'font-weight') == 'bold':
                    flags.append('bold')
                if text_props.get(style_ns + 'text-underline-style') == 'solid':
                    # underline always goes first
                    flags.insert(0, 'underline')

                background = text_props.get(fo_ns + 'background-color')
                if not background:
                    continue
                if background not in blank_backgrounds:
                    flags.append('background-color')
                elif family != 'paragraph':
                    # only text styles record an explicit reset
                    flags.append('no-background-color')

            for para_props in style_elem.findall(style_ns + 'paragraph-properties'):
                if para_props.get(fo_ns + 'break-before') == 'page':
                    entry['page-break'] = True

                alignment = alignments.get(para_props.get(fo_ns + 'text-align'))
                if alignment:
                    entry['align'] = alignment

                if para_props.get(fo_ns + 'margin-left', '0cm') != '0cm':
                    entry['margin-left'] = 'true'

        elif family == 'table-column':
            for column_props in style_elem.findall(style_ns + 'table-column-properties'):
                width = column_props.get(style_ns + 'column-width')
                if width:
                    entry['column-width'] = convert_to_pt(width)

        if flags:
            entry['props'] = flags

        parent = style_elem.get(style_ns + 'parent-style-name')
        if parent:
            entry['parent'] = parent

    # automatic list styles
    for list_style in content_tree.findall(automatic_styles + text_ns + 'list-style'):
        levels = {}
        STYLES['LIST:%s' % list_style.get(style_ns + 'name')] = {'levels': levels}

        for numbered in list_style.findall(text_ns + 'list-level-style-number'):
            depth = int(numbered.get(text_ns + 'level'))
            levels[depth] = {'format': numbered.get(style_ns + 'num-format')}

        for bullet in list_style.findall(text_ns + 'list-level-style-bullet'):
            depth = int(bullet.get(text_ns + 'level'))
            if not levels.get(depth):
                levels[depth] = {}
            levels[depth]['bullet'] = bullet.get(text_ns + 'bullet-char')


def convert_to_legi_xml(content, metadata = None):
    '''
    Convert a content.xml/metadata.xml pair from an odt file
    to the legi XML format.

    content is the XML source of content.xml; metadata is the XML source
    of meta.xml, or None, in which case no <metadata> element is created.

    The children of <office:body>/<office:text> are walked in order:
    headings (and paragraphs using a title style) open new structural
    elements under the top <book> element, everything else is handed to
    handle_elem() and attached to the innermost open element.

    Returns the serialized <book> tree, as written by ElementTree.write()
    (without XML prolog).
    '''

    # STYLES is only read in this function; it is filled by the call to
    # parse_automatic_styles() below
    global STYLES

    # create top <book> element
    legi = ET.Element('book')

    # parse content
    content_tree = ET.ElementTree(ET.fromstring(content))

    if metadata:
        metadata_tree = ET.ElementTree(ET.fromstring(metadata))
        # create child <metadata> element
        metadata_element = ET.SubElement(legi, 'metadata')
        create_metadata(metadata_element, metadata_tree, content_tree)

    # stack of currently open "normal" structural elements, starting with
    # <book>; the last item is where new content is attached
    current_top = [legi]
    # stack of currently open legistic elements; when not empty, its first
    # item is the element the legistic part was started in
    current_legi = []
    # current <speech> element (also pushed on current_top), if any
    speech = None
    # <nosection> element holding the content found before any title
    offstructure = None

    # nesting depth of the structural elements, used to know how many
    # elements must be closed when a new title is met
    levels = {'book':0, 'part':1, 'chapter':2, 'preface':2, 'synthese':2, 'sect1':3, 'sect2':4, 'sect3':5}
    legistic_levels = {'legistic_part': 0,
                       'legistic_book': 1,
                       'legistic_title': 2,
                       'legistic_chapter':3,
                       'legistic_section':4,
                       'legistic_subsection':5}

    parse_automatic_styles(content_tree)

    # convert content
    for elem in content_tree.find('{%s}body/{%s}text' % (OFFICE_NS, OFFICE_NS)).getchildren():

        if elem.tag == '{%s}user-field-decls' % TEXT_NS:
            # user fields are handled as part of the metadata
            continue

        if elem.tag == '{%s}list' % TEXT_NS:
            # OOo tends to consider titles as lists, so its numbering works,
            # so look for a single paragraph item in the list, and get it out
            # of it if it exists with a title style name.
            style = elem.attrib.get('{%s}style-name' % TEXT_NS)
            if 'parent' in STYLES.get(style, {}):
                style = STYLES.get(style).get('parent')
            real_elem = elem.findall('{%s}list-item/{%s}p' % (TEXT_NS, TEXT_NS))
            if len(real_elem) == 1:
                real_elem = real_elem[0]
                style = real_elem.attrib.get('{%s}style-name' % TEXT_NS)
                if 'parent' in STYLES.get(style, {}):
                    style = STYLES.get(style).get('parent')
                if style in ('Lchapitre', 'Lsection', 'Lpart', 'LLivre', 'Ltitre'):
                    # from now on the paragraph is processed instead of
                    # the list that wraps it
                    elem = real_elem

        # some paragraphs are out of title hierarchy but should nevertheless
        # be handled as title in the final output
        is_p_title = False
        if elem.tag == '{%s}p' % TEXT_NS:
            style = elem.attrib.get('{%s}style-name' % TEXT_NS)

            # the page break is looked up on the automatic style itself,
            # before it gets replaced by its parent style
            if STYLES.get(style, {}).get('page-break') is True:
                pb = ET.ProcessingInstruction('page-break')
                current_top[-1].append(pb)

            if 'parent' in STYLES.get(style, {}):
                style = STYLES.get(style).get('parent')


            if style in PARAGRAPH_TITLE_STYLES:
                new_level = PARAGRAPH_TITLE_STYLES.get(style)
                is_p_title = True

            # an annotation in a paragraph starts a new <speech>, described
            # by the "key: value" paragraphs of the annotation
            speaker_annotation = look_for_annotation(elem)
            if speaker_annotation is not None:
                if speech: # there was a speech, pop it
                    current_top.pop()
                speech = ET.SubElement(current_top[-1], 'speech')
                current_top.append(speech)
                ref = ET.SubElement(speech, 'ref')
                for param in speaker_annotation.getchildren():
                    if param.tag != '{%s}p' % TEXT_NS:
                        continue
                    try:
                        # NOTE(review): with a maxsplit of 2 a paragraph with
                        # two or more colons gives three parts, the unpacking
                        # raises ValueError and the paragraph is ignored;
                        # confirm this is wanted (maxsplit of 1 would keep
                        # values containing a colon).
                        arg, value = tuple([x.strip() for x in param.text.split(':', 2)])
                        if arg == 'type':
                            ref.attrib['type'] = value
                        else:
                            legi_param = ET.SubElement(ref, 'param')
                            legi_param.attrib['name'] = arg
                            legi_param.text = value
                    except (AttributeError, ValueError):
                        # annotation paragraph was not "key: value"; consider
                        # it as a comment and ignore it.
                        pass

        # handle all titles
        if elem.tag == '{%s}h' % TEXT_NS or is_p_title:
            # the text:style attribute is tried first, then text:style-name
            style = elem.attrib.get('{%s}style' % TEXT_NS)
            if not style:
                style = elem.attrib.get('{%s}style-name' % TEXT_NS)

            if 'parent' in STYLES.get(style, {}):
                style = STYLES.get(style).get('parent')

            if style in TITLE_LEVELS:
                new_level = TITLE_LEVELS.get(style)
            elif style == 'Sous-Titre':
                # Subtitles are out-of-hierarchy, just adding a <subtitle> node
                # under an existing <title> node; they have no content
                if current_legi:
                    subtitle = ET.SubElement(current_legi[-1], 'subtitle')
                else:
                    subtitle = ET.SubElement(current_top[-1], 'subtitle')
                fill_inline(subtitle, elem)
                continue
            else:
                # NOTE(review): execution carries on after this branch, so
                # new_level is either the value left by a previous element
                # or, if none was set yet, unbound (the next line would then
                # raise); confirm how unknown heading styles should be
                # handled.
                if debug:
                    print >> sys.stderr, 'E: unknown heading style:', style

            if new_level.startswith('legistic_'):
                # title in a legistic part
                if current_legi:
                    # existing legistic part
                    current_level = current_legi[-1].tag
                    if legistic_levels[current_level] == legistic_levels[new_level]:
                        # same level: close the current element, open a
                        # sibling
                        current_legi.pop()
                        current_legi.append(ET.SubElement(current_legi[-1], new_level))
                    elif legistic_levels[current_level] > legistic_levels[new_level]:
                        # higher level: close the current element and one
                        # more element for each level of difference
                        current_legi.pop()
                        for i in range(legistic_levels[current_level] - legistic_levels[new_level]):
                            current_legi.pop()
                        current_legi.append(ET.SubElement(current_legi[-1], new_level))
                    elif legistic_levels[current_level] < legistic_levels[new_level]:
                        # deeper level: open a child of the current element
                        current_legi.append(ET.SubElement(current_legi[-1], new_level))
                else:
                    # new legistic part
                    current_legi.append(current_top[-1])
                    current_legi.append(ET.SubElement(current_top[-1], new_level))
            else:
                # title in a "normal" part

                # must first close current legistic part, if any
                if current_legi:
                    current_legi = []

                if speech: # there was a speech, pop it
                    current_top.pop()
                    speech = None

                current_level = current_top[-1].tag
                if levels[current_level] == levels[new_level]:
                    # same level: close the current element, open a sibling
                    current_top.pop()
                    current_top.append(ET.SubElement(current_top[-1], new_level))
                elif levels[current_level] > levels[new_level]:
                    # higher level: close elements until the parent of the
                    # new element is reached, never removing <book> itself
                    current_top.pop()
                    for i in range(levels[current_level] - levels[new_level]):
                        if len(current_top) == 1:
                            if debug:
                                print >> sys.stderr, 'W: would go too low'
                            break
                        current_top.pop()
                    current_top.append(ET.SubElement(current_top[-1], new_level))
                elif levels[current_level] < levels[new_level]:
                    # deeper level: open a child of the current element
                    current_top.append(ET.SubElement(current_top[-1], new_level))

            if current_legi:
                title = ET.SubElement(current_legi[-1], 'title')
            else:
                title = ET.SubElement(current_top[-1], 'title')

            # fill title with content
            fill_inline(title, elem)
            continue

        # handle other content
        if len(current_top) > 1 or current_legi:
            if current_legi:
                handle_elem(current_legi[-1], elem)
            else:
                handle_elem(current_top[-1], elem)
        else:
            # this is out of hierarchy, before any title, this should not be
            # authorized but people got used to do that for prefaces
            #
            # NOTE(review): the child counts below presumably stand for
            # "<book> only holds <metadata>" and "<book> holds <metadata>
            # and <nosection>"; if so nothing is kept when no metadata was
            # given. Also, the element that triggers the creation of
            # <nosection> is not passed to handle_elem() itself. Confirm
            # both are intended.
            if len(current_top[0].getchildren()) == 1 and offstructure is None:
                offstructure = ET.SubElement(current_top[-1], 'nosection')
            elif len(current_top[0].getchildren()) == 2 and offstructure is not None:
                handle_elem(offstructure, elem)
                if len(offstructure.getchildren()) > 0 and \
                   len(offstructure.getchildren()[0].getchildren()) == 1 and \
                   offstructure.getchildren()[0].text is None and \
                   offstructure.getchildren()[0].getchildren()[0].tag == 'footnote':
                    # This is a special situation, title page with a footnote
                    # (such as "Voir Doc. n°161 (2010-2011)."), the footnote
                    # would be considered part of the content, and on next
                    # conversion we would end with two footnotes, the one
                    # created from the metadata, and the one created from this
                    # <nosection> element. Therefore we detect the situation
                    # where the first item of a <nosection> is a footnote, and
                    # clear it.
                    offstructure.remove(offstructure.getchildren()[0])

    # get content as an XML tree
    out = StringIO()
    ET.ElementTree(legi).write(out)

    return out.getvalue()


def _default_output_filename(input_filename):
    '''Return the .legi filename matching an OpenDocument filename'''
    root, ext = os.path.splitext(input_filename)
    if ext.lower() == '.odt':
        return root + '.legi'
    # unexpected extension: append .legi, so the output file can never be
    # the input file itself
    return input_filename + '.legi'


def main():
    '''
    Command line entry point

    Usage: odf2legi.py [--debug] INPUT.odt [OUTPUT.legi]

    When the output filename is not given it is derived from the input
    filename, replacing its .odt extension by .legi.  The --debug option
    sets the module-level debug flag.
    '''
    global debug

    parser = OptionParser(usage = '%prog [options] INPUT.odt [OUTPUT.legi]')
    parser.add_option('--debug',
        action = 'store_true', dest = 'debug',
        help = 'display some output useful for debugging')
    options, args = parser.parse_args()

    if not args:
        # print the usage and exit, instead of failing with an IndexError
        parser.error('missing input file')

    debug = options.debug

    if len(args) == 2:
        convert(args[0], args[1])
    else:
        # only the extension is replaced; a plain str.replace() would also
        # alter directory names containing ".odt", and would give back the
        # input filename for any other extension
        convert(args[0], _default_output_filename(args[0]))


# run the command line interface only when the file is executed as a
# script, not when it is imported as a module
if __name__ == '__main__':
    main()