# tabularfile/tabularfile/ods.py

# tabularfile.ods - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import contextlib
import datetime
import io
from lxml import etree as ET
import re
import sys
import tempfile
import zipfile

from .common import TabularFileError, parse_date, parse_datetime


class ODSUnsupportedCellException(TabularFileError):
    '''Raised when a cell uses an ODS construct the reader does not support
       (covered cells, cells spanning several rows or columns).'''


class Namespace:
    '''Callable building qualified names in Clark notation, i.e.
       '{namespace-url}local-name', for a given XML namespace URL.'''

    def __init__(self, url):
        self.url = url

    def __call__(self, name):
        # '{url}name' is the form used for tags and attributes names
        template = '{%s}%s'
        qualified_name = template % (self.url, name)
        return qualified_name


# Qualified names ('{namespace}local-name') of the elements and attributes
# used by the reader and the writer below, grouped by XML namespace.

# text: namespace, paragraphs and hyperlinks inside cells
TEXT_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:text:1.0')
P = TEXT_NS('p')
A = TEXT_NS('a')

# table: namespace, structure of a sheet
TABLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:table:1.0')
TABLE = TABLE_NS('table')
TABLE_NAME = TABLE_NS('name')
COLUMN = TABLE_NS('table-column')
ROW = TABLE_NS('table-row')
CELL = TABLE_NS('table-cell')
COVERED_CELL = TABLE_NS('covered-table-cell')

# table: attributes for repeated and merged cells/rows, and cell style
NUMBER_COLUMNS_REPEATED = TABLE_NS('number-columns-repeated')
NUMBER_ROWS_REPEATED = TABLE_NS('number-rows-repeated')
NUMBER_COLUMNS_SPANNED = TABLE_NS('number-columns-spanned')
NUMBER_ROWS_SPANNED = TABLE_NS('number-rows-spanned')
TABLE_STYLE_NAME = TABLE_NS('style-name')

# office: namespace, document skeleton and typed cell values
OFFICE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:office:1.0')
DOCUMENT_CONTENT = OFFICE_NS('document-content')
BODY = OFFICE_NS('body')
SPREADSHEET = OFFICE_NS('spreadsheet')
VALUE_TYPE = OFFICE_NS('value-type')
VALUE = OFFICE_NS('value')
DATE_VALUE = OFFICE_NS('date-value')
DOCUMENT_STYLES = OFFICE_NS('document-styles')
FONT_FACE_DECLS = OFFICE_NS('font-face-decls')
STYLES = OFFICE_NS('styles')

# style: namespace, used to build styles.xml
# NOTE(review): the OpenDocument attribute for style inheritance appears to be
# named 'parent-style-name', not 'parent-style' - confirm against the
# specification before changing PARENT_STYLE.
STYLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:style:1.0')
STYLE = STYLE_NS('style')
STYLE_NAME = STYLE_NS('name')
FAMILY = STYLE_NS('family')
DATA_STYLE_NAME = STYLE_NS('data-style-name')
PARENT_STYLE = STYLE_NS('parent-style')
TABLE_COLUMN_PROPERTIES = STYLE_NS('table-column-properties')
COLUMN_WIDTH = STYLE_NS('column-width')

# number: (datastyle) namespace, components of a date format
NUMBER_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0')
DATE_STYLE = NUMBER_NS('date-style')
NUMBER_STYLE = NUMBER_NS('style')
YEAR = NUMBER_NS('year')
MONTH = NUMBER_NS('month')
DAY = NUMBER_NS('day')
HOURS = NUMBER_NS('hours')
MINUTES = NUMBER_NS('minutes')
SECONDS = NUMBER_NS('seconds')
TEXT = NUMBER_NS('text')

# xlink: namespace, target of hyperlinks
XLINK_NS = Namespace('http://www.w3.org/1999/xlink')
XLINK_HREF = XLINK_NS('href')

# NOTE(review): not referenced anywhere else in the visible part of this
# module - check for external users before removing.
VALUE_WITH_HREF_CLASS_REGISTRY = {}


def value_with_href(value, href=None):
    '''Wrap value in a LinkedValue when a non-empty href is given, otherwise
       return value unchanged.'''
    return LinkedValue(value, href) if href else value


def text_content(node):
    '''Return the concatenation of every text fragment found in node and its
       descendants, in document order. Equivalent to xmlNodeGetContent from
       libxml.'''
    fragments = node.itertext()
    return ''.join(fragments)


# office:value-type values that cell_content() converts to Python objects when
# typed content is requested; cells of any other type are returned as text.
TYPED_VALUE_TYPES = ['float', 'date']


class LinkedValue:
    '''A cell value associated with the target of an hyperlink.

    Two instances are equal when both their value and their href are equal;
    comparison with any other kind of object is false.
    '''

    def __init__(self, value, href):
        assert href, 'href is mandatory'

        self.href = href
        self.value = value

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.value == other.value and self.href == other.href
        return False

    def __repr__(self):
        return 'LinkedValue(%r, %r)' % (self.value, self.href)


def cell_content(elem, typed=False):
    '''Return the content of a table cell element.

    The textual content is made of the text of each text:p paragraph found in
    the cell, joined by newlines; it is an empty string if the cell contains
    no paragraph.

    If typed is true and the office:value-type of the cell is one of
    TYPED_VALUE_TYPES, the typed attribute of the cell is converted:
      - 'float' cells are returned as float, from office:value,
      - 'date' cells are returned as the result of parse_datetime() or, if
        it fails, parse_date(), from office:date-value.
    If the conversion raises ValueError the textual content is returned.
    '''
    # each paragraph contributes its own text only; using the text of the
    # whole cell here would duplicate the content of multi-paragraph cells
    text = '\n'.join(text_content(paragraph) for paragraph in elem.iter(tag=P))
    value_type = elem.attrib.get(VALUE_TYPE, 'string')

    if not typed or value_type not in TYPED_VALUE_TYPES:
        return text

    if value_type == 'float':
        try:
            return float(elem.attrib.get(VALUE, ''))
        except ValueError:
            pass
    elif value_type == 'date':
        value = elem.attrib.get(DATE_VALUE, '')
        # try the most precise representation first
        for parse in (parse_datetime, parse_date):
            try:
                return parse(value)
            except ValueError:
                continue
    # missing or unparsable typed attribute, fall back on the displayed text
    return text


def cell_repeat(elem):
    '''Return how many times a cell is repeated, from its
       table:number-columns-repeated attribute; a missing, non integer or
       lower than 1 value gives 1.'''
    if NUMBER_COLUMNS_REPEATED not in elem.attrib:
        return 1
    try:
        count = int(elem.attrib[NUMBER_COLUMNS_REPEATED])
    except ValueError:
        return 1
    return max(count, 1)


def row_repeat(elem):
    '''Return how many times a row is repeated, from its
       table:number-rows-repeated attribute; a missing, non integer or lower
       than 1 value gives 1.'''
    if NUMBER_ROWS_REPEATED not in elem.attrib:
        return 1
    try:
        count = int(elem.attrib[NUMBER_ROWS_REPEATED])
    except ValueError:
        return 1
    return max(count, 1)


class Reader:
    '''Iterate over the rows of one sheet of an ODS document.

    ods_content is either the bytes of the document or anything accepted by
    zipfile.ZipFile (path or file object). Iterating the reader parses
    content.xml incrementally and yields each row of the selected sheet as a
    list of cell values.

    Parameters:
      sheet -- index of the sheet to read, the first one is 0,
      typed -- if true, float and date cells are converted, see
               cell_content(),
      xlink -- if true, cells made of a single hyperlink are returned as
               LinkedValue objects.
    Other keyword arguments are accepted and ignored.
    '''

    def __init__(self, ods_content, sheet=0, typed=False, xlink=False, **kwargs):
        self.ods_content = ods_content
        self.sheet = sheet
        self.typed = typed
        self.xlink = xlink

    @contextlib.contextmanager
    def _zip(self):
        '''Context manager giving the document as an opened ZipFile.'''
        if isinstance(self.ods_content, bytes):
            fd = io.BytesIO(self.ods_content)
            with zipfile.ZipFile(fd) as _zip:
                yield _zip
        else:
            with zipfile.ZipFile(self.ods_content) as _zip:
                yield _zip

    @property
    def sheets(self):
        '''List of the sheets of the document: the table:name of each table
           or, when it has no name, its index.

        The list is computed on first access then kept on the instance, the
        same list object is returned by later accesses.
        '''
        # a property is a data descriptor, it always takes precedence over
        # the instance dictionary: the cached value must be fetched
        # explicitly or the document would be parsed on every access.
        cached = self.__dict__.get('sheets')
        if cached is not None:
            return cached

        sheets = []
        with self._parser_context(events=('start',), tag=TABLE) as context:
            for idx, (event, elem) in enumerate(context):
                sheets.append(elem.attrib.get(TABLE_NAME, idx))
        self.__dict__['sheets'] = sheets
        return sheets

    @contextlib.contextmanager
    def _parser_context(self,
                        events=('start', 'end'),
                        tag=(TABLE, ROW, CELL, COVERED_CELL),
                        **kwargs):
        '''Context manager giving an incremental parser (ET.iterparse) over
           content.xml, limited to the given events and tags; other keyword
           arguments are passed to ET.iterparse.'''
        with self._zip() as _zip:
            with _zip.open('content.xml', mode='r') as fd:
                yield ET.iterparse(fd, events=events, tag=tag, **kwargs)

    def __iter__(self):
        with self._parser_context() as context:
            yield from self._parse_target_sheet(context)

    def _parse_target_sheet(self, context):
        '''Skip tables until the one at index self.sheet, yield its rows then
           stop.'''
        idx = 0
        for event, elem in context:
            if event == 'start' and elem.tag == TABLE:
                if idx == self.sheet:
                    yield from self._parse_sheet(context)
                    break
                idx += 1
            # drop elements already seen to keep memory usage low
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    def _parse_sheet(self, context):
        '''Yield the rows of the current table, up to its closing tag; a
           repeated row is yielded as many times as it is repeated, each
           time as a new list.'''
        for event, elem in context:
            if event == 'end' and elem.tag == TABLE:
                elem.clear()
                break
            elif event == 'start' and elem.tag == ROW:
                # read the attribute first, _parse_row() clears the element
                repeat = row_repeat(elem)
                row = self._parse_row(context)
                for _ in range(repeat):
                    yield list(row)
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    def _parse_row(self, context):
        '''Consume events up to the end of the current row and return its
           cells as a list; trailing empty cells are not included.'''
        row = []
        # number of cells seen, counting repetitions; it is larger than
        # len(row) when the last cells seen were empty
        idx = 0

        for event, elem in context:
            if event == 'end' and elem.tag == ROW:
                elem.clear()
                parent = elem.getparent()
                if parent is not None:
                    parent.remove(elem)
                return row
            elif event == 'start' and elem.tag == CELL:
                # ignore last repeated empty cell
                content, repeat = self._parse_cell(context)
                # only the empty string denotes an empty cell, falsy typed
                # values like 0.0 are real content and must be kept
                if content != '':
                    if len(row) != idx:
                        # complete missing cells if previous cell was empty and repeated
                        row.extend([''] * (idx - len(row)))
                    row.extend([content] * repeat)
                idx += repeat
            else:
                elem.clear()

    def _parse_cell(self, context):
        '''Consume events up to the end of the current cell and return a
           (value, repeat) tuple.

        Raises ODSUnsupportedCellException if a covered cell is met before
        the end of the cell or if the cell spans several rows or columns.
        '''
        for event, elem in context:
            if event == 'end' and elem.tag == COVERED_CELL:
                raise ODSUnsupportedCellException('table:covered-table-cell is unsupported')
            if event == 'end' and elem.tag == CELL:
                if NUMBER_COLUMNS_SPANNED in elem.attrib or NUMBER_ROWS_SPANNED in elem.attrib:
                    raise ODSUnsupportedCellException(
                        'fusioned cells are unsupported '
                        '(table:number-rows-spanned and '
                        'table:number-columns-spanned attributes)'
                    )
                value = cell_content(elem, typed=self.typed)
                # text-cell:
                #   p:
                #     a:
                #       (text, href=...)
                if (self.xlink
                        and len(elem)
                        and elem[0].text is None
                        and len(elem[0]) == 1
                        and elem[0][0].tag == A):
                    value = value_with_href(value, href=elem[0][0].attrib.get(XLINK_HREF))
                return value, cell_repeat(elem)


def styles_xml_content(date_format, datetime_format):
    '''Return the serialized content of styles.xml, as returned by
       ET.tostring().

    Two cell styles are defined, 'Date' and 'DateTime', each one bound to a
    date style ('DateNumberFormat' and 'DateTimeNumberFormat') built from the
    strftime-like strings date_format and datetime_format.

    Supported directives are %Y, %m, %d, %H, %M and %S; '%%' gives a literal
    percent sign; anything else, unknown directives included, is kept as
    literal text.
    '''
    root = ET.Element(DOCUMENT_STYLES)
    styles = ET.SubElement(root, STYLES)

    # default style, inherited by other styles
    ET.SubElement(styles, STYLE, attrib={STYLE_NAME: 'Default'})

    # strftime directive -> element of the date style
    directives = {
        '%Y': YEAR,
        '%m': MONTH,
        '%d': DAY,
        '%H': HOURS,
        '%M': MINUTES,
        '%S': SECONDS,
    }

    def define_date_style(name, strftime_string):
        number_format_name = name + 'NumberFormat'
        date_style = ET.SubElement(styles, DATE_STYLE, attrib={STYLE_NAME: number_format_name})

        # split in directives ('%' followed by a character) and single characters
        for part in re.findall(r'%?.', strftime_string):
            tag = directives.get(part)
            if tag is not None:
                ET.SubElement(date_style, tag, attrib={NUMBER_STYLE: 'long'})
            else:
                # '%%' is the strftime escape sequence for a percent sign
                ET.SubElement(date_style, TEXT).text = '%' if part == '%%' else part
        ET.SubElement(
            styles,
            STYLE,
            attrib={
                STYLE_NAME: name,
                FAMILY: 'table-cell',
                DATA_STYLE_NAME: number_format_name,
                PARENT_STYLE: 'Default',
            },
        )

    define_date_style('Date', date_format)
    define_date_style('DateTime', datetime_format)
    return ET.tostring(root)


class Writer:
    '''Write an ODS document made of a single sheet.

    target is passed to zipfile.ZipFile opened in 'w' mode. date_format and
    datetime_format are strftime-like strings used to build the display
    styles of date and datetime cells, see styles_xml_content().

    Use open() as a context manager to get a helper exposing writerow(),
    writerows() and cell_writer.
    '''

    def __init__(self, target, date_format='%Y-%m-%d', datetime_format='%Y-%m-%dT%H:%M:%S'):
        self.target = target
        self.date_format = date_format
        self.datetime_format = datetime_format

    @contextlib.contextmanager
    def _zip(self):
        '''Context manager giving the target archive, with the mimetype,
           META-INF/manifest.xml and styles.xml entries already written.'''
        with zipfile.ZipFile(self.target, 'w') as _zip:
            _zip.writestr('mimetype', 'application/vnd.oasis.opendocument.spreadsheet')
            _zip.writestr(
                'META-INF/manifest.xml',
                '''<?xml version="1.0" encoding="UTF-8"?>
    <manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0">
     <manifest:file-entry manifest:full-path="/" manifest:media-type="application/vnd.oasis.opendocument.spreadsheet"/>
     <manifest:file-entry manifest:full-path="styles.xml" manifest:media-type="text/xml"/>
     <manifest:file-entry manifest:full-path="content.xml" manifest:media-type="text/xml"/>
     <manifest:file-entry manifest:full-path="META-INF/manifest.xml" manifest:media-type="text/xml"/>
     <manifest:file-entry manifest:full-path="mimetype" manifest:media-type="text/plain"/>
    </manifest:manifest>''',
            )
            _zip.writestr(
                'styles.xml',
                styles_xml_content(date_format=self.date_format, datetime_format=self.datetime_format),
            )
            yield _zip

    @contextlib.contextmanager
    def _serialization_context(self):
        '''Context manager giving an incremental XML writer (ET.xmlfile)
           whose output becomes the content.xml entry of the archive.'''
        with self._zip() as _zip:
            if sys.version_info >= (3, 6):
                with _zip.open('content.xml', mode='w') as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
            else:
                # we must use a temporary file before python 3.6
                with tempfile.NamedTemporaryFile() as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
                    fd.flush()
                    _zip.write(fd.name, 'content.xml')

    class WriterHelper:
        '''Write rows in the sheet through the incremental XML writer given
           as context.'''

        def __init__(self, context):
            self.context = context

        def writerow(self, row):
            '''Write a row, row is an iterable of cell values.

            Cell values are written according to their type:
              - float and int as 'float' cells,
              - datetime.datetime as 'date' cells with the 'DateTime' style,
              - datetime.date as 'date' cells with the 'Date' style,
              - anything else, booleans included, as 'string' cells
                containing str(value).
            A LinkedValue is written like its value, wrapped in an hyperlink
            to its href.
            '''
            def writecell(raw_value):

                attrib = {}

                if isinstance(raw_value, LinkedValue):
                    value = raw_value.value
                else:
                    value = raw_value

                # bool is a subclass of int but 'True' and 'False' are not
                # valid office:value numbers, booleans are written as strings
                if isinstance(value, (float, int)) and not isinstance(value, bool):
                    value_type = 'float'
                    text_value = str(value)
                    attrib[VALUE] = text_value
                # datetime.datetime is a subclass of datetime.date, it must be
                # tested first
                elif isinstance(value, datetime.datetime):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'DateTime'
                elif isinstance(value, datetime.date):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'Date'
                else:
                    value_type = 'string'
                    text_value = str(value)

                attrib[VALUE_TYPE] = value_type

                with self.context.element(CELL, attrib=attrib):
                    with self.context.element(P):
                        if isinstance(raw_value, LinkedValue):
                            with self.context.element(A, attrib={XLINK_HREF: raw_value.href}):
                                self.context.write(text_value)
                        else:
                            self.context.write(text_value)

            with self.context.element(ROW):
                for value in row:
                    writecell(value)

        def writerows(self, rows):
            '''Write each row of the iterable rows with writerow().'''
            for row in rows:
                self.writerow(row)

        @property
        @contextlib.contextmanager
        def cell_writer(self):
            '''Context manager to build a row cell by cell.

            It gives a write(value, href=None) function collecting cells; the
            row is written when the with block ends normally.
            '''
            row = []

            def write(value, href=None):
                if href:
                    raw_value = LinkedValue(value, href)
                else:
                    raw_value = value
                row.append(raw_value)

            yield write

            self.writerow(row)

    @contextlib.contextmanager
    def open(self):
        '''Context manager giving a WriterHelper positioned inside the table
           element of the document; the document is closed and the archive
           finalized when the context exits.'''
        with self._serialization_context() as context:
            with context.element(DOCUMENT_CONTENT):
                with context.element(BODY):
                    with context.element(SPREADSHEET):
                        with context.element(TABLE):
                            yield self.WriterHelper(context)


@contextlib.contextmanager
def writer(target, **kwargs):
    '''Context manager giving a helper to write rows in a new ODS document
       stored in target; keyword arguments are passed to Writer.'''
    ods_writer = Writer(target, **kwargs)
    with ods_writer.open() as helper:
        yield helper