This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
tabularfile/tabularfile/ods.py

455 lines
15 KiB
Python

# tabularfile.ods - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import contextlib
import datetime
import io
from lxml import etree as ET
import re
import sys
import tempfile
import zipfile
from .common import TabularFileError, parse_date, parse_datetime
class ODSUnsupportedCellException(TabularFileError):
    '''Error raised by the reader for cells it cannot represent: covered
    cells and cells spanning several rows or columns.'''
class Namespace:
    '''Build qualified XML names in Clark notation ("{url}name") for a
    given namespace URL.'''

    def __init__(self, url):
        # Namespace URI prepended to every name built by this instance.
        self.url = url

    def __call__(self, name):
        '''Return `name` qualified with this namespace.'''
        qualified = '{%s}%s' % (self.url, name)
        return qualified
# Qualified element and attribute names used to read and write ODS documents.
# Each *_NS object is a Namespace; calling it yields the "{url}name" form
# expected by lxml.

# text: paragraphs and hyperlinks inside cells
TEXT_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:text:1.0')
P = TEXT_NS('p')
A = TEXT_NS('a')

# table: sheets, rows, cells and their repetition/span attributes
TABLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:table:1.0')
TABLE = TABLE_NS('table')
TABLE_NAME = TABLE_NS('name')
COLUMN = TABLE_NS('table-column')
ROW = TABLE_NS('table-row')
CELL = TABLE_NS('table-cell')
COVERED_CELL = TABLE_NS('covered-table-cell')
NUMBER_COLUMNS_REPEATED = TABLE_NS('number-columns-repeated')
NUMBER_ROWS_REPEATED = TABLE_NS('number-rows-repeated')
NUMBER_COLUMNS_SPANNED = TABLE_NS('number-columns-spanned')
NUMBER_ROWS_SPANNED = TABLE_NS('number-rows-spanned')
TABLE_STYLE_NAME = TABLE_NS('style-name')

# office: document skeleton and typed cell values
OFFICE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:office:1.0')
DOCUMENT_CONTENT = OFFICE_NS('document-content')
BODY = OFFICE_NS('body')
SPREADSHEET = OFFICE_NS('spreadsheet')
VALUE_TYPE = OFFICE_NS('value-type')
VALUE = OFFICE_NS('value')
DATE_VALUE = OFFICE_NS('date-value')
DOCUMENT_STYLES = OFFICE_NS('document-styles')
FONT_FACE_DECLS = OFFICE_NS('font-face-decls')
STYLES = OFFICE_NS('styles')

# style: cell and column styles
STYLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:style:1.0')
STYLE = STYLE_NS('style')
STYLE_NAME = STYLE_NS('name')
FAMILY = STYLE_NS('family')
DATA_STYLE_NAME = STYLE_NS('data-style-name')
# The OpenDocument attribute naming a style's parent is
# style:parent-style-name; "style:parent-style" does not exist in the schema
# and is ignored by consumers.
PARENT_STYLE = STYLE_NS('parent-style-name')
TABLE_COLUMN_PROPERTIES = STYLE_NS('table-column-properties')
COLUMN_WIDTH = STYLE_NS('column-width')

# number: data styles describing how dates are displayed
NUMBER_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0')
DATE_STYLE = NUMBER_NS('date-style')
NUMBER_STYLE = NUMBER_NS('style')
YEAR = NUMBER_NS('year')
MONTH = NUMBER_NS('month')
DAY = NUMBER_NS('day')
HOURS = NUMBER_NS('hours')
MINUTES = NUMBER_NS('minutes')
SECONDS = NUMBER_NS('seconds')
TEXT = NUMBER_NS('text')

# xlink: hyperlink targets
XLINK_NS = Namespace('http://www.w3.org/1999/xlink')
XLINK_HREF = XLINK_NS('href')

# NOTE(review): not referenced anywhere in this module; kept in case other
# code imports it.
VALUE_WITH_HREF_CLASS_REGISTRY = {}
def value_with_href(value, href=None):
    '''Return `value` unchanged when `href` is empty or missing, otherwise
    wrap both in a LinkedValue.'''
    return LinkedValue(value, href) if href else value
def text_content(node):
    '''Return the concatenation, in document order, of every text fragment
    found in node and its descendants. Equivalent to xmlNodeGetContent
    from libxml.'''
    fragments = node.itertext()
    return ''.join(fragments)
# Values of the office:value-type attribute that cell_content() tries to
# convert to Python objects when typed reading is requested; cells of any
# other type are returned as text.
TYPED_VALUE_TYPES = ['float', 'date']
class LinkedValue(object):
    '''Cell value paired with the hyperlink target attached to it.'''

    def __init__(self, value, href):
        assert href, 'href is mandatory'
        self.href = href
        self.value = value

    def __eq__(self, other):
        '''Two linked values are equal when they are of the same class and
        both their value and href match.'''
        if isinstance(other, self.__class__):
            return self.value == other.value and self.href == other.href
        return False

    def __repr__(self):
        template = 'LinkedValue({self.value!r}, {self.href!r})'
        return template.format(self=self)
def cell_content(elem, typed=False):
    '''Return the content of a table cell element.

    The textual content is the text of every text:p paragraph found under
    `elem`, joined with newlines.

    When `typed` is true and the cell's office:value-type is listed in
    TYPED_VALUE_TYPES, the typed attribute is converted instead:
    office:value to a float, office:date-value through parse_datetime()
    then parse_date(). If the conversion raises ValueError the textual
    content is returned.
    '''
    # Extract the text of each paragraph separately; using the whole cell
    # here would repeat the full cell text once per paragraph.
    text = '\n'.join(text_content(paragraph) for paragraph in elem.iter(tag=P))

    value_type = elem.attrib.get(VALUE_TYPE, 'string')
    if not typed or value_type not in TYPED_VALUE_TYPES:
        return text

    if value_type == 'float':
        try:
            return float(elem.attrib.get(VALUE, ''))
        except ValueError:
            pass
    elif value_type == 'date':
        value = elem.attrib.get(DATE_VALUE, '')
        # Try the most specific representation first.
        for parse in (parse_datetime, parse_date):
            try:
                return parse(value)
            except ValueError:
                continue
    # Missing or unparsable typed attribute: fall back to the displayed text.
    return text
def cell_repeat(elem):
    '''Return how many times the cell `elem` is repeated, from its
    table:number-columns-repeated attribute; 1 when the attribute is
    missing, not an integer or lower than 1.'''
    try:
        count = int(elem.attrib[NUMBER_COLUMNS_REPEATED])
    except (KeyError, ValueError):
        return 1
    return max(count, 1)
def row_repeat(elem):
    '''Return how many times the row `elem` is repeated, from its
    table:number-rows-repeated attribute; 1 when the attribute is missing,
    not an integer or lower than 1.'''
    try:
        count = int(elem.attrib[NUMBER_ROWS_REPEATED])
    except (KeyError, ValueError):
        return 1
    return max(count, 1)
class Reader:
    '''Streaming reader for one sheet of an ODS document.

    Iterating over a Reader yields the rows of the selected sheet as lists
    of cell values. content.xml is parsed incrementally with
    lxml's iterparse and elements are cleared as soon as they have been
    consumed.

    Parameters:
        ods_content: the ODS archive, either as bytes or as anything
            accepted by zipfile.ZipFile.
        sheet: zero-based index of the sheet to read.
        typed: when true, cells are converted by cell_content() according
            to their value type instead of being returned as text.
        xlink: when true, a cell whose only paragraph content is a single
            hyperlink is returned as a LinkedValue.
        **kwargs: accepted and ignored.

    Raises (while iterating):
        ODSUnsupportedCellException: on merged cells.
    '''

    def __init__(self, ods_content, sheet=0, typed=False, xlink=False, **kwargs):
        self.ods_content = ods_content
        self.sheet = sheet
        self.typed = typed
        self.xlink = xlink

    @contextlib.contextmanager
    def _zip(self):
        '''Open the ODS archive and yield the zipfile.ZipFile object.'''
        if isinstance(self.ods_content, bytes):
            fd = io.BytesIO(self.ods_content)
            with zipfile.ZipFile(fd) as _zip:
                yield _zip
        else:
            with zipfile.ZipFile(self.ods_content) as _zip:
                yield _zip

    @property
    def sheets(self):
        '''List of the sheets of the document: the table:name of each sheet,
        or its index when it has no name. Computed on first access.'''
        # A property takes precedence over the instance dictionary, so the
        # cached list must be looked up explicitly.
        cached = self.__dict__.get('sheets')
        if cached is not None:
            return cached

        sheets = []
        with self._parser_context(events=('start',), tag=TABLE) as context:
            for idx, (_event, elem) in enumerate(context):
                sheets.append(elem.attrib.get(TABLE_NAME, idx))
        self.__dict__['sheets'] = sheets
        return sheets

    @contextlib.contextmanager
    def _parser_context(self,
                        events=('start', 'end'),
                        tag=(TABLE, ROW, CELL, COVERED_CELL),
                        **kwargs):
        '''Yield an iterparse iterator over content.xml restricted to the
        given events and tags; the archive stays open for the duration of
        the context.'''
        with self._zip() as _zip:
            with _zip.open('content.xml', mode='r') as fd:
                yield ET.iterparse(fd, events=events, tag=tag, **kwargs)

    def __iter__(self):
        with self._parser_context() as context:
            yield from self._parse_target_sheet(context)

    def _parse_target_sheet(self, context):
        '''Skip events until the sheet at index self.sheet starts, then
        yield its rows.'''
        idx = 0
        for event, elem in context:
            if event == 'start' and elem.tag == TABLE:
                if idx == self.sheet:
                    yield from self._parse_sheet(context)
                    break
                idx += 1
            # Drop everything belonging to skipped sheets to keep memory low.
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    def _parse_sheet(self, context):
        '''Yield the rows of the current sheet until its end tag; a row
        carrying a repetition count is yielded that many times, each time
        as a new list.'''
        for event, elem in context:
            if event == 'end' and elem.tag == TABLE:
                elem.clear()
                break
            elif event == 'start' and elem.tag == ROW:
                repeat = row_repeat(elem)
                row = self._parse_row(context)
                for _ in range(repeat):
                    yield list(row)
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    def _parse_row(self, context):
        '''Consume events up to the end of the current row and return its
        cell values. Empty cells are only materialized (as '') when a
        non-empty cell follows them, so trailing empty cells are dropped.'''
        row = []
        # Column position reached so far, counting empty cells not yet
        # appended to row.
        idx = 0
        for event, elem in context:
            if event == 'end' and elem.tag == ROW:
                elem.clear()
                parent = elem.getparent()
                if parent is not None:
                    parent.remove(elem)
                return row
            elif event == 'start' and elem.tag == CELL:
                # ignore last repeated empty cell
                content, repeat = self._parse_cell(context)
                # Only the empty string means "empty cell": a typed numeric
                # zero is falsy but is real content and must be kept.
                if content != '':
                    if len(row) != idx:
                        # complete missing cells if previous cell was empty and repeated
                        row.extend([''] * (idx - len(row)))
                    row.extend([content] * repeat)
                idx += repeat
            else:
                elem.clear()

    def _parse_cell(self, context):
        '''Consume events up to the end of the current cell and return a
        (value, repeat) tuple.

        Raises ODSUnsupportedCellException if a covered cell ends before
        the current cell does, or if the cell spans several rows or
        columns.
        '''
        for event, elem in context:
            if event == 'end' and elem.tag == COVERED_CELL:
                raise ODSUnsupportedCellException('table:covered-table-cell is unsupported')
            if event == 'end' and elem.tag == CELL:
                if NUMBER_COLUMNS_SPANNED in elem.attrib or NUMBER_ROWS_SPANNED in elem.attrib:
                    raise ODSUnsupportedCellException(
                        'fusioned cells are unsupported '
                        '(table:number-rows-spanned and '
                        'table:number-columns-spanned attributes)'
                    )
                value = cell_content(elem, typed=self.typed)
                # text-cell:
                #   p:
                #     a:
                #       (text, href=...)
                # The first child must have no text of its own and exactly
                # one child, which must be a link.
                if (self.xlink
                        and len(elem)
                        and elem[0].text is None
                        and len(elem[0]) == 1
                        and elem[0][0].tag == A):
                    value = value_with_href(value, href=elem[0][0].attrib.get(XLINK_HREF))
                return value, cell_repeat(elem)
def styles_xml_content(date_format, datetime_format):
    '''Return the serialized styles.xml document declaring a 'Default'
    style plus the 'Date' and 'DateTime' cell styles, whose display formats
    are derived from the given strftime-like format strings.

    Only %Y, %m, %d, %H, %M and %S are translated to date-style parts; any
    other character or %-sequence is emitted as literal text.
    '''
    document = ET.Element(DOCUMENT_STYLES)
    styles = ET.SubElement(document, STYLES)
    # default style, inherited by other styles
    ET.SubElement(styles, STYLE, attrib={STYLE_NAME: 'Default'})

    # strftime directive -> date-style part element
    directive_tags = {
        '%Y': YEAR,
        '%m': MONTH,
        '%d': DAY,
        '%H': HOURS,
        '%M': MINUTES,
        '%S': SECONDS,
    }

    def define_date_style(name, strftime_string):
        number_format_name = name + 'NumberFormat'
        date_style = ET.SubElement(styles, DATE_STYLE, attrib={STYLE_NAME: number_format_name})
        for token in re.findall(r'%?.', strftime_string):
            tag = directive_tags.get(token)
            if tag is None:
                ET.SubElement(date_style, TEXT).text = token
            else:
                ET.SubElement(date_style, tag, attrib={NUMBER_STYLE: 'long'})
        # cell style binding the data style declared above
        cell_style_attrib = {
            STYLE_NAME: name,
            FAMILY: 'table-cell',
            DATA_STYLE_NAME: number_format_name,
            PARENT_STYLE: 'Default',
        }
        ET.SubElement(styles, STYLE, attrib=cell_style_attrib)

    for style_name, style_format in (('Date', date_format), ('DateTime', datetime_format)):
        define_date_style(style_name, style_format)
    return ET.tostring(document)
class Writer:
    '''Streaming writer producing an ODS archive containing one sheet.

    Parameters:
        target: passed as is to zipfile.ZipFile(target, 'w').
        date_format: strftime-like format used to build the display style
            of date cells.
        datetime_format: strftime-like format used to build the display
            style of datetime cells.

    Use open() to obtain a WriterHelper to write rows with.
    '''

    def __init__(self, target, date_format='%Y-%m-%d', datetime_format='%Y-%m-%dT%H:%M:%S'):
        self.target = target
        self.date_format = date_format
        self.datetime_format = datetime_format

    @contextlib.contextmanager
    def _zip(self):
        '''Create the archive, write the mimetype, manifest and styles.xml
        members, then yield the ZipFile so that content.xml can be added.'''
        with zipfile.ZipFile(self.target, 'w') as _zip:
            _zip.writestr('mimetype', 'application/vnd.oasis.opendocument.spreadsheet')
            _zip.writestr(
                'META-INF/manifest.xml',
                '''<?xml version="1.0" encoding="UTF-8"?>
<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0">
<manifest:file-entry manifest:full-path="/" manifest:media-type="application/vnd.oasis.opendocument.spreadsheet"/>
<manifest:file-entry manifest:full-path="styles.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="content.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="META-INF/manifest.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="mimetype" manifest:media-type="text/plain"/>
</manifest:manifest>''',
            )
            _zip.writestr(
                'styles.xml',
                styles_xml_content(date_format=self.date_format, datetime_format=self.datetime_format),
            )
            yield _zip

    @contextlib.contextmanager
    def _serialization_context(self):
        '''Yield an lxml incremental XML writer whose output becomes the
        content.xml member of the archive.'''
        with self._zip() as _zip:
            if sys.version_info >= (3, 6):
                with _zip.open('content.xml', mode='w') as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
            else:
                # we must use a temporary file before python 3.6
                with tempfile.NamedTemporaryFile() as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
                    # The XML writer is closed; make sure everything is on
                    # disk before copying the file into the archive.
                    fd.flush()
                    _zip.write(fd.name, 'content.xml')

    class WriterHelper:
        '''Row-oriented interface over the XML writer, in the style of
        csv.writer.'''

        def __init__(self, context):
            # lxml incremental XML writer positioned inside the table element
            self.context = context

        def writerow(self, row):
            '''Write one table row; `row` is an iterable of cell values.

            Booleans are written as boolean cells, other ints and floats as
            float cells, datetimes and dates as date cells with the
            'DateTime' or 'Date' style; anything else is converted with
            str() and written as a string cell. A LinkedValue is written
            like its value, with the text wrapped in a hyperlink.
            '''

            def writecell(raw_value):
                attrib = {}
                if isinstance(raw_value, LinkedValue):
                    value = raw_value.value
                else:
                    value = raw_value
                if isinstance(value, bool):
                    # bool is a subclass of int: handle it first, otherwise
                    # 'True'/'False' would be written as an invalid
                    # office:value of a float cell.
                    value_type = 'boolean'
                    text_value = str(value)
                    attrib[OFFICE_NS('boolean-value')] = 'true' if value else 'false'
                elif isinstance(value, (float, int)):
                    value_type = 'float'
                    text_value = str(value)
                    attrib[VALUE] = text_value
                elif isinstance(value, datetime.datetime):
                    # must be tested before datetime.date, its base class
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'DateTime'
                elif isinstance(value, datetime.date):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'Date'
                else:
                    value_type = 'string'
                    text_value = str(value)
                attrib[VALUE_TYPE] = value_type
                with self.context.element(CELL, attrib=attrib):
                    with self.context.element(P):
                        if isinstance(raw_value, LinkedValue):
                            with self.context.element(A, attrib={XLINK_HREF: raw_value.href}):
                                self.context.write(text_value)
                        else:
                            self.context.write(text_value)

            with self.context.element(ROW):
                for value in row:
                    writecell(value)

        def writerows(self, rows):
            '''Write every row of the iterable `rows`.'''
            for row in rows:
                self.writerow(row)

        @property
        @contextlib.contextmanager
        def cell_writer(self):
            '''Context manager yielding a write(value, href=None) function
            which accumulates the cells of one row; the row is written when
            the context exits normally.'''
            row = []

            def write(value, href=None):
                if href:
                    raw_value = LinkedValue(value, href)
                else:
                    raw_value = value
                row.append(raw_value)

            yield write
            self.writerow(row)

    @contextlib.contextmanager
    def open(self):
        '''Write the document skeleton down to the table element and yield
        a WriterHelper; the enclosing elements and the archive are closed
        when the context exits.'''
        with self._serialization_context() as context:
            with context.element(DOCUMENT_CONTENT):
                with context.element(BODY):
                    with context.element(SPREADSHEET):
                        with context.element(TABLE):
                            yield self.WriterHelper(context)
@contextlib.contextmanager
def writer(target, **kwargs):
    '''Context manager building a Writer for `target` (keyword arguments are
    forwarded to Writer) and yielding the helper returned by its open()
    method.'''
    ods_writer = Writer(target, **kwargs)
    with ods_writer.open() as helper:
        yield helper