455 lines
15 KiB
Python
455 lines
15 KiB
Python
# tabularfile.ods - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
import contextlib
|
|
import datetime
|
|
import io
|
|
from lxml import etree as ET
|
|
import re
|
|
import sys
|
|
import tempfile
|
|
import zipfile
|
|
|
|
from .common import TabularFileError, parse_date, parse_datetime
|
|
|
|
|
|
class ODSUnsupportedCellException(TabularFileError):
    '''Raised when a sheet contains a cell construct the reader cannot handle.'''
|
|
|
|
|
|
class Namespace:
    '''Build qualified XML names, in "{url}name" notation, for one namespace.'''

    def __init__(self, url):
        # namespace URI placed between the braces of every produced name
        self.url = url

    def __call__(self, name):
        '''Return name qualified with this namespace's URL.'''
        return '{{{0}}}{1}'.format(self.url, name)
|
|
|
|
|
|
# Qualified names ("{namespace}local-name") of the OpenDocument elements and
# attributes used by the reader and the writer below.

# text namespace: paragraphs and hyperlinks inside a cell
TEXT_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:text:1.0')
P = TEXT_NS('p')
A = TEXT_NS('a')

# table namespace: sheets, rows and cells
TABLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:table:1.0')
TABLE = TABLE_NS('table')
TABLE_NAME = TABLE_NS('name')
COLUMN = TABLE_NS('table-column')
ROW = TABLE_NS('table-row')
CELL = TABLE_NS('table-cell')
COVERED_CELL = TABLE_NS('covered-table-cell')

# table attributes: repetition and spanning counters, cell style reference
NUMBER_COLUMNS_REPEATED = TABLE_NS('number-columns-repeated')
NUMBER_ROWS_REPEATED = TABLE_NS('number-rows-repeated')
NUMBER_COLUMNS_SPANNED = TABLE_NS('number-columns-spanned')
NUMBER_ROWS_SPANNED = TABLE_NS('number-rows-spanned')
TABLE_STYLE_NAME = TABLE_NS('style-name')

# office namespace: document structure and typed cell values
OFFICE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:office:1.0')
DOCUMENT_CONTENT = OFFICE_NS('document-content')
BODY = OFFICE_NS('body')
SPREADSHEET = OFFICE_NS('spreadsheet')
VALUE_TYPE = OFFICE_NS('value-type')
VALUE = OFFICE_NS('value')
DATE_VALUE = OFFICE_NS('date-value')
DOCUMENT_STYLES = OFFICE_NS('document-styles')
FONT_FACE_DECLS = OFFICE_NS('font-face-decls')
STYLES = OFFICE_NS('styles')

# style namespace: style definitions written to styles.xml
STYLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:style:1.0')
STYLE = STYLE_NS('style')
STYLE_NAME = STYLE_NS('name')
FAMILY = STYLE_NS('family')
DATA_STYLE_NAME = STYLE_NS('data-style-name')
# NOTE(review): the ODF attribute is normally spelled
# style:parent-style-name - confirm that 'parent-style' is intended
PARENT_STYLE = STYLE_NS('parent-style')
TABLE_COLUMN_PROPERTIES = STYLE_NS('table-column-properties')
COLUMN_WIDTH = STYLE_NS('column-width')

# datastyle ("number") namespace: components of a date format
NUMBER_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0')
DATE_STYLE = NUMBER_NS('date-style')
NUMBER_STYLE = NUMBER_NS('style')
YEAR = NUMBER_NS('year')
MONTH = NUMBER_NS('month')
DAY = NUMBER_NS('day')
HOURS = NUMBER_NS('hours')
MINUTES = NUMBER_NS('minutes')
SECONDS = NUMBER_NS('seconds')
TEXT = NUMBER_NS('text')

# xlink namespace: target of a hyperlink
XLINK_NS = Namespace('http://www.w3.org/1999/xlink')
XLINK_HREF = XLINK_NS('href')

# NOTE(review): not referenced anywhere in the visible part of this module
VALUE_WITH_HREF_CLASS_REGISTRY = {}
|
|
|
|
|
|
def value_with_href(value, href=None):
    '''Return value unchanged, or wrapped in a LinkedValue when href is set.'''
    return LinkedValue(value, href) if href else value
|
|
|
|
|
|
def text_content(node):
    '''Return the concatenated text of node and of all its descendants.

    Equivalent to xmlNodeGetContent from libxml.'''
    fragments = list(node.itertext())
    return ''.join(fragments)
|
|
|
|
|
|
# office:value-type values that cell_content() converts to a Python value
# when called with typed=True; any other type is returned as text
TYPED_VALUE_TYPES = ['float', 'date']
|
|
|
|
|
|
class LinkedValue(object):
    '''A cell value together with the hyperlink target attached to it.

    Two instances are equal when both their value and their href are
    equal; instances hash accordingly, so they can be stored in sets or
    used as dictionary keys as long as the wrapped value is hashable.
    '''

    def __init__(self, value, href):
        assert href, 'href is mandatory'

        self.value = value
        self.href = href

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            # let Python try the reflected comparison; it ends up False
            return NotImplemented
        return self.value == other.value and self.href == other.href

    def __hash__(self):
        # defining __eq__ alone would make instances unhashable; keep the
        # hash consistent with the fields compared by __eq__
        return hash((self.value, self.href))

    def __repr__(self):
        return 'LinkedValue({self.value!r}, {self.href!r})'.format(self=self)
|
|
|
|
|
|
def cell_content(elem, typed=False):
    '''Return the content of a table cell element.

    The textual content is the text of every text:p descendant of the
    cell, one line per paragraph. When typed is true and the cell's
    office:value-type is listed in TYPED_VALUE_TYPES, the converted value
    is returned instead: a float built from office:value, or the result
    of parse_datetime (then parse_date) applied to office:date-value.
    If the conversion raises ValueError the text is returned.
    '''
    # each paragraph contributes its own text, not the text of the whole
    # cell (which would duplicate the content of multi-paragraph cells)
    text = '\n'.join(text_content(subelem) for subelem in elem.iter(tag=P))
    if not typed:
        return text

    value_type = elem.attrib.get(VALUE_TYPE, 'string')
    if value_type not in TYPED_VALUE_TYPES:
        return text

    if value_type == 'float':
        try:
            return float(elem.attrib.get(VALUE, ''))
        except ValueError:
            return text

    if value_type == 'date':
        value = elem.attrib.get(DATE_VALUE, '')
        # try the most precise representation first
        for parse in (parse_datetime, parse_date):
            try:
                return parse(value)
            except ValueError:
                continue
    return text
|
|
|
|
|
|
def cell_repeat(elem):
    '''Return how many times a cell is repeated, never less than 1.

    The count comes from the table:number-columns-repeated attribute; a
    missing attribute or one that is not an integer counts as 1.
    '''
    attributes = elem.attrib
    if NUMBER_COLUMNS_REPEATED not in attributes:
        return 1
    try:
        count = int(attributes[NUMBER_COLUMNS_REPEATED])
    except ValueError:
        return 1
    return count if count >= 1 else 1
|
|
|
|
|
|
def row_repeat(elem):
    '''Return how many times a row is repeated, never less than 1.

    The count comes from the table:number-rows-repeated attribute; a
    missing attribute or one that is not an integer counts as 1.
    '''
    attributes = elem.attrib
    if NUMBER_ROWS_REPEATED not in attributes:
        return 1
    try:
        count = int(attributes[NUMBER_ROWS_REPEATED])
    except ValueError:
        return 1
    return count if count >= 1 else 1
|
|
|
|
|
|
class Reader:
    '''Iterate over the rows of one sheet of an ODS document.

    ods_content is either the bytes of the archive or anything accepted by
    zipfile.ZipFile; sheet is the 0-based index of the table to read.
    With typed=True, float and date cells are converted by cell_content();
    with xlink=True, a cell made of a single hyperlink is returned as a
    LinkedValue. Extra keyword arguments are accepted and ignored.

    Iterating yields one list per row; trailing empty cells are dropped.
    Merged cells raise ODSUnsupportedCellException.
    '''

    def __init__(self, ods_content, sheet=0, typed=False, xlink=False, **kwargs):
        self.ods_content = ods_content
        self.sheet = sheet
        self.typed = typed
        self.xlink = xlink

    @contextlib.contextmanager
    def _zip(self):
        '''Open the document as a zip archive for the duration of the context.'''
        if isinstance(self.ods_content, bytes):
            fd = io.BytesIO(self.ods_content)
            with zipfile.ZipFile(fd) as _zip:
                yield _zip
        else:
            with zipfile.ZipFile(self.ods_content) as _zip:
                yield _zip

    @property
    def sheets(self):
        '''Sheets of the document: each table's table:name, or its index if unnamed.'''
        # a property is a data descriptor, the value stored in the instance
        # __dict__ is never picked up implicitly: read the cache explicitly
        cached = self.__dict__.get('sheets')
        if cached is not None:
            return list(cached)

        sheets = []
        with self._parser_context(events=('start',), tag=TABLE) as context:
            for idx, (event, elem) in enumerate(context):
                sheets.append(elem.attrib.get(TABLE_NAME, idx))
        self.__dict__['sheets'] = sheets
        # hand out a copy so callers cannot alter the cached list
        return list(sheets)

    @contextlib.contextmanager
    def _parser_context(self,
                        events=('start', 'end'),
                        tag=(TABLE, ROW, CELL, COVERED_CELL),
                        **kwargs):
        '''Yield an incremental parser over content.xml limited to events/tag.'''
        with self._zip() as _zip:
            with _zip.open('content.xml', mode='r') as fd:
                yield ET.iterparse(fd, events=events, tag=tag, **kwargs)

    def __iter__(self):
        with self._parser_context() as context:
            yield from self._parse_target_sheet(context)

    @staticmethod
    def _release(elem):
        '''Free a processed element: empty it and detach it from its parent.'''
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            parent.remove(elem)

    def _parse_target_sheet(self, context):
        '''Skip tables until the one at index self.sheet, then yield its rows.'''
        idx = 0
        for event, elem in context:
            if event == 'start' and elem.tag == TABLE:
                if idx == self.sheet:
                    yield from self._parse_sheet(context)
                    break
                idx += 1
            # content of the other sheets is of no use, keep memory low
            self._release(elem)

    def _parse_sheet(self, context):
        '''Yield the rows of the current table, expanding repeated rows.'''
        for event, elem in context:
            if event == 'end' and elem.tag == TABLE:
                elem.clear()
                break
            elif event == 'start' and elem.tag == ROW:
                repeat = row_repeat(elem)
                row = self._parse_row(context)
                for _ in range(repeat):
                    # each repetition gets its own list
                    yield list(row)
            self._release(elem)

    def _parse_row(self, context):
        '''Consume events up to the end of the current row and return its cells.'''
        row = []
        # number of columns seen so far, empty cells included
        idx = 0

        for event, elem in context:
            if event == 'end' and elem.tag == ROW:
                self._release(elem)
                return row
            elif event == 'start' and elem.tag == CELL:
                content, repeat = self._parse_cell(context)
                # only an empty text marks an empty cell: with typed=True a
                # cell holding 0.0 is falsy but must be kept
                if content != '':
                    if len(row) != idx:
                        # complete missing cells if previous cell was empty and repeated
                        row.extend([''] * (idx - len(row)))
                    row.extend([content] * repeat)
                # empty cells are only counted, so that trailing (possibly
                # repeated) empty cells are ignored
                idx += repeat
            else:
                elem.clear()
        # event stream ended before the row was closed
        return row

    def _parse_cell(self, context):
        '''Consume events up to the end of the current cell.

        Return a (value, repeat) pair; raise ODSUnsupportedCellException
        for covered or spanning cells.
        '''
        for event, elem in context:
            if event == 'end' and elem.tag == COVERED_CELL:
                raise ODSUnsupportedCellException('table:covered-table-cell is unsupported')
            if event == 'end' and elem.tag == CELL:
                if NUMBER_COLUMNS_SPANNED in elem.attrib or NUMBER_ROWS_SPANNED in elem.attrib:
                    raise ODSUnsupportedCellException(
                        'fusioned cells are unsupported '
                        '(table:number-rows-spanned and '
                        'table:number-columns-spanned attributes)'
                    )
                value = cell_content(elem, typed=self.typed)
                # text-cell:
                #   p:
                #     a:
                #       (text, href=...)
                if (self.xlink
                        and len(elem)
                        and elem[0].text is None
                        and len(elem[0]) == 1
                        and elem[0][0].tag == A):
                    value = value_with_href(value, href=elem[0][0].attrib.get(XLINK_HREF))
                return value, cell_repeat(elem)
|
|
|
|
|
|
def styles_xml_content(date_format, datetime_format):
    '''Return the serialized styles.xml document of a written spreadsheet.

    It declares a 'Default' style plus the 'Date' and 'DateTime' cell
    styles, whose display formats are derived from the strftime-like
    strings date_format and datetime_format. The directives %Y, %m, %d,
    %H, %M and %S become the matching date-style components, %% becomes a
    literal percent sign and anything else is copied as literal text.
    '''
    root = ET.Element(DOCUMENT_STYLES)
    styles = ET.SubElement(root, STYLES)

    # default style, inherited by other styles
    ET.SubElement(styles, STYLE, attrib={STYLE_NAME: 'Default'})

    # strftime directives having an equivalent date-style component
    components = {
        '%Y': YEAR,
        '%m': MONTH,
        '%d': DAY,
        '%H': HOURS,
        '%M': MINUTES,
        '%S': SECONDS,
    }

    def define_date_style(name, strftime_string):
        date_style = ET.SubElement(styles, DATE_STYLE, attrib={STYLE_NAME: name + 'NumberFormat'})

        # split the format in '%x' directives and single literal characters
        for part in re.findall(r'%?.', strftime_string):
            component = components.get(part)
            if component is not None:
                ET.SubElement(date_style, component, attrib={NUMBER_STYLE: 'long'})
            elif part == '%%':
                # escaped percent sign, as in strftime
                ET.SubElement(date_style, TEXT).text = '%'
            else:
                ET.SubElement(date_style, TEXT).text = part

        # cell style referencing the date format defined above
        ET.SubElement(
            styles,
            STYLE,
            attrib={
                STYLE_NAME: name,
                FAMILY: 'table-cell',
                DATA_STYLE_NAME: name + 'NumberFormat',
                PARENT_STYLE: 'Default',
            },
        )

    define_date_style('Date', date_format)
    define_date_style('DateTime', datetime_format)
    return ET.tostring(root)
|
|
|
|
|
|
class Writer:
    '''Write a single-sheet ODS document to target.

    target is handed to zipfile.ZipFile opened in 'w' mode; date_format
    and datetime_format are strftime-like strings used to build the
    display styles of date and datetime cells. Rows are written through
    the WriterHelper yielded by open().
    '''

    def __init__(self, target, date_format='%Y-%m-%d', datetime_format='%Y-%m-%dT%H:%M:%S'):
        self.target = target
        self.date_format = date_format
        self.datetime_format = datetime_format

    @contextlib.contextmanager
    def _zip(self):
        '''Create the archive, write its fixed members, then yield it.'''
        with zipfile.ZipFile(self.target, 'w') as _zip:
            _zip.writestr('mimetype', 'application/vnd.oasis.opendocument.spreadsheet')
            _zip.writestr(
                'META-INF/manifest.xml',
                '''<?xml version="1.0" encoding="UTF-8"?>
<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0">
<manifest:file-entry manifest:full-path="/" manifest:media-type="application/vnd.oasis.opendocument.spreadsheet"/>
<manifest:file-entry manifest:full-path="styles.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="content.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="META-INF/manifest.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="mimetype" manifest:media-type="text/plain"/>
</manifest:manifest>''',
            )
            _zip.writestr(
                'styles.xml',
                styles_xml_content(date_format=self.date_format, datetime_format=self.datetime_format),
            )
            yield _zip

    @contextlib.contextmanager
    def _serialization_context(self):
        '''Yield an incremental XML writer whose output becomes content.xml.'''
        with self._zip() as _zip:
            if sys.version_info >= (3, 6):
                with _zip.open('content.xml', mode='w') as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
            else:
                # we must use a temporary file before python 3.6
                with tempfile.NamedTemporaryFile() as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
                    fd.flush()
                    _zip.write(fd.name, 'content.xml')

    class WriterHelper:
        '''Row-oriented interface over the XML writer of an open document.'''

        def __init__(self, context):
            self.context = context

        def writerow(self, row):
            '''Write one table row holding one cell per value of row.

            Numbers are written as float cells, datetime and date objects
            as date cells using the 'DateTime' and 'Date' styles, anything
            else as the string given by str(). A LinkedValue is written
            like its value, wrapped in a hyperlink to its href.
            '''

            def writecell(raw_value):
                attrib = {}

                if isinstance(raw_value, LinkedValue):
                    value = raw_value.value
                else:
                    value = raw_value

                # bool is a subclass of int but str(True) is not a valid
                # office:value, booleans are written as plain strings
                if isinstance(value, (float, int)) and not isinstance(value, bool):
                    value_type = 'float'
                    text_value = str(value)
                    attrib[VALUE] = text_value
                # datetime must be tested first, it is a subclass of date
                elif isinstance(value, datetime.datetime):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'DateTime'
                elif isinstance(value, datetime.date):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'Date'
                else:
                    value_type = 'string'
                    text_value = str(value)

                attrib[VALUE_TYPE] = value_type

                with self.context.element(CELL, attrib=attrib):
                    with self.context.element(P):
                        if isinstance(raw_value, LinkedValue):
                            with self.context.element(A, attrib={XLINK_HREF: raw_value.href}):
                                self.context.write(text_value)
                        else:
                            self.context.write(text_value)

            with self.context.element(ROW):
                for value in row:
                    writecell(value)

        def writerows(self, rows):
            '''Write every row of rows, in order.'''
            for row in rows:
                self.writerow(row)

        @property
        @contextlib.contextmanager
        def cell_writer(self):
            '''Context manager to build a row cell by cell.

            It yields a write(value, href=None) callable; the collected
            values are written as one row when the with block ends
            without raising.
            '''
            row = []

            def write(value, href=None):
                if href:
                    raw_value = LinkedValue(value, href)
                else:
                    raw_value = value
                row.append(raw_value)

            yield write

            self.writerow(row)

    @contextlib.contextmanager
    def open(self):
        '''Open the document down to its table element and yield a WriterHelper.'''
        with self._serialization_context() as context:
            with context.element(DOCUMENT_CONTENT):
                with context.element(BODY):
                    with context.element(SPREADSHEET):
                        with context.element(TABLE):
                            yield self.WriterHelper(context)
|
|
|
|
|
|
@contextlib.contextmanager
def writer(target, **kwargs):
    '''Open an ODS document on target and yield the helper used to write rows.

    Keyword arguments are passed to the Writer constructor.
    '''
    document = Writer(target, **kwargs)
    with document.open() as helper:
        yield helper
|