commit 871d96975ef693ff3051142c5ffcb019e24e5a51 Author: Benjamin Dauvergne Date: Sun Jul 12 11:15:20 2020 +0200 first commit diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..682436b --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,49 @@ +@Library('eo-jenkins-lib@master') import eo.Utils + +pipeline { + agent any + options { disableConcurrentBuilds() } + stages { + stage('Unit Tests') { + steps { + sh """rm -rf .env +virtualenv .env +. .env/bin/activate +pip install tox +tox -rv""" + } + post { + always { + script { + utils = new Utils() + utils.publish_coverage('coverage.xml') + utils.publish_coverage_native('index.html') + } + mergeJunitResults() + } + } + } + stage('Packaging') { + steps { + script { + if (env.JOB_NAME == 'tabularfile' && env.GIT_BRANCH == 'origin/master') { + sh 'sudo -H -u eobuilder /usr/local/bin/eobuilder tabularfile' + } else if (env.GIT_BRANCH.startsWith('hotfix/')) { + sh "sudo -H -u eobuilder /usr/local/bin/eobuilder --branch ${env.GIT_BRANCH} --hotfix tabularfile" + } + } + } + } + } + post { + always { + script { + utils = new Utils() + utils.mail_notify(currentBuild, env, 'ci+jenkins-tabularfile@entrouvert.org') + } + } + cleanup { + cleanWs() + } + } +} diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..fb1b2ca --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include VERSION +include sample.py +include tests/*.ods diff --git a/README b/README new file mode 100644 index 0000000..410c214 --- /dev/null +++ b/README @@ -0,0 +1,97 @@ +tabularfile +============ + +Tabular files for humans or an opinionated approach to tabular files or +whatever. Parse and write ODS files, mimicking the `csv` module interface, +keeping memory usage as low as possible. + +Reading +------- + +The main API is `tabularfile.load(path_or_file, **kwargs)`; it is a context manager returning an +iterable. It accepts as first argument a bytes string, a path or an opened +file. Other arguments depend on the backend, ods or csv. 
+ +.. code:: pycon + + >>> from tabularfile import load + >>> with load('sheet.ods') as tabfile: + list(tabfile) + [ + ['date', 'count'], + ['01/12/2019', '123'], + ['01/01/2020', '156'], + ] + >>> with load('sheet.csv') as tabfile: + list(tabfile) + [ + ['date', 'count'], + ['01/12/2019', '123'], + ['01/01/2020', '156'], + ] + + +With `typed=True` you can ask the reader to cast cell contents based on the declared OpenDocument value type. + + +.. code:: pycon + + >>> with load('sheet.ods', typed=True) as tabfile: + list(tabfile) + [ + ['date', 'count'], + [datetime.date(2019, 12, 1), 123.0], + [datetime.date(2020, 1, 1), 156.0], + ] + + +With the `sheet` constructor argument you can load a sheet other than the +first one, only integer indexes are supported, it also supports sheet's name. +To get the list of sheets you can use the `tabfile.sheets` accessor. + + +.. code:: pycon + + >>> with load('sheet.ods', sheet=1) as tabfile: + ... + >>> with load('sheet.ods', sheet='Sheet1') as tabfile: + tabfile.sheets + ['Sheet1', 'Sheet2'] + +Writing +------- + +To write a sheet file, use the `tabularfile.write(path_or_file, format='ods', +**kwargs)` context manager. `format` can also be `csv` and in this case it +accepts other arguments like `encoding`, `dialect`, `delimiter` or `quotechar`. + +The ODS writer accepts the special value `tabularfile.ods.LinkedValue` if you need +to put an XLink on your data. The ODS and CSV writers accept date and datetime values +which will be formatted using the `date_format` and `datetime_format` +templates. + + +.. 
code:: pycon + + >>> from tabularfile import ods + >>> with ods.writer('sheet.ods') as writer: + writer.writerow(['date', 'count', 'link']) + writer.writerows([ + [datetime.date(2019, 12, 1), 123, ods.LinkedValue('Click me', href='https://example.com/')], + [datetime.date(2020, 1, 1), 156, ods.LinkedValue('Click me', href='https://example.com/')], + ]) + +Parsing ISO8601 dates +--------------------- + +Python before version 3.7 is not able to parse dates with a timezone, so we +try as much as possible to use another library to do it. `isodate` or +`python-dateutil` is used if present. + +Detecting CSV character encoding +-------------------------------- + +If the charamel_ package is installed, it is used for detecting the encoding of CSV files. + + +.. _charamel: https://pypi.org/project/charamel/ diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..d80d436 --- /dev/null +++ b/debian/changelog @@ -0,0 +1,5 @@ +python3-tabularfile (1) unstable; urgency=medium + + * First release. + + -- Benjamin Dauvergne Sun, 12 Jul 2020 22:49:00 +0200 diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..f599e28 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +10 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..44950ea --- /dev/null +++ b/debian/control @@ -0,0 +1,16 @@ +Source: python3-tabularfile +Maintainer: Benjamin Dauvergne +Section: python +Priority: optional +Build-Depends: debhelper (>= 9), + dh-python, + python3, + python3-setuptools + +Package: python3-tabularfile +Architecture: all +Depends: ${misc:Depends}, + ${python3:Depends} +Description: Parse and write tabular files of any size. + It contains: + * tools to parse/write ODS files. 
diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..c4e8b83 --- /dev/null +++ b/debian/copyright @@ -0,0 +1,10 @@ +Authors: Entr'ouvert + +Copyright © 2020, Entr'ouvert + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +The Software is provided “as is”, without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. In no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the software or the use or other dealings in the Software. +Except as contained in this notice, the name of the copyright holders shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the copyright holders. 
diff --git a/debian/install b/debian/install new file mode 100644 index 0000000..32c99c4 --- /dev/null +++ b/debian/install @@ -0,0 +1 @@ +README /usr/share/doc/python3-tabularfile diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..7888682 --- /dev/null +++ b/debian/rules @@ -0,0 +1,12 @@ +#!/usr/bin/make -f + +export PYBUILD_NAME := tabularfile +export PYBUILD_DISABLE_python3=test + +%: + dh $@ --with python3 --buildsystem=pybuild + +override_dh_auto_clean: + rm -rf $(PACKAGE_NAME).egg-info + rm -rf PKG-INFO + dh_auto_clean diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/sample.py b/sample.py new file mode 100644 index 0000000..bd27cb5 --- /dev/null +++ b/sample.py @@ -0,0 +1,14 @@ +import datetime + +from tabularfile import write + +with write('sheet.ods', date_format='%d/%m/%Y', datetime_format='%d/%m/%Y %H:%M:%S') as writer: + writer.writerow(['date', 'count']) + writer.writerows([ + [datetime.date(2019, 12, 1), 123], + [datetime.date(2020, 1, 1), 156], + [datetime.datetime(2020, 1, 1, 12, 0, 0), 156], + ]) + with writer.cell_writer as cell_writer: + cell_writer('coucou', href='https://example.com/') + cell_writer('coucou', href='https://example.com/a/') diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..3ce7708 --- /dev/null +++ b/setup.py @@ -0,0 +1,73 @@ +# tabularfile +# Copyright (C) 2020 Entr'ouvert + +import os +import subprocess + +from distutils.command.sdist import sdist +import setuptools + + +def get_version(): + '''Use the VERSION, if absent generates a version with git describe, if not + tag exists, take 0.0- and add the length of the commit log. 
+ ''' + if os.path.exists('VERSION'): + with open('VERSION', 'r') as v: + return v.read() + if os.path.exists('.git'): + p = subprocess.Popen( + ['git', 'describe', '--dirty=.dirty', '--match=v*'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + result = p.communicate()[0] + if p.returncode == 0: + result = result.decode('ascii').strip()[1:] # strip spaces/newlines and initial v + if '-' in result: # not a tagged version + real_number, commit_count, commit_hash = result.split('-', 2) + version = '%s.post%s+%s' % (real_number, commit_count, commit_hash) + else: + version = result + return version + else: + return '0.0.post%s' % len(subprocess.check_output(['git', 'rev-list', 'HEAD']).splitlines()) + return '0.0' + + +class eo_sdist(sdist): + def run(self): + if os.path.exists('VERSION'): + os.remove('VERSION') + version = get_version() + with open('VERSION', 'w') as version_file: + version_file.write(version) + sdist.run(self) + if os.path.exists('VERSION'): + os.remove('VERSION') + + +cmdclass = {'sdist': eo_sdist} + +with open('README', 'r') as fh: + long_description = fh.read() + +setuptools.setup( + name='tabularfile', + version=get_version(), + author='Benjamin Dauvergne', + author_email='bdauvergne@entrouvert.com', + description='A small example package', + long_description=long_description, + long_description_content_type='text/x-rst', + url='https://dev.entrouvert.org/projects/tabularfile/', + packages=setuptools.find_packages(exclude=['tests']), + classifiers=[ + 'Programming Language :: Python :: 3', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + ], + python_requires='>=3.5', + install_requires=['lxml'], + cmdclass=cmdclass, +) diff --git a/tabularfile/__init__.py b/tabularfile/__init__.py new file mode 100644 index 0000000..96d482e --- /dev/null +++ b/tabularfile/__init__.py @@ -0,0 +1,21 @@ +# tabularfile - tabular files for humans +# Copyright (C) 2020 Entr'ouvert +# +# This program is free software: you 
can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from .loader import load +from .writer import write +from .common import TabularFileError + +__all__ = ['load', 'write', 'TabularFileError'] diff --git a/tabularfile/common.py b/tabularfile/common.py new file mode 100644 index 0000000..af22664 --- /dev/null +++ b/tabularfile/common.py @@ -0,0 +1,59 @@ +# tabularfile.common- simple ods reader and writer +# Copyright (C) 2020 Entr'ouvert +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
import datetime

# Optional third-party ISO 8601 parsers; each name is None when the package
# is not installed, so a plain truthiness test selects the backend.
try:
    import isodate
except ImportError:
    isodate = None

try:
    import dateutil.parser as dateutil_parser
except ImportError:
    dateutil_parser = None


class TabularFileError(Exception):
    '''Base class of the errors raised by tabularfile.'''


def parse_datetime(value):
    '''Parse an ISO 8601 date-time string and return a datetime.datetime.

    The first available backend is used, in this order: isodate,
    dateutil (isoparse when it exists, else the generic parse),
    datetime.datetime.fromisoformat, and finally strptime with the
    '%Y-%m-%dT%H:%M:%S' format.

    Raises ValueError when value has no 'T' separator, which lets callers
    fall back to parse_date(); the selected backend may also raise when the
    string is malformed.
    '''
    if 'T' not in value:
        raise ValueError('not an ISO 8601 date-time (no "T" separator): %r' % (value,))
    if isodate:
        return isodate.parse_datetime(value)
    if dateutil_parser:
        if hasattr(dateutil_parser, 'isoparse'):
            return dateutil_parser.isoparse(value)
        return dateutil_parser.parse(value)
    if hasattr(datetime.datetime, 'fromisoformat'):
        return datetime.datetime.fromisoformat(value)
    return datetime.datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')


def parse_date(value):
    '''Parse an ISO 8601 date string and return a datetime.date.

    Backends are tried in the same order as in parse_datetime(); the
    dateutil results are reduced to their date part.
    '''
    if isodate:
        return isodate.parse_date(value)
    if dateutil_parser:
        if hasattr(dateutil_parser, 'isoparse'):
            return dateutil_parser.isoparse(value).date()
        return dateutil_parser.parse(value).date()
    if hasattr(datetime.date, 'fromisoformat'):
        return datetime.date.fromisoformat(value)
    return datetime.datetime.strptime(value, '%Y-%m-%d').date()


# ---- patch metadata and licence header of the next file of the commit ----
# diff --git a/tabularfile/csv.py b/tabularfile/csv.py
# new file mode 100644
# index 0000000..edf8d4d
# --- /dev/null
# +++ b/tabularfile/csv.py
# @@ -0,0 +1,144 @@
# tabularfile.csv - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
import csv
import contextlib
import datetime
import io

from .common import TabularFileError

# work around https://github.com/Ousret/charset_normalizer/issues/33
try:
    import charset_normalizer.normalizer
    import sys

    charset_normalizer.normalizer.python_version_tuple = lambda: tuple(sys.version_info)[:3]
except ImportError:
    pass


def _explicit_csv_kwargs(dialect, delimiter, quotechar):
    '''Return the csv formatting options that were actually given.

    Unset (falsy) options are left out: handing them to the csv module as
    an explicit None is not the same as omitting them and overrides the
    defaults of the dialect.
    '''
    options = (('dialect', dialect), ('delimiter', delimiter), ('quotechar', quotechar))
    return {name: value for name, value in options if value}


class Reader:
    '''Iterable over the rows of a CSV document read from a bytes stream.

    The encoding is taken from the `encoding` argument or detected with
    charamel; the dialect is taken from `dialect`/`delimiter`/`quotechar`
    or sniffed from the beginning of the stream.
    '''

    def __init__(self, fh, encoding=None, dialect=None, delimiter=None, quotechar=None, **kwargs):
        offset = fh.tell()
        header = fh.read(1024 * 64)

        self.encoding = encoding

        if not self.encoding or self.encoding == 'autodetect':
            self.encoding = None
            try:
                import charamel
            except ImportError:
                pass
            else:
                self.encoding = charamel.Detector().detect(header) or 'utf-8'

        if not self.encoding:
            raise TabularFileError('encoding cannot be autodetected, please install charamel or charset_normalizer')

        fh.seek(offset)
        text_fh = io.TextIOWrapper(fh, encoding=self.encoding)

        # remember where the data starts, __iter__ rewinds here after each pass
        offset = text_fh.tell()
        self._offset = offset

        if dialect or delimiter or quotechar:
            # only forward the options the caller set, like Writer does
            self.csv_kwargs = _explicit_csv_kwargs(dialect, delimiter, quotechar)
        else:
            sniffer = csv.Sniffer()
            header = ''
            # feed the sniffer a growing sample, 5 kB at a time
            for _ in range(10):
                header += text_fh.read(1024 * 5)
                try:
                    dialect = sniffer.sniff(header)
                    self.csv_kwargs = {'dialect': dialect}
                    break
                except csv.Error:
                    pass
            else:
                # dummy detection; an empty document has no first line
                lines = header.splitlines()
                line0 = lines[0] if lines else ''
                delimiter = ',' if line0.count(',') >= line0.count(';') else ';'
                self.csv_kwargs = {'delimiter': delimiter}

        text_fh.seek(offset)
        self.text_fh = text_fh

    def __iter__(self):
        '''Yield each row as a list of strings.

        Any error raised while parsing is re-raised as TabularFileError;
        the stream is rewound afterwards so the reader can be iterated again.
        '''
        try:
            for row in csv.reader(self.text_fh, **self.csv_kwargs):
                yield row
        except Exception as e:
            raise TabularFileError('parsing error') from e
        finally:
            self.text_fh.seek(self._offset)

    @property
    def sheets(self):
        '''A CSV document has a single sheet, identified by index 0.'''
        return [0]


class Writer:
    '''Write rows to a text stream, formatting dates with strftime templates.'''

    def __init__(self,
                 text_fh,
                 dialect=None,
                 delimiter=None,
                 quotechar=None,
                 date_format='%Y-%m-%d',
                 datetime_format='%Y-%m-%dT%H:%M:%S%z',
                 **kwargs):
        if not dialect and not delimiter and not quotechar:
            dialect = csv.excel

        csv_kwargs = _explicit_csv_kwargs(dialect, delimiter, quotechar)

        self.csv_writer = csv.writer(text_fh, **csv_kwargs)
        self.date_format = date_format
        self.datetime_format = datetime_format

    def writerow(self, row):
        '''Write one row; datetime and date cells are formatted, others go through str().'''
        def helper(row):
            for cell in row:
                # datetime must be tested first, it is a subclass of date
                if isinstance(cell, datetime.datetime):
                    yield cell.strftime(self.datetime_format)
                elif isinstance(cell, datetime.date):
                    yield cell.strftime(self.date_format)
                else:
                    yield str(cell)
        self.csv_writer.writerow(helper(row))

    def writerows(self, rows):
        '''Write every row of the iterable `rows`.'''
        for row in rows:
            self.writerow(row)


@contextlib.contextmanager
def writer(fh, encoding='utf-8', **kwargs):
    '''Context manager yielding a Writer encoding its output to the bytes stream fh.

    newline='' is used as recommended by the csv module so that the line
    terminator chosen by the dialect is written untranslated.
    '''
    text_fh = io.TextIOWrapper(fh, encoding=encoding, newline='')
    try:
        yield Writer(text_fh, **kwargs)
    finally:
        # detach() flushes pending text and releases fh, otherwise the
        # wrapper would close the caller's stream when garbage collected
        text_fh.detach()


# ---- patch metadata and licence header of the next file of the commit ----
# diff --git a/tabularfile/loader.py b/tabularfile/loader.py
# new file mode 100644
# index 0000000..f3fc17b
# --- /dev/null
# +++ b/tabularfile/loader.py
# @@ -0,0 +1,59 @@
# tabularfile.loader - tabular file for humans
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
import contextlib
import io

from .common import TabularFileError


@contextlib.contextmanager
def fh_from_path_or_file(path_or_file):
    '''Context manager yielding a readable bytes stream for the given source.

    path_or_file can be a bytes string (wrapped in a BytesIO), an opened
    binary file object (yielded as is, and left open on exit) or a path
    (opened in 'rb' mode and closed on exit).

    Raises TabularFileError for text streams and for file objects which are
    not both readable and seekable.
    '''
    if isinstance(path_or_file, bytes):
        yield io.BytesIO(path_or_file)
    else:
        if isinstance(path_or_file, io.TextIOBase):
            raise TabularFileError('file handle must be a bytes stream')
        if isinstance(path_or_file, io.IOBase):
            # readable/seekable are methods: they must be called, the bound
            # method objects themselves are always true
            if not path_or_file.readable() or not path_or_file.seekable():
                raise TabularFileError('file handle must be readable and seekable')
            yield path_or_file
        else:
            with open(path_or_file, 'rb') as fh:
                yield fh


@contextlib.contextmanager
def load(path_or_file, format=None, **kwargs):
    '''Context manager yielding a reader over the rows of a tabular file.

    When format is None it is guessed from the first bytes: a ZIP local
    file header signature means 'ods', anything else is read as 'csv'.
    The other keyword arguments are passed to the selected Reader.

    Raises TabularFileError when format is neither 'ods' nor 'csv'.
    '''
    from . import ods, csv

    with fh_from_path_or_file(path_or_file) as fh:
        # peek at the beginning of the stream without consuming it
        offset = fh.tell()
        header = fh.read(1024)
        fh.seek(offset)

        if format is None:
            if header[:4] == b'PK\x03\x04':
                format = 'ods'
            else:
                format = 'csv'

        if format == 'ods':
            yield ods.Reader(fh, **kwargs)
        elif format == 'csv':
            yield csv.Reader(fh, **kwargs)
        else:
            raise TabularFileError('unknown format %r' % format)


# ---- patch metadata and licence header of the next file of the commit ----
# diff --git a/tabularfile/ods.py b/tabularfile/ods.py
# new file mode 100644
# index 0000000..fe69455
# --- /dev/null
# +++ b/tabularfile/ods.py
# @@ -0,0 +1,454 @@
# tabularfile.ods - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
import contextlib
import datetime
import io
from lxml import etree as ET
import re
import sys
import tempfile
import zipfile

from .common import TabularFileError, parse_date, parse_datetime


class ODSUnsupportedCellException(TabularFileError):
    '''Raised by the reader on covered or spanned (merged) cells.'''


class Namespace:
    '''Builder of qualified XML names in Clark notation for one namespace.

    Calling the instance with a local name returns '{url}name', the form
    expected by the ElementTree API.
    '''

    def __init__(self, url):
        self.url = url

    def __call__(self, name):
        return '{' + '%s' % self.url + '}' + '%s' % name


# text: namespace
TEXT_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:text:1.0')
P = TEXT_NS('p')
A = TEXT_NS('a')

# table: namespace, elements
TABLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:table:1.0')
TABLE = TABLE_NS('table')
TABLE_NAME = TABLE_NS('name')
COLUMN = TABLE_NS('table-column')
ROW = TABLE_NS('table-row')
CELL = TABLE_NS('table-cell')
COVERED_CELL = TABLE_NS('covered-table-cell')

# table: namespace, attributes
NUMBER_COLUMNS_REPEATED = TABLE_NS('number-columns-repeated')
NUMBER_ROWS_REPEATED = TABLE_NS('number-rows-repeated')
NUMBER_COLUMNS_SPANNED = TABLE_NS('number-columns-spanned')
NUMBER_ROWS_SPANNED = TABLE_NS('number-rows-spanned')
TABLE_STYLE_NAME = TABLE_NS('style-name')

# office: namespace
OFFICE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:office:1.0')
DOCUMENT_CONTENT = OFFICE_NS('document-content')
BODY = OFFICE_NS('body')
SPREADSHEET = OFFICE_NS('spreadsheet')
VALUE_TYPE = OFFICE_NS('value-type')
VALUE = OFFICE_NS('value')
DATE_VALUE = OFFICE_NS('date-value')
DOCUMENT_STYLES = OFFICE_NS('document-styles')
FONT_FACE_DECLS = OFFICE_NS('font-face-decls')
STYLES = OFFICE_NS('styles')

# style: namespace
STYLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:style:1.0')
STYLE = STYLE_NS('style')
STYLE_NAME = STYLE_NS('name')
FAMILY = STYLE_NS('family')
DATA_STYLE_NAME = STYLE_NS('data-style-name')
PARENT_STYLE = STYLE_NS('parent-style')
TABLE_COLUMN_PROPERTIES = STYLE_NS('table-column-properties')
COLUMN_WIDTH = STYLE_NS('column-width')

# number: (datastyle) namespace, continued on the following lines of the file
NUMBER_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0')
DATE_STYLE = NUMBER_NS('date-style')
NUMBER_STYLE = NUMBER_NS('style')
YEAR = NUMBER_NS('year')
MONTH = NUMBER_NS('month')
DAY = NUMBER_NS('day')
HOURS = NUMBER_NS('hours')
MINUTES = NUMBER_NS('minutes')
SECONDS = NUMBER_NS('seconds')
TEXT = NUMBER_NS('text')

XLINK_NS = Namespace('http://www.w3.org/1999/xlink')
XLINK_HREF = XLINK_NS('href')

VALUE_WITH_HREF_CLASS_REGISTRY = {}


def value_with_href(value, href=None):
    '''Return value wrapped in a LinkedValue when href is set, else value itself.'''
    if not href:
        return value
    return LinkedValue(value, href)


def text_content(node):
    '''Extract text content from node and all its children. Equivalent to
       xmlNodeGetContent from libxml.'''
    return ''.join(node.itertext())


# office:value-type values converted to Python objects by cell_content(typed=True)
TYPED_VALUE_TYPES = ['float', 'date']


class LinkedValue(object):
    '''A cell value carrying an hyperlink (XLink href).'''

    def __init__(self, value, href):
        assert href, 'href is mandatory'

        self.value = value
        self.href = href

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.value == other.value and self.href == other.href

    def __repr__(self):
        return 'LinkedValue({self.value!r}, {self.href!r})'.format(self=self)


def cell_content(elem, typed=False):
    '''Return the content of a table-cell element.

    Without typed, or when the declared office:value-type is not one of
    TYPED_VALUE_TYPES, the result is the text of the cell paragraphs joined
    by newlines. With typed=True, 'float' cells are returned as float and
    'date' cells as datetime or date; the text is returned when the
    conversion fails.
    '''
    # one line of text per paragraph (text of the paragraph, not of the whole cell)
    text = '\n'.join(text_content(subelem) for subelem in elem.iter(tag=P))
    value_type = elem.attrib.get(VALUE_TYPE, 'string')

    if not typed or value_type not in TYPED_VALUE_TYPES:
        pass
    elif value_type == 'float':
        value = elem.attrib.get(VALUE, '')
        try:
            return float(value)
        except ValueError:
            pass
    elif value_type == 'date':
        value = elem.attrib.get(DATE_VALUE, '')
        # date-value holds either a date-time or a plain date
        try:
            return parse_datetime(value)
        except ValueError:
            pass
        try:
            return parse_date(value)
        except ValueError:
            pass
    return text


def cell_repeat(elem):
    '''Return the table:number-columns-repeated count of elem, 1 if absent or invalid.'''
    try:
        repeat = int(elem.attrib[NUMBER_COLUMNS_REPEATED])
        if repeat < 1:
            return 1
        return repeat
    except (KeyError, ValueError):
        return 1


def row_repeat(elem):
    '''Return the table:number-rows-repeated count of elem, 1 if absent or invalid.'''
    try:
        repeat = int(elem.attrib[NUMBER_ROWS_REPEATED])
        if repeat < 1:
            return 1
        return repeat
    except (KeyError, ValueError):
        return 1


class Reader:
    '''Iterable over the rows of one sheet of an ODS document.

    ods_content is a bytes string or anything accepted by zipfile.ZipFile;
    sheet selects the sheet by position or by name; typed asks for cells to
    be converted according to their declared value type; xlink asks for
    cells made of a single link to be returned as LinkedValue.
    '''

    def __init__(self, ods_content, sheet=0, typed=False, xlink=False, **kwargs):
        self.ods_content = ods_content
        self.sheet = sheet
        self.typed = typed
        self.xlink = xlink

    @contextlib.contextmanager
    def _zip(self):
        if isinstance(self.ods_content, bytes):
            fd = io.BytesIO(self.ods_content)
            with zipfile.ZipFile(fd) as _zip:
                yield _zip
        else:
            with zipfile.ZipFile(self.ods_content) as _zip:
                yield _zip

    @property
    def sheets(self):
        '''List of the sheets: the table:name of each table, or its index when unnamed.'''
        # a property always takes precedence over the instance __dict__, so
        # the result is cached under a distinct private key
        cached = self.__dict__.get('_sheets')
        if cached is not None:
            return cached
        sheets = []
        with self._parser_context(events=('start',), tag=TABLE) as context:
            idx = 0
            for event, elem in context:
                name = idx
                idx += 1
                if TABLE_NAME in elem.attrib:
                    name = elem.attrib[TABLE_NAME]
                sheets.append(name)
        self.__dict__['_sheets'] = sheets
        return sheets

    @contextlib.contextmanager
    def _parser_context(self,
                        events=('start', 'end'),
                        tag=(TABLE, ROW, CELL, COVERED_CELL),
                        **kwargs):
        '''Yield an incremental parser over content.xml limited to the given tags.'''
        with self._zip() as _zip:
            with _zip.open('content.xml', mode='r') as fd:
                yield ET.iterparse(fd, events=events, tag=tag, **kwargs)

    def __iter__(self):
        with self._parser_context() as context:
            yield from self._parse_target_sheet(context)

    def _parse_target_sheet(self, context):
        '''Skip tables until the selected one, then yield its rows.'''
        idx = 0
        for event, elem in context:
            if event == 'start' and elem.tag == TABLE:
                # self.sheet is a position or a sheet name
                if idx == self.sheet or elem.attrib.get(TABLE_NAME) == self.sheet:
                    yield from self._parse_sheet(context)
                    break
                idx += 1
            # free the memory used by the elements of the skipped sheets
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    def _parse_sheet(self, context):
        '''Yield the rows of the current table, repeated rows being expanded.'''
        for event, elem in context:
            if event == 'end' and elem.tag == TABLE:
                elem.clear()
                break
            elif event == 'start' and elem.tag == ROW:
                repeat = row_repeat(elem)
                row = self._parse_row(context)
                for _ in range(repeat):
                    # each repetition gets its own list
                    yield list(row)
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    def _parse_row(self, context):
        '''Consume the events of the current row and return its cells as a list.'''
        row = []
        idx = 0

        for event, elem in context:
            if event == 'end' and elem.tag == ROW:
                elem.clear()
                parent = elem.getparent()
                if parent is not None:
                    parent.remove(elem)
                return row
            elif event == 'start' and elem.tag == CELL:
                # ignore last repeated empty cell
                content, repeat = self._parse_cell(context)
                # only the empty string means "empty cell": a typed 0.0 is falsy
                # but is a real value
                if content != '':
                    if len(row) != idx:
                        # complete missing cells if previous cell was empty and repeated
                        row.extend([''] * (idx - len(row)))
                    row.extend([content] * repeat)
                idx += repeat
            else:
                elem.clear()

    def _parse_cell(self, context):
        '''Consume the events of the current cell and return (value, repeat count).

        Raises ODSUnsupportedCellException on covered or spanned cells.
        '''
        for event, elem in context:
            if event == 'end' and elem.tag == COVERED_CELL:
                raise ODSUnsupportedCellException('table:covered-table-cell is unsupported')
            if event == 'end' and elem.tag == CELL:
                if NUMBER_COLUMNS_SPANNED in elem.attrib or NUMBER_ROWS_SPANNED in elem.attrib:
                    raise ODSUnsupportedCellException(
                        'fusioned cells are unsupported '
                        '(table:number-rows-spanned and '
                        'table:number-columns-spanned attributes)'
                    )
                value = cell_content(elem, typed=self.typed)
                # text-cell:
                #   p:
                #     a:
                #       (text, href=...)
                if (self.xlink
                        and len(elem)
                        and elem[0].text is None
                        and len(elem[0]) == 1
                        and elem[0][0].tag == A):
                    value = value_with_href(value, href=elem[0][0].attrib.get(XLINK_HREF))
                return value, cell_repeat(elem)


def styles_xml_content(date_format, datetime_format):
    '''Return the serialized styles.xml declaring the 'Date' and 'DateTime' cell styles.

    The two strftime templates are translated to ODF date styles: %Y, %m,
    %d, %H, %M and %S become the matching number elements, any other
    character is kept as literal text.
    '''
    root = ET.Element(DOCUMENT_STYLES)
    styles = ET.SubElement(root, STYLES)

    # default style, inherited by other styles
    ET.SubElement(styles, STYLE, attrib={STYLE_NAME: 'Default'})

    directives = {
        '%Y': YEAR,
        '%m': MONTH,
        '%d': DAY,
        '%H': HOURS,
        '%M': MINUTES,
        '%S': SECONDS,
    }

    def define_date_style(name, strftime_string):
        date_style = ET.SubElement(styles, DATE_STYLE, attrib={STYLE_NAME: name + 'NumberFormat'})

        # split the template in strftime directives and single characters
        for part in re.findall(r'%?.', strftime_string):
            tag = directives.get(part)
            if tag is not None:
                ET.SubElement(date_style, tag, attrib={NUMBER_STYLE: 'long'})
            else:
                ET.SubElement(date_style, TEXT).text = part
        ET.SubElement(
            styles,
            STYLE,
            attrib={
                STYLE_NAME: name,
                FAMILY: 'table-cell',
                DATA_STYLE_NAME: name + 'NumberFormat',
                PARENT_STYLE: 'Default',
            },
        )

    define_date_style('Date', date_format)
    define_date_style('DateTime', datetime_format)
    return ET.tostring(root)


# NOTE(review): the original manifest literal was stripped from the patch text
# (its markup is missing); this is a reconstruction listing the package root,
# content.xml and styles.xml, to be checked against the upstream commit.
_MANIFEST_XML = '''<?xml version="1.0" encoding="UTF-8"?>
<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0" manifest:version="1.2">
 <manifest:file-entry manifest:full-path="/" manifest:media-type="application/vnd.oasis.opendocument.spreadsheet"/>
 <manifest:file-entry manifest:full-path="content.xml" manifest:media-type="text/xml"/>
 <manifest:file-entry manifest:full-path="styles.xml" manifest:media-type="text/xml"/>
</manifest:manifest>
'''


class Writer:
    '''Serialize rows as an ODS document written to target (a path or a file object).'''

    def __init__(self, target, date_format='%Y-%m-%d', datetime_format='%Y-%m-%dT%H:%M:%S'):
        self.target = target
        self.date_format = date_format
        self.datetime_format = datetime_format

    @contextlib.contextmanager
    def _zip(self):
        '''Yield the ZIP archive after writing its mimetype, manifest and styles.'''
        with zipfile.ZipFile(self.target, 'w') as _zip:
            _zip.writestr('mimetype', 'application/vnd.oasis.opendocument.spreadsheet')
            _zip.writestr('META-INF/manifest.xml', _MANIFEST_XML)
            _zip.writestr(
                'styles.xml',
                styles_xml_content(date_format=self.date_format, datetime_format=self.datetime_format),
            )
            yield _zip

    @contextlib.contextmanager
    def _serialization_context(self):
        '''Yield an incremental XML writer producing the content.xml member.'''
        with self._zip() as _zip:
            if sys.version_info >= (3, 6):
                with _zip.open('content.xml', mode='w') as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
            else:
                # we must use a temporary file before python 3.6
                with tempfile.NamedTemporaryFile() as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
                    fd.flush()
                    _zip.write(fd.name, 'content.xml')

    class WriterHelper:
        '''Row-oriented interface over the incremental XML writer.'''

        def __init__(self, context):
            self.context = context

        def writerow(self, row):
            '''Write one table row; each value becomes a typed table cell.'''
            def writecell(raw_value):

                attrib = {}

                if isinstance(raw_value, LinkedValue):
                    value = raw_value.value
                else:
                    value = raw_value

                if isinstance(value, (float, int)):
                    value_type = 'float'
                    text_value = str(value)
                    attrib[VALUE] = text_value
                # datetime must be tested first, it is a subclass of date
                elif isinstance(value, datetime.datetime):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'DateTime'
                elif isinstance(value, datetime.date):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'Date'
                else:
                    value_type = 'string'
                    text_value = str(value)

                attrib[VALUE_TYPE] = value_type

                with self.context.element(CELL, attrib=attrib):
                    with self.context.element(P):
                        if isinstance(raw_value, LinkedValue):
                            with self.context.element(A, attrib={XLINK_HREF: raw_value.href}):
                                self.context.write(text_value)
                        else:
                            self.context.write(text_value)

            with self.context.element(ROW):
                for value in row:
                    writecell(value)

        def writerows(self, rows):
            '''Write every row of the iterable `rows`.'''
            for row in rows:
                self.writerow(row)

        @property
        @contextlib.contextmanager
        def cell_writer(self):
            '''Context manager yielding write(value, href=None) to build a row cell by cell.

            The collected cells are written as one row when the block exits
            normally.
            '''
            row = []

            def write(value, href=None):
                if href:
                    raw_value = LinkedValue(value, href)
                else:
                    raw_value = value
                row.append(raw_value)

            yield write

            self.writerow(row)

    @contextlib.contextmanager
    def open(self):
        '''Open the document down to its single table and yield a WriterHelper.'''
        with self._serialization_context() as context:
            with context.element(DOCUMENT_CONTENT):
                with context.element(BODY):
                    with context.element(SPREADSHEET):
                        with context.element(TABLE):
                            yield self.WriterHelper(context)


@contextlib.contextmanager
def writer(target, **kwargs):
    '''Context manager yielding a row writer for a new ODS document at target.'''
    with Writer(target, **kwargs).open() as _writer:
        yield _writer


# ---- patch metadata and licence header of the next file of the commit ----
# diff --git a/tabularfile/writer.py b/tabularfile/writer.py
# new file mode 100644
# index 0000000..b4ee512
# --- /dev/null
# +++ b/tabularfile/writer.py
# @@ -0,0 +1,60 @@
# tabularfile.writer - tabular file for humans
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .

import contextlib
import io
import weakref

from .common import TabularFileError


@contextlib.contextmanager
def fh_from_path_or_file(path_or_file):
    '''Context manager yielding a writable bytes stream for the given target.

    An opened file object is yielded as is and left open on exit; anything
    else is used as a path, opened in 'wb' mode and closed on exit.

    Raises TabularFileError when a file object is not both writable and
    seekable.
    '''
    if isinstance(path_or_file, io.IOBase):
        # writable/seekable are methods: they must be called, the bound
        # method objects themselves are always true
        if not path_or_file.writable() or not path_or_file.seekable():
            raise TabularFileError('file handle must be writable and seekable')
        yield path_or_file
    else:
        with open(path_or_file, 'wb') as fh:
            yield fh


class WeakrefHolder:
    '''Context manager lending a weak proxy of a value.

    The only strong reference held here is dropped on exit, so the proxy
    handed out by __enter__ does not keep the value alive afterwards.
    '''

    def __init__(self, value):
        self.value = value

    def __enter__(self):
        return weakref.proxy(self.value)

    def __exit__(self, a, b, c):
        self.value = None


@contextlib.contextmanager
def write(path_or_file, format='ods', **kwargs):
    '''Context manager yielding a row writer for a new tabular file.

    format is 'ods' or 'csv'; the other keyword arguments are passed to the
    writer of the selected backend.

    Raises ValueError for any other format.
    '''
    from . import ods, csv

    # validate before opening: opening a path in 'wb' mode would create or
    # truncate the target even though nothing can be written to it
    if format not in ('ods', 'csv'):
        raise ValueError('invalid format %r' % format)

    with fh_from_path_or_file(path_or_file) as fh:
        if format == 'ods':
            context = ods.writer(fh, **kwargs)
        else:
            context = csv.writer(fh, **kwargs)

        with context as _writer:
            with WeakrefHolder(_writer) as proxy:
                yield proxy


# ---- patch metadata and licence header of the next files of the commit ----
# diff --git a/tests/__init__.py b/tests/__init__.py
# new file mode 100644
# index 0000000..e69de29
# diff --git a/tests/data/test1.ods b/tests/data/test1.ods
# new file mode 100644
# index 0000000..6290838
# Binary files /dev/null and b/tests/data/test1.ods differ
# diff --git a/tests/data/test2_with_span.ods b/tests/data/test2_with_span.ods
# new file mode 100644
# index 0000000..eac1306
# Binary files /dev/null and b/tests/data/test2_with_span.ods differ
# diff --git a/tests/test_csv.py b/tests/test_csv.py
# new file mode 100644
# index 0000000..1d16944
# --- /dev/null
# +++ b/tests/test_csv.py
# @@ -0,0 +1,109 @@
# tabularfile - simple ods tabfile and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
@pytest.mark.parametrize('encoding', ['utf_8', 'utf_8_sig', 'latin1', 'cp1252', 'iso-8859-15'])
def test_encoding(encoding, pass_encoding=True):
    """Load a CSV byte string encoded with ``encoding`` and check its first rows.

    When ``pass_encoding`` is false, ``encoding=None`` is passed to ``load``
    instead of the real encoding; this is how
    ``test_charamel_encoding_detection`` reuses this test.
    """
    with load('''\
username,email,password,prix
"John à Doé",john.doe@example.com,abcd1234,"10,24 €"
"J'ai du bon tabac dans ma tabatière",john.doe@example.com,abcd1234,"10,24 €"
"L'écran est tombé dans la forêt avec un œuf et un nœud à noël. John à Doé",john.doe@example.com,abcd1234,"10,24 €"
"John à Doé",john.doe@example.com,"10,24 €"'''.encode(encoding, 'replace'),
              encoding=encoding if pass_encoding else None) as tabfile:
        def reencode(rows):
            # Apply the same lossy round trip ('replace' error handler) to
            # the expected cells as was applied to the input bytes.
            for row in rows:
                yield [cell.encode(encoding, 'replace').decode(encoding) for cell in row]

        # Only the header and the first data row are compared.
        assert list(tabfile)[:2] == list(reencode([
            ['username', 'email', 'password', 'prix'],
            ['John à Doé', 'john.doe@example.com', 'abcd1234', '10,24 €'],
        ])), 'received encoding %s detected encoding %s' % (encoding, tabfile.encoding)

        assert tabfile.sheets == [0]


@pytest.mark.skipif(charamel is None, reason='charamel is missing')
@pytest.mark.parametrize('encoding', ['utf_8', 'utf_8_sig', 'latin1', 'cp1252', 'iso-8859-15'])
def test_charamel_encoding_detection(encoding):
    """Run ``test_encoding`` without passing the encoding to ``load``.

    Skipped when the optional ``charamel`` package cannot be imported.
    """
    test_encoding(encoding, pass_encoding=False)


@pytest.mark.parametrize('encoding', ['utf-8', 'utf-8-sig', 'latin1', 'cp1252', 'iso8859-15'])
@pytest.mark.parametrize('quotechar', ['"', '\'', '#'])
@pytest.mark.parametrize('delimiter', [',', ';', '\t'])
def test_dialect_detection(delimiter, quotechar, encoding):
    """Write rows with the stdlib ``csv`` module and read them back with ``load``.

    ``load`` is given the encoding but neither the delimiter nor the quote
    character used to produce the data.
    """
    import io
    import csv

    fh = io.StringIO()
    writer = csv.writer(fh, delimiter=delimiter, quotechar=quotechar)
    rows = [
        ['username', 'email', 'password'],
        ['John à Doé', 'john.doe@example.com', 'abcd1234'],
    ]
    writer.writerows(rows)
    serialization = fh.getvalue()

    with load(serialization.encode(encoding), encoding=encoding) as tabfile:
        assert list(tabfile) == rows


def test_massive(tmp_path):
    """Check that peak memory stays nearly flat while reading 300000 CSV lines."""
    with (tmp_path / 'massive.csv').open('w') as fh:
        for i in range(300000):
            fh.write('1,2,3,4\n')

    # Baseline taken after the file is written, before loading starts.
    memory = resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss
    with (tmp_path / 'massive.csv').open('rb') as fh:
        # charamel uses a lot of memory, so we force the encoding
        with load(fh, encoding='utf-8', delimiter=',') as tabfile:
            for i, row in enumerate(tabfile):
                # ru_maxrss must grow by less than 1000 units while parsing
                # the 300000 lines (the unit is kilobytes on Linux, i.e.
                # about 1 MB - not 1 Kb)
                assert resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss - memory < 1000, 'row %s' % i


def test_writer_typed(tmp_path):
    """Write ints, a date and datetimes as CSV and check the exact file content."""
    path = tmp_path / 'target.csv'
    # Before Python 3.6 the path object is converted to a string first.
    with write(path if sys.version_info >= (3, 6) else str(path), format='csv') as writer:
        writer.writerows([
            [1, 2, 3],
            [datetime.date(2020, 1, 1),
             datetime.datetime(2020, 1, 1, 10, 10, 10, tzinfo=pytz.utc),
             datetime.datetime(2020, 1, 1, 10, 10, 10)],
        ])

    with path.open() as fh:
        content = fh.read()

    assert content == '1,2,3\n2020-01-01,2020-01-01T10:10:10+0000,2020-01-01T10:10:10\n'
def test_test1_ods():
    """Check sheet names and untyped cell values of ``tests/data/test1.ods``."""
    with load('tests/data/test1.ods', sheet=0) as tabfile:
        assert list(tabfile) == [[]]
        assert tabfile.sheets == ['Feuille1', 'Feuille2']

    with load('tests/data/test1.ods', sheet=1) as tabfile:
        assert list(tabfile) == [
            ['123'],
            ['20/06/20', '', '', '', 'efef'],
            ['20/06/20', '', '', '', '123'],
            [
                'je suis content',
                'je suis content',
                'je suis content',
                'je suis content',
                'je suis content',
                'je suis content',
            ],
            [],
            ['', '', '', 'https://www.entrouvert.com/'],
            [],
            [],
            [],
            ['', '', '', '', '1312'],
        ]
        assert tabfile.sheets == ['Feuille1', 'Feuille2']


def test_test1_ods_typed():
    """Check the second sheet of ``tests/data/test1.ods`` loaded with ``typed=True``."""
    with load('tests/data/test1.ods', sheet=1, typed=True) as tabfile:
        assert list(tabfile) == [
            [123],
            [datetime.date(2020, 6, 20), '', '', '', 'efef'],
            [datetime.date(2020, 6, 20), '', '', '', 123],
            [
                'je suis content',
                'je suis content',
                'je suis content',
                'je suis content',
                'je suis content',
                'je suis content',
            ],
            [],
            ['', '', '', 'https://www.entrouvert.com/'],
            [],
            [],
            [],
            ['', '', '', '', 1312],
        ]


# Rows written and read back by test_writer.
ROWS = [
    [123],
    [datetime.date(2020, 6, 20), '', '', '', 'efef'],
    [datetime.date(2020, 6, 20), '', '', '', 123],
    [
        'je suis content',
        'je suis content',
        'je suis content',
        'je suis content',
        'je suis content',
        'je suis content',
    ],
    [],
    [],
    [],
    [],
    [],
    ['', '', '', '', 1312],
]


def test_test2_with_span():
    """Check that iterating a sheet with merged cells raises ``TabularFileError``."""
    with load('tests/data/test2_with_span.ods', sheet=1, typed=True) as tabfile:
        # Sheet names are available before any row is read.
        assert tabfile.sheets == ['Feuille1', 'Feuille2']
        with pytest.raises(TabularFileError, match='fusioned cells are unsupported'):
            list(tabfile)


def test_writer():
    """Write ``ROWS`` to an in-memory file and read the same rows back."""
    fd = io.BytesIO()
    with write(fd) as writer:
        writer.writerows(ROWS)
    with load(fd.getvalue(), typed=True) as tabfile:
        assert list(tabfile) == ROWS


def test_writer_cell_writer():
    """Write rows cell by cell, some with links, and read them back with ``xlink=True``."""
    fd = io.BytesIO()

    with write(fd) as writer:
        # Each cell_writer context produces one row.
        with writer.cell_writer as write_cell:
            write_cell('date')
            write_cell('count')
        with writer.cell_writer as write_cell:
            write_cell(datetime.date(2019, 12, 1), href='https://example.com/summary/2020/12/01/')
            write_cell(123, href='http://example.com')
        with writer.cell_writer as write_cell:
            write_cell(datetime.date(2020, 12, 1))
            write_cell(156)

    with load(fd.getvalue(), typed=True, xlink=True) as tabfile:
        rows = list(tabfile)
        # The integers written above are expected back as floats.
        assert rows == [
            ['date', 'count'],
            [
                LinkedValue(datetime.date(2019, 12, 1), 'https://example.com/summary/2020/12/01/'),
                LinkedValue(123.0, 'http://example.com')
            ],
            [datetime.date(2020, 12, 1), 156.0],
        ]


def test_massive_write(tmp_path):
    """Check that peak memory stays nearly flat while writing 100000 rows."""
    memory = resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss
    with (tmp_path / 'massive.ods').open('wb') as fh:
        with write(fh) as writer:
            # Rows come from a generator, so they are never all held in a list.
            writer.writerows([1, 2, 3, 4] for i in range(100000))
    # ru_maxrss must have grown by less than 1000 units (kilobytes on Linux).
    assert resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss - memory < 1000


def test_massive_read(tmp_path):
    """Check that peak memory stays nearly flat while reading 100000 ODS rows."""
    with (tmp_path / 'massive.ods').open('wb') as fh:
        with write(fh) as writer:
            writer.writerows([1, 2, 3, 4] for i in range(100000))

    with (tmp_path / 'massive.ods').open('rb') as fh:
        with load(fh) as tabfile:
            # Baseline taken after load() returned, before the first row is read.
            memory = resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss
            for i, row in enumerate(tabfile):
                # ru_maxrss must grow by less than 4000 units while parsing
                # the 100000 lines (the unit is kilobytes on Linux, i.e.
                # about 4 MB - not 4 Kb)
                assert resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss - memory < 4000, 'row %s' % i
+setenv = + JUNIT={tty::-o junit_suite_name={envname} --junit-xml=junit-{envname}.xml} + COVERAGE={tty::--cov=tabularfile --cov-branch --cov-append --cov-report xml --cov-report html} + TESTS=tests + charamel: TESTS=tests/test_csv.py -k charamel + isodate: TESTS=tests/test_ods.py -k ods_typed + dateutil: TESTS=tests/test_ods.py -k ods_typed +commands = + pytest {tty:--sw --durations=30:} {env:JUNIT:} {env:COVERAGE:} {posargs:{env:TESTS:tests}} + +[pytest] +junit_family=xunit2