first commit

This commit is contained in:
Benjamin Dauvergne 2020-07-12 11:15:20 +02:00
commit 871d96975e
24 changed files with 1371 additions and 0 deletions

49
Jenkinsfile vendored Normal file
View File

@ -0,0 +1,49 @@
// Load the shared eo-jenkins-lib library and import its Utils helper class.
@Library('eo-jenkins-lib@master') import eo.Utils

pipeline {
    agent any
    // Never run two builds of this job at the same time.
    options { disableConcurrentBuilds() }
    stages {
        stage('Unit Tests') {
            steps {
                // Recreate a clean virtualenv, install tox in it and run tox.
                sh """rm -rf .env
virtualenv .env
. .env/bin/activate
pip install tox
tox -rv"""
            }
            post {
                // Runs whatever the outcome of the steps above.
                always {
                    script {
                        utils = new Utils()
                        utils.publish_coverage('coverage.xml')
                        utils.publish_coverage_native('index.html')
                    }
                    mergeJunitResults()
                }
            }
        }
        stage('Packaging') {
            steps {
                script {
                    // Regular build: only for the job named 'tabularfile' on origin/master.
                    if (env.JOB_NAME == 'tabularfile' && env.GIT_BRANCH == 'origin/master') {
                        sh 'sudo -H -u eobuilder /usr/local/bin/eobuilder tabularfile'
                    // Hotfix build: any branch whose name starts with 'hotfix/'.
                    // NOTE(review): the master test above expects an 'origin/' prefix while
                    // this one does not; confirm which form GIT_BRANCH actually takes.
                    } else if (env.GIT_BRANCH.startsWith('hotfix/')) {
                        sh "sudo -H -u eobuilder /usr/local/bin/eobuilder --branch ${env.GIT_BRANCH} --hotfix tabularfile"
                    }
                }
            }
        }
    }
    post {
        always {
            script {
                // Send the build notification mail, whatever the result.
                utils = new Utils()
                utils.mail_notify(currentBuild, env, 'ci+jenkins-tabularfile@entrouvert.org')
            }
        }
        cleanup {
            // Wipe the workspace once every other post condition has run.
            cleanWs()
        }
    }
}

3
MANIFEST.in Normal file
View File

@ -0,0 +1,3 @@
include VERSION
include sample.py
include tests/*.ods

97
README Normal file
View File

@ -0,0 +1,97 @@
tabularfile
============
Tabular files for humans or an opinionated approach to tabular files or
whatever. Parse and write ODS files, mimicking the `csv` module interface,
keeping memory usage as low as possible.
Reading
-------
The main API is `tabularfile.load(path_or_file, **kwargs)`; it is a context manager returning an
iterable. It accepts as first argument a bytes string, a path or an opened
file. Other arguments depend on the backend, ods or csv.
.. code:: pycon
>>> from tabularfile import load
>>> with load('sheet.ods') as tabfile:
list(tabfile)
[
['date', 'count'],
['01/12/2019', '123'],
['01/01/2020', '156'],
]
>>> with load('sheet.csv') as tabfile:
list(tabfile)
[
['date', 'count'],
['01/12/2019', '123'],
['01/01/2020', '156'],
]
With `typed=True` you can ask the reader to cast cells content based on the declared OpenDocument value type.
.. code:: pycon
>>> with load('sheet.ods', typed=True) as tabfile:
list(tabfile)
[
['date', 'count'],
[datetime.date(2019, 12, 1), 123.0],
[datetime.date(2020, 1, 1), 156.0],
]
With the `sheet` constructor attribute you can load another sheet than the
first one, only integer indexes are supported, it also supports sheet's name.
To get the list of sheets you can use the `tabfile.sheets` accessor.
.. code:: pycon
>>> with load('sheet.ods', sheet=1) as tabfile:
...
>>> with load('sheet.ods', sheet='Sheet1') as tabfile:
tabfile.sheets
['Sheet1', 'Sheet2']
Writing
-------
To write a sheet file, use the `tabularfile.write(path_or_file, format='ods',
**kwargs)` context manager. `format` can also be `csv` and in this case it
accepts other arguments like `encoding`, `dialect`, `delimiter` or `quotechar`.
The ODS writer accepts the special value `tabularfile.ods.LinkedValue` if you need
to put an XLink on your data. The ODS and CSV writers accept date and datetime values,
which will be formatted using the `date_format` and `datetime_format`
templates.
.. code:: pycon
>>> from tabularfile import ods
>>> with ods.writer('sheet.ods') as writer:
writer.writerow(['date', 'count', 'link'])
writer.writerows([
[datetime.date(2019, 12, 1), 123, ods.LinkedValue('Click me', href='https://example.com/')],
[datetime.date(2020, 1, 1), 156, ods.LinkedValue('Click me', href='https://example.com/')],
])
Parsing ISO8601 dates
---------------------
Base Python before version 3.7 is not able to parse dates with a timezone, so we
try as much as possible to use another library to do it. `isodate` or
`python-dateutil` are used if present.
Detecting CSV character encoding
--------------------------------
If the charamel_ package is installed, it is used for detecting the encoding of CSV files.
.. _charamel: https://pypi.org/project/charamel/

5
debian/changelog vendored Normal file
View File

@ -0,0 +1,5 @@
python3-tabularfile (1) unstable; urgency=medium
* First release.
-- Benjamin Dauvergne <bdauvergne@entrouvert.com> Sun, 12 Jul 2020 22:49:00 +0200

1
debian/compat vendored Normal file
View File

@ -0,0 +1 @@
10

16
debian/control vendored Normal file
View File

@ -0,0 +1,16 @@
Source: python3-tabularfile
Maintainer: Benjamin Dauvergne <bdauvergne@entrouvert.com>
Section: python
Priority: optional
Build-Depends: debhelper (>= 9),
dh-python,
python3,
python3-setuptools
Package: python3-tabularfile
Architecture: all
Depends: ${misc:Depends},
${python3:Depends}
Description: Parse and write tabular files of any size.
It contains:
* tools to parse/write ODS files.

10
debian/copyright vendored Normal file
View File

@ -0,0 +1,10 @@
Authors: Entr'ouvert
Copyright © 2020, Entr'ouvert
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
The Software is provided “as is”, without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. In no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the software or the use or other dealings in the Software.
Except as contained in this notice, the name of the copyright holders shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the copyright holders.

1
debian/install vendored Normal file
View File

@ -0,0 +1 @@
README /usr/share/doc/python3-tabularfile

12
debian/rules vendored Executable file
View File

@ -0,0 +1,12 @@
#!/usr/bin/make -f
# Debian build rules: every target is delegated to debhelper using the
# pybuild build system.

export PYBUILD_NAME := tabularfile
# Disable the "test" step for the python3 interpreter during the package build.
export PYBUILD_DISABLE_python3=test

%:
	dh $@ --with python3 --buildsystem=pybuild

# PACKAGE_NAME is not defined anywhere in this file, so the previous
# "$(PACKAGE_NAME).egg-info" expanded to ".egg-info" and the real egg-info
# directory was never removed; PYBUILD_NAME holds the package name.
override_dh_auto_clean:
	rm -rf $(PYBUILD_NAME).egg-info
	rm -rf PKG-INFO
	dh_auto_clean

1
debian/source/format vendored Normal file
View File

@ -0,0 +1 @@
3.0 (quilt)

14
sample.py Normal file
View File

@ -0,0 +1,14 @@
# Sample script: write a small spreadsheet with tabularfile.write().
import datetime

from tabularfile import write

# No format is given, so write() uses its default one ('ods').  The two
# *_format templates are strftime patterns for date and datetime cells.
with write('sheet.ods', date_format='%d/%m/%Y', datetime_format='%d/%m/%Y %H:%M:%S') as writer:
    # header row, then several rows at once
    writer.writerow(['date', 'count'])
    writer.writerows([
        [datetime.date(2019, 12, 1), 123],
        [datetime.date(2020, 1, 1), 156],
        [datetime.datetime(2020, 1, 1, 12, 0, 0), 156],
    ])
    # cell_writer collects cells one at a time; the collected cells are
    # written as a single row when the inner block exits.  Passing href
    # turns the cell into a link.
    with writer.cell_writer as cell_writer:
        cell_writer('coucou', href='https://example.com/')
        cell_writer('coucou', href='https://example.com/a/')

73
setup.py Normal file
View File

@ -0,0 +1,73 @@
# tabularfile
# Copyright (C) 2020 Entr'ouvert
import os
import subprocess
from distutils.command.sdist import sdist
import setuptools
def get_version():
    '''Return the version string of the package.

    Resolution order:

    1. the content of the VERSION file when it exists, stripped of
       surrounding whitespace;
    2. inside a git checkout, the output of ``git describe`` restricted to
       ``v*`` tags, rewritten as ``<tag>.post<commits>+<hash>`` when HEAD is
       not exactly on a tag;
    3. inside a git checkout where ``git describe`` fails (no matching tag),
       ``0.0.post<N>`` where N is the number of commits reachable from HEAD;
    4. ``'0.0'`` otherwise.
    '''
    if os.path.exists('VERSION'):
        with open('VERSION', 'r') as v:
            # a hand-written VERSION file usually ends with a newline, which
            # must not end up inside the version string
            return v.read().strip()
    if os.path.exists('.git'):
        p = subprocess.Popen(
            ['git', 'describe', '--dirty=.dirty', '--match=v*'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        result = p.communicate()[0]
        if p.returncode == 0:
            result = result.decode('ascii').strip()[1:]  # strip spaces/newlines and initial v
            if '-' in result:  # not a tagged version
                real_number, commit_count, commit_hash = result.split('-', 2)
                return '%s.post%s+%s' % (real_number, commit_count, commit_hash)
            return result
        # no tag matched: count the commits instead
        return '0.0.post%s' % len(subprocess.check_output(['git', 'rev-list', 'HEAD']).splitlines())
    return '0.0'
class eo_sdist(sdist):
    '''sdist command which generates a VERSION file for the duration of the
    source distribution build.'''

    def run(self):
        # remove any stale VERSION file first, otherwise get_version() would
        # just read it back instead of computing the current version
        if os.path.exists('VERSION'):
            os.remove('VERSION')
        version = get_version()
        with open('VERSION', 'w') as version_file:
            version_file.write(version)
        try:
            sdist.run(self)
        finally:
            # clean up even when sdist fails, so that a leftover VERSION file
            # cannot pin the version of later builds
            if os.path.exists('VERSION'):
                os.remove('VERSION')
# Use eo_sdist in place of the stock sdist command.
cmdclass = {'sdist': eo_sdist}

# The README file is used verbatim as the long description.
with open('README', 'r') as fh:
    long_description = fh.read()

setuptools.setup(
    name='tabularfile',
    version=get_version(),
    author='Benjamin Dauvergne',
    author_email='bdauvergne@entrouvert.com',
    # NOTE(review): this wording looks like a packaging-template placeholder;
    # confirm and replace with a real summary.
    description='A small example package',
    long_description=long_description,
    long_description_content_type='text/x-rst',
    url='https://dev.entrouvert.org/projects/tabularfile/',
    # the tests package is not installed
    packages=setuptools.find_packages(exclude=['tests']),
    classifiers=[
        'Programming Language :: Python :: 3',
        # NOTE(review): the headers of the tabularfile modules refer to the
        # GNU AGPL; confirm which license applies.
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.5',
    install_requires=['lxml'],
    cmdclass=cmdclass,
)

21
tabularfile/__init__.py Normal file
View File

@ -0,0 +1,21 @@
# tabularfile - tabular files for humans
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from .loader import load
from .writer import write
from .common import TabularFileError
__all__ = ['load', 'write', 'TabularFileError']

59
tabularfile/common.py Normal file
View File

@ -0,0 +1,59 @@
# tabularfile.common - shared exception and ISO 8601 date parsing helpers
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
try:
import isodate
except ImportError:
isodate = False
try:
import dateutil.parser as dateutil_parser
except ImportError:
dateutil_parser = None
class TabularFileError(Exception):
    '''Base class of the errors raised by tabularfile.'''
def parse_datetime(value):
    '''Parse an ISO 8601 date-time string and return a datetime.

    A value without a 'T' separator is rejected with ValueError.  The first
    available parser is used, in this order: isodate, dateutil (isoparse
    when the installed version provides it, parse otherwise),
    datetime.datetime.fromisoformat, and finally strptime with the fixed
    pattern %Y-%m-%dT%H:%M:%S.
    '''
    if 'T' not in value:
        raise ValueError

    if isodate:
        return isodate.parse_datetime(value)

    if dateutil_parser:
        parse = getattr(dateutil_parser, 'isoparse', dateutil_parser.parse)
        return parse(value)

    fromisoformat = getattr(datetime.datetime, 'fromisoformat', None)
    if fromisoformat is not None:
        return fromisoformat(value)

    return datetime.datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
def parse_date(value):
    '''Parse an ISO 8601 date string and return a date.

    The first available parser is used, in this order: isodate, dateutil
    (isoparse when the installed version provides it, parse otherwise),
    datetime.date.fromisoformat, and finally strptime with the fixed pattern
    %Y-%m-%d.
    '''
    if isodate:
        return isodate.parse_date(value)

    if dateutil_parser:
        parse = getattr(dateutil_parser, 'isoparse', dateutil_parser.parse)
        return parse(value).date()

    fromisoformat = getattr(datetime.date, 'fromisoformat', None)
    if fromisoformat is not None:
        return fromisoformat(value)

    return datetime.datetime.strptime(value, '%Y-%m-%d').date()

144
tabularfile/csv.py Normal file
View File

@ -0,0 +1,144 @@
# tabularfile.csv - simple csv reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import contextlib
import datetime
import io
from .common import TabularFileError
# work around https://github.com/Ousret/charset_normalizer/issues/33
try:
import charset_normalizer.normalizer
import sys
charset_normalizer.normalizer.python_version_tuple = lambda: tuple(sys.version_info)[:3]
except ImportError:
pass
class Reader:
    '''Iterate over the rows of a CSV byte stream.

    fh is a readable and seekable binary file object.  encoding is the
    character encoding; when it is empty or 'autodetect', detection is
    attempted with the optional charamel package and TabularFileError is
    raised if no encoding could be determined.

    dialect, delimiter and quotechar are forwarded to csv.reader when at
    least one of them is given; otherwise the dialect is sniffed from the
    beginning of the content.  Extra keyword arguments are ignored.
    '''

    def __init__(self, fh, encoding=None, dialect=None, delimiter=None, quotechar=None, **kwargs):
        offset = fh.tell()
        header = fh.read(1024 * 64)
        self.encoding = encoding
        if not self.encoding or self.encoding == 'autodetect':
            self.encoding = None
            try:
                import charamel
            except ImportError:
                pass
            else:
                self.encoding = charamel.Detector().detect(header) or 'utf-8'
            if not self.encoding:
                raise TabularFileError('encoding cannot be autodetected, please install charamel or charset_normalizer')
        fh.seek(offset)
        text_fh = io.TextIOWrapper(fh, encoding=self.encoding)
        self._offset = text_fh.tell()
        if dialect or delimiter or quotechar:
            # Only forward the options actually provided: an explicit
            # delimiter=None is rejected by csv.reader and quotechar=None
            # disables quote handling altogether.
            options = {
                'dialect': dialect,
                'delimiter': delimiter,
                'quotechar': quotechar,
            }
            self.csv_kwargs = {name: value for name, value in options.items() if value}
        else:
            self.csv_kwargs = self._detect_dialect(text_fh)
        # rewind so that iteration starts from the first row
        text_fh.seek(self._offset)
        self.text_fh = text_fh

    @staticmethod
    def _detect_dialect(text_fh):
        '''Return csv.reader keyword arguments guessed from the content.'''
        sniffer = csv.Sniffer()
        sample = ''
        for _ in range(10):
            chunk = text_fh.read(1024 * 5)
            sample += chunk
            try:
                return {'dialect': sniffer.sniff(sample)}
            except csv.Error:
                if not chunk:
                    # end of input, sniffing the same sample again is useless
                    break
        # dummy detection: pick the most frequent of ',' and ';' on the first
        # line; an empty input has no first line and falls back to ','
        lines = sample.splitlines()
        line0 = lines[0] if lines else ''
        delimiter = ',' if line0.count(',') >= line0.count(';') else ';'
        return {'delimiter': delimiter}

    def __iter__(self):
        try:
            for row in csv.reader(self.text_fh, **self.csv_kwargs):
                yield row
        except Exception as e:
            raise TabularFileError('parsing error') from e
        finally:
            # rewind, so that the reader can be iterated again
            self.text_fh.seek(self._offset)

    @property
    def sheets(self):
        # a CSV file is reported as a single sheet, identified by index 0
        return [0]
class Writer:
    '''Write rows to a text stream in CSV format.

    dialect, delimiter and quotechar are handed to csv.writer when set; when
    none of them is set the csv.excel dialect is used.  date and datetime
    cells are rendered with the date_format and datetime_format strftime
    templates, any other cell with str().  Extra keyword arguments are
    ignored.
    '''

    def __init__(self,
                 text_fh,
                 dialect=None,
                 delimiter=None,
                 quotechar=None,
                 date_format='%Y-%m-%d',
                 datetime_format='%Y-%m-%dT%H:%M:%S%z',
                 **kwargs):
        if not (dialect or delimiter or quotechar):
            dialect = csv.excel
        options = {'dialect': dialect, 'delimiter': delimiter, 'quotechar': quotechar}
        provided = {name: value for name, value in options.items() if value}
        self.csv_writer = csv.writer(text_fh, **provided)
        self.date_format = date_format
        self.datetime_format = datetime_format

    def _format_cell(self, cell):
        '''Return the text written for one cell.'''
        # datetime is tested first because it is a subclass of date
        if isinstance(cell, datetime.datetime):
            return cell.strftime(self.datetime_format)
        if isinstance(cell, datetime.date):
            return cell.strftime(self.date_format)
        return str(cell)

    def writerow(self, row):
        '''Format and write a single row.'''
        self.csv_writer.writerow([self._format_cell(cell) for cell in row])

    def writerows(self, rows):
        '''Format and write each row of an iterable of rows.'''
        for current in rows:
            self.writerow(current)
@contextlib.contextmanager
def writer(fh, encoding='utf-8', **kwargs):
    '''Context manager yielding a Writer which encodes its output to the
    binary stream fh.

    encoding is the character encoding of the produced file; other keyword
    arguments are passed to Writer.  fh is left open on exit, closing it is
    the responsibility of the caller.
    '''
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; with the default mode '\n' is translated on
    # output, which corrupts the '\r\n' terminators on some platforms.
    text_fh = io.TextIOWrapper(fh, encoding=encoding, newline='')
    try:
        yield Writer(text_fh, **kwargs)
    finally:
        text_fh.flush()
        # Detach the wrapper: otherwise fh would be closed as soon as the
        # wrapper is garbage collected, although fh belongs to the caller.
        text_fh.detach()

59
tabularfile/loader.py Normal file
View File

@ -0,0 +1,59 @@
# tabularfile.loader - tabular file for humans
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import contextlib
import io
from .common import TabularFileError
@contextlib.contextmanager
def fh_from_path_or_file(path_or_file):
    '''Context manager yielding a binary file object for the given source.

    path_or_file can be:

    * a bytes string, which is wrapped in an io.BytesIO;
    * an opened binary file object, yielded as is and left open on exit;
    * anything else, which is handed to open() in 'rb' mode and closed on
      exit.

    Raises TabularFileError for a text stream, or for a file object which is
    not both readable and seekable.
    '''
    if isinstance(path_or_file, bytes):
        yield io.BytesIO(path_or_file)
    elif isinstance(path_or_file, io.TextIOBase):
        raise TabularFileError('file handle must be a bytes stream')
    elif isinstance(path_or_file, io.IOBase):
        # readable and seekable are methods: they must be called, the bound
        # methods themselves are always true
        if not (path_or_file.readable() and path_or_file.seekable()):
            raise TabularFileError('file handle must be readable and seekable')
        yield path_or_file
    else:
        with open(path_or_file, 'rb') as fh:
            yield fh
@contextlib.contextmanager
def load(path_or_file, format=None, **kwargs):
    '''Context manager yielding a reader for a tabular file.

    path_or_file is a bytes string, a binary file object or a path.  format
    is 'ods' or 'csv'; when it is None the content is treated as ODS if it
    starts with the zip signature and as CSV otherwise.  Other keyword
    arguments are passed to the reader of the selected backend.

    Raises TabularFileError for any other format.
    '''
    from . import ods, csv

    with fh_from_path_or_file(path_or_file) as fh:
        # peek at the beginning of the content, then rewind
        start = fh.tell()
        header = fh.read(1024)
        fh.seek(start)

        if format is None:
            format = 'ods' if header[:4] == b'PK\x03\x04' else 'csv'

        if format == 'ods':
            reader = ods.Reader(fh, **kwargs)
        elif format == 'csv':
            reader = csv.Reader(fh, **kwargs)
        else:
            raise TabularFileError('unknown format %r' % format)
        yield reader

454
tabularfile/ods.py Normal file
View File

@ -0,0 +1,454 @@
# tabularfile.ods - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import contextlib
import datetime
import io
from lxml import etree as ET
import re
import sys
import tempfile
import zipfile
from .common import TabularFileError, parse_date, parse_datetime
class ODSUnsupportedCellException(TabularFileError):
    '''Raised when a cell uses a construct the ODS reader does not handle.'''
class Namespace:
    '''Callable building qualified names of the form {url}name for a given
    XML namespace URL.'''

    def __init__(self, url):
        self.url = url

    def __call__(self, name):
        return '{{{}}}{}'.format(self.url, name)
# Qualified names ({namespace}local-name) of the OpenDocument elements and
# attributes used by the reader and the writer below.

# text: namespace
TEXT_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:text:1.0')
P = TEXT_NS('p')
A = TEXT_NS('a')

# table: namespace
TABLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:table:1.0')
TABLE = TABLE_NS('table')
TABLE_NAME = TABLE_NS('name')
COLUMN = TABLE_NS('table-column')
ROW = TABLE_NS('table-row')
CELL = TABLE_NS('table-cell')
COVERED_CELL = TABLE_NS('covered-table-cell')
NUMBER_COLUMNS_REPEATED = TABLE_NS('number-columns-repeated')
NUMBER_ROWS_REPEATED = TABLE_NS('number-rows-repeated')
NUMBER_COLUMNS_SPANNED = TABLE_NS('number-columns-spanned')
NUMBER_ROWS_SPANNED = TABLE_NS('number-rows-spanned')
TABLE_STYLE_NAME = TABLE_NS('style-name')

# office: namespace
OFFICE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:office:1.0')
DOCUMENT_CONTENT = OFFICE_NS('document-content')
BODY = OFFICE_NS('body')
SPREADSHEET = OFFICE_NS('spreadsheet')
VALUE_TYPE = OFFICE_NS('value-type')
VALUE = OFFICE_NS('value')
DATE_VALUE = OFFICE_NS('date-value')
DOCUMENT_STYLES = OFFICE_NS('document-styles')
FONT_FACE_DECLS = OFFICE_NS('font-face-decls')
STYLES = OFFICE_NS('styles')

# style: namespace
STYLE_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:style:1.0')
STYLE = STYLE_NS('style')
STYLE_NAME = STYLE_NS('name')
FAMILY = STYLE_NS('family')
DATA_STYLE_NAME = STYLE_NS('data-style-name')
# NOTE(review): the OpenDocument attribute is presumably spelled
# style:parent-style-name; confirm that 'parent-style' is intended.
PARENT_STYLE = STYLE_NS('parent-style')
TABLE_COLUMN_PROPERTIES = STYLE_NS('table-column-properties')
COLUMN_WIDTH = STYLE_NS('column-width')

# number: namespace (data styles)
NUMBER_NS = Namespace('urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0')
DATE_STYLE = NUMBER_NS('date-style')
NUMBER_STYLE = NUMBER_NS('style')
YEAR = NUMBER_NS('year')
MONTH = NUMBER_NS('month')
DAY = NUMBER_NS('day')
HOURS = NUMBER_NS('hours')
MINUTES = NUMBER_NS('minutes')
SECONDS = NUMBER_NS('seconds')
TEXT = NUMBER_NS('text')

# xlink: namespace
XLINK_NS = Namespace('http://www.w3.org/1999/xlink')
XLINK_HREF = XLINK_NS('href')

# NOTE(review): not referenced anywhere in the visible part of this module.
VALUE_WITH_HREF_CLASS_REGISTRY = {}
def value_with_href(value, href=None):
    '''Return value wrapped in a LinkedValue when href is set, value itself
    otherwise.'''
    return LinkedValue(value, href) if href else value
def text_content(node):
    '''Return the concatenation of every text fragment found in node and its
    descendants, in document order (what libxml calls xmlNodeGetContent).'''
    fragments = node.itertext()
    return ''.join(fragments)
# office:value-type values that cell_content() tries to convert to Python
# objects when typed=True; cells of any other type are returned as text.
TYPED_VALUE_TYPES = ['float', 'date']
class LinkedValue(object):
    '''A cell value carrying a hyperlink.

    value is the content of the cell and href the target of the link; href
    is mandatory, an empty one raises AssertionError.  Two LinkedValue are
    equal when both their value and href are equal, and equal instances
    hash alike so they can be used in sets and as dict keys.
    '''

    def __init__(self, value, href):
        # explicit raise rather than an assert statement, so that the check
        # is not stripped by python -O; the exception type is unchanged
        if not href:
            raise AssertionError('href is mandatory')
        self.value = value
        self.href = href

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.value == other.value and self.href == other.href

    def __hash__(self):
        # defining __eq__ alone makes a class unhashable
        return hash((self.value, self.href))

    def __repr__(self):
        return 'LinkedValue({self.value!r}, {self.href!r})'.format(self=self)
def cell_content(elem, typed=False):
    '''Return the content of a table:table-cell element.

    By default the content is the text of the paragraphs (text:p) of the
    cell, joined with newlines.  With typed=True, a cell whose
    office:value-type is 'float' is returned as a float and a cell whose
    type is 'date' as a datetime, or as a date when the value is not a
    date-time; the text is returned whenever the conversion fails.
    '''
    value_type = elem.attrib.get(VALUE_TYPE, 'string')
    if typed and value_type in TYPED_VALUE_TYPES:
        if value_type == 'float':
            try:
                return float(elem.attrib.get(VALUE, ''))
            except ValueError:
                pass
        elif value_type == 'date':
            value = elem.attrib.get(DATE_VALUE, '')
            # a date-time is tried first, parse_datetime rejects plain dates
            for parse in (parse_datetime, parse_date):
                try:
                    return parse(value)
                except ValueError:
                    pass
    # the text of each paragraph must be extracted from the paragraph
    # itself: extracting it from the whole cell repeated the complete text
    # once per paragraph
    return '\n'.join(text_content(subelem) for subelem in elem.iter(tag=P))
def cell_repeat(elem):
    '''Return how many times a cell is repeated according to its
    table:number-columns-repeated attribute; 1 when the attribute is
    missing, not an integer or lower than 1.'''
    declared = elem.attrib.get(NUMBER_COLUMNS_REPEATED)
    if declared is None:
        return 1
    try:
        return max(int(declared), 1)
    except ValueError:
        return 1
def row_repeat(elem):
    '''Return how many times a row is repeated according to its
    table:number-rows-repeated attribute; 1 when the attribute is missing,
    not an integer or lower than 1.'''
    declared = elem.attrib.get(NUMBER_ROWS_REPEATED)
    if declared is None:
        return 1
    try:
        return max(int(declared), 1)
    except ValueError:
        return 1
class Reader:
    '''Lazily read the rows of one sheet of an ODS document.

    ods_content is either the bytes of the document or anything accepted by
    zipfile.ZipFile.  sheet selects the sheet to read, by zero-based index
    or by name (table:name attribute).  With typed=True the float and date
    cells are converted by cell_content(); with xlink=True a cell made of a
    single link is returned as a LinkedValue.  Extra keyword arguments are
    ignored.

    Iterating yields each row as a list of cell values; empty cells at the
    end of a row are not returned.  ODSUnsupportedCellException is raised
    when a merged cell is met.
    '''

    def __init__(self, ods_content, sheet=0, typed=False, xlink=False, **kwargs):
        self.ods_content = ods_content
        self.sheet = sheet
        self.typed = typed
        self.xlink = xlink

    @contextlib.contextmanager
    def _zip(self):
        '''Open the document as a zip archive.'''
        if isinstance(self.ods_content, bytes):
            source = io.BytesIO(self.ods_content)
        else:
            source = self.ods_content
        with zipfile.ZipFile(source) as _zip:
            yield _zip

    @property
    def sheets(self):
        '''Names of the sheets, in document order; a sheet without name is
        represented by its index.'''
        sheets = []
        with self._parser_context(events=('start',), tag=TABLE) as context:
            idx = 0
            for event, elem in context:
                name = idx
                idx += 1
                if TABLE_NAME in elem.attrib:
                    name = elem.attrib[TABLE_NAME]
                sheets.append(name)
        # kept from the original code; it does not cache anything since the
        # property takes precedence over the instance dictionary
        self.__dict__['sheets'] = sheets
        return sheets

    @contextlib.contextmanager
    def _parser_context(self,
                        events=('start', 'end'),
                        tag=(TABLE, ROW, CELL, COVERED_CELL),
                        **kwargs):
        '''Yield an incremental parser over content.xml, restricted to the
        given events and tags.'''
        with self._zip() as _zip:
            with _zip.open('content.xml', mode='r') as fd:
                yield ET.iterparse(fd, events=events, tag=tag, **kwargs)

    def __iter__(self):
        with self._parser_context() as context:
            yield from self._parse_target_sheet(context)

    def _parse_target_sheet(self, context):
        '''Skip the tables until the selected one and yield its rows.'''
        idx = 0
        for event, elem in context:
            if event == 'start' and elem.tag == TABLE:
                # the sheet can be designated by index or by name
                if idx == self.sheet or elem.attrib.get(TABLE_NAME) == self.sheet:
                    yield from self._parse_sheet(context)
                    break
                idx += 1
            # release the memory of the elements of skipped sheets
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    def _parse_sheet(self, context):
        '''Yield the rows of the current table, honoring row repetition.'''
        for event, elem in context:
            if event == 'end' and elem.tag == TABLE:
                elem.clear()
                break
            elif event == 'start' and elem.tag == ROW:
                repeat = row_repeat(elem)
                row = self._parse_row(context)
                for _ in range(repeat):
                    yield list(row)
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    def _parse_row(self, context):
        '''Consume the events of the current row and return its cells.'''
        row = []
        idx = 0
        for event, elem in context:
            if event == 'end' and elem.tag == ROW:
                elem.clear()
                parent = elem.getparent()
                if parent is not None:
                    parent.remove(elem)
                return row
            elif event == 'start' and elem.tag == CELL:
                # ignore last repeated empty cell
                content, repeat = self._parse_cell(context)
                # only the empty string means "empty cell": a truthiness test
                # would also drop typed cells whose value is 0
                if content != '':
                    if len(row) != idx:
                        # complete missing cells if previous cell was empty and repeated
                        row.extend([''] * (idx - len(row)))
                    row.extend([content] * repeat)
                idx += repeat
            else:
                elem.clear()

    def _parse_cell(self, context):
        '''Consume the events of the current cell and return a tuple
        (content, number of repetitions).'''
        for event, elem in context:
            if event == 'end' and elem.tag == COVERED_CELL:
                raise ODSUnsupportedCellException('table:covered-table-cell is unsupported')
            if event == 'end' and elem.tag == CELL:
                if NUMBER_COLUMNS_SPANNED in elem.attrib or NUMBER_ROWS_SPANNED in elem.attrib:
                    raise ODSUnsupportedCellException(
                        'fusioned cells are unsupported '
                        '(table:number-rows-spanned and '
                        'table:number-columns-spanned attributes)'
                    )
                value = cell_content(elem, typed=self.typed)
                # text-cell:
                #   p:
                #     a:
                #       (text, href=...)
                if (self.xlink
                        and len(elem)
                        and elem[0].text is None
                        and len(elem[0]) == 1
                        and elem[0][0].tag == A):
                    value = value_with_href(value, href=elem[0][0].attrib.get(XLINK_HREF))
                return value, cell_repeat(elem)
def styles_xml_content(date_format, datetime_format):
    '''Return the serialized styles.xml document.

    It declares a 'Default' style plus two table-cell styles, 'Date' and
    'DateTime', whose number formats are derived from the date_format and
    datetime_format strftime templates.
    '''
    # elements generated for the supported strftime directives; any other
    # fragment of the template is written as literal text
    directives = {
        '%Y': YEAR,
        '%m': MONTH,
        '%d': DAY,
        '%H': HOURS,
        '%M': MINUTES,
        '%S': SECONDS,
    }

    root = ET.Element(DOCUMENT_STYLES)
    styles = ET.SubElement(root, STYLES)
    # default style, inherited by other styles
    ET.SubElement(styles, STYLE, attrib={STYLE_NAME: 'Default'})

    def define_date_style(name, strftime_string):
        format_name = name + 'NumberFormat'
        date_style = ET.SubElement(styles, DATE_STYLE, attrib={STYLE_NAME: format_name})
        for part in re.findall(r'%?.', strftime_string):
            tag = directives.get(part)
            if tag is None:
                ET.SubElement(date_style, TEXT).text = part
            else:
                ET.SubElement(date_style, tag, attrib={NUMBER_STYLE: 'long'})
        ET.SubElement(
            styles,
            STYLE,
            attrib={
                STYLE_NAME: name,
                FAMILY: 'table-cell',
                DATA_STYLE_NAME: format_name,
                PARENT_STYLE: 'Default',
            },
        )

    for style_name, template in (('Date', date_format), ('DateTime', datetime_format)):
        define_date_style(style_name, template)
    return ET.tostring(root)
class Writer:
    '''Write an ODS document made of a single sheet.

    target is handed to zipfile.ZipFile opened in 'w' mode.  date_format and
    datetime_format are strftime templates used to build the display styles
    of date and datetime cells.
    '''

    def __init__(self, target, date_format='%Y-%m-%d', datetime_format='%Y-%m-%dT%H:%M:%S'):
        self.target = target
        self.date_format = date_format
        self.datetime_format = datetime_format

    @contextlib.contextmanager
    def _zip(self):
        '''Create the zip container, write its fixed members (mimetype,
        manifest and styles.xml) and yield it so content.xml can be added.'''
        with zipfile.ZipFile(self.target, 'w') as _zip:
            _zip.writestr('mimetype', 'application/vnd.oasis.opendocument.spreadsheet')
            _zip.writestr(
                'META-INF/manifest.xml',
                '''<?xml version="1.0" encoding="UTF-8"?>
<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0">
<manifest:file-entry manifest:full-path="/" manifest:media-type="application/vnd.oasis.opendocument.spreadsheet"/>
<manifest:file-entry manifest:full-path="styles.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="content.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="META-INF/manifest.xml" manifest:media-type="text/xml"/>
<manifest:file-entry manifest:full-path="mimetype" manifest:media-type="text/plain"/>
</manifest:manifest>''',
            )
            _zip.writestr(
                'styles.xml',
                styles_xml_content(date_format=self.date_format, datetime_format=self.datetime_format),
            )
            yield _zip

    @contextlib.contextmanager
    def _serialization_context(self):
        '''Yield an incremental XML writer whose output becomes the
        content.xml member of the archive.'''
        with self._zip() as _zip:
            if sys.version_info >= (3, 6):
                # stream directly into the zip member
                with _zip.open('content.xml', mode='w') as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
            else:
                # we must use a temporary file before python 3.6
                with tempfile.NamedTemporaryFile() as fd:
                    with ET.xmlfile(fd) as xml_writer:
                        yield xml_writer
                    # the XML writer is closed at this point: flush the file
                    # and copy it into the archive before it is deleted
                    fd.flush()
                    _zip.write(fd.name, 'content.xml')

    class WriterHelper:
        '''Object handed to the user, exposing writerow(), writerows() and
        cell_writer on top of the incremental XML writer.'''

        def __init__(self, context):
            self.context = context

        def writerow(self, row):
            '''Write one table row; row is an iterable of cell values.'''

            def writecell(raw_value):
                attrib = {}
                # a LinkedValue is typed according to the value it wraps
                if isinstance(raw_value, LinkedValue):
                    value = raw_value.value
                else:
                    value = raw_value
                if isinstance(value, (float, int)):
                    value_type = 'float'
                    text_value = str(value)
                    attrib[VALUE] = text_value
                # datetime is tested before date since it is a subclass of it
                elif isinstance(value, datetime.datetime):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'DateTime'
                elif isinstance(value, datetime.date):
                    value_type = 'date'
                    text_value = value.isoformat()
                    attrib[DATE_VALUE] = text_value
                    attrib[TABLE_STYLE_NAME] = 'Date'
                else:
                    value_type = 'string'
                    text_value = str(value)
                attrib[VALUE_TYPE] = value_type
                with self.context.element(CELL, attrib=attrib):
                    with self.context.element(P):
                        if isinstance(raw_value, LinkedValue):
                            # wrap the text in a text:a element carrying the link
                            with self.context.element(A, attrib={XLINK_HREF: raw_value.href}):
                                self.context.write(text_value)
                        else:
                            self.context.write(text_value)

            with self.context.element(ROW):
                for value in row:
                    writecell(value)

        def writerows(self, rows):
            '''Write each row of an iterable of rows.'''
            for row in rows:
                self.writerow(row)

        @property
        @contextlib.contextmanager
        def cell_writer(self):
            '''Context manager yielding a function write(value, href=None)
            which collects cells; the collected cells are written as a
            single row when the block exits.'''
            row = []

            def write(value, href=None):
                if href:
                    raw_value = LinkedValue(value, href)
                else:
                    raw_value = value
                row.append(raw_value)

            yield write
            self.writerow(row)

    @contextlib.contextmanager
    def open(self):
        '''Open the document down to its single table element and yield a
        WriterHelper used to add rows to it.'''
        with self._serialization_context() as context:
            with context.element(DOCUMENT_CONTENT):
                with context.element(BODY):
                    with context.element(SPREADSHEET):
                        with context.element(TABLE):
                            yield self.WriterHelper(context)
@contextlib.contextmanager
def writer(target, **kwargs):
    '''Context manager yielding an object used to write rows into the ODS
    document target; keyword arguments are passed to Writer.'''
    ods_writer = Writer(target, **kwargs)
    with ods_writer.open() as helper:
        yield helper

60
tabularfile/writer.py Normal file
View File

@ -0,0 +1,60 @@
# tabularfile.writer - tabular file for humans
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import contextlib
import io
import weakref
from .common import TabularFileError
@contextlib.contextmanager
def fh_from_path_or_file(path_or_file):
    '''Context manager yielding a binary file object to write to.

    An opened file object is yielded as is and left open on exit; anything
    else is handed to open() in 'wb' mode and closed on exit.

    Raises TabularFileError for a file object which is not both writable
    and seekable.
    '''
    if isinstance(path_or_file, io.IOBase):
        # writable and seekable are methods: they must be called, the bound
        # methods themselves are always true
        if not (path_or_file.writable() and path_or_file.seekable()):
            raise TabularFileError('file handle must be writable and seekable')
        yield path_or_file
    else:
        with open(path_or_file, 'wb') as fh:
            yield fh
class WeakrefHolder:
    '''Context manager keeping a reference to an object and exposing it
    through a weak proxy; the reference is dropped on exit.'''

    def __init__(self, value):
        self.value = value

    def __enter__(self):
        held = self.value
        return weakref.proxy(held)

    def __exit__(self, exc_type, exc_value, traceback):
        self.value = None
@contextlib.contextmanager
def write(path_or_file, format='ods', **kwargs):
    '''Context manager yielding a writer for a tabular file.

    path_or_file is a binary file object or a path.  format is 'ods' (the
    default) or 'csv'; other keyword arguments are passed to the writer of
    the selected backend.  The yielded object is a weak proxy of the
    writer.

    Raises ValueError for any other format.
    '''
    # validate before touching the target: opening a path in 'wb' mode
    # creates or truncates the file, which must not happen for a call that
    # is going to fail anyway
    if format not in ('ods', 'csv'):
        raise ValueError('invalid format %r' % format)

    from . import ods, csv

    backend = ods if format == 'ods' else csv
    with fh_from_path_or_file(path_or_file) as fh:
        with backend.writer(fh, **kwargs) as _writer:
            with WeakrefHolder(_writer) as proxy:
                yield proxy

0
tests/__init__.py Normal file
View File

BIN
tests/data/test1.ods Normal file

Binary file not shown.

Binary file not shown.

109
tests/test_csv.py Normal file
View File

@ -0,0 +1,109 @@
# tabularfile - simple ods tabfile and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
import datetime
import resource
try:
import charamel
except ImportError:
charamel = None
import pytz
import pytest
from tabularfile import load, write
@pytest.mark.parametrize('encoding', ['utf_8', 'utf_8_sig', 'latin1', 'cp1252', 'iso-8859-15'])
def test_encoding(encoding, pass_encoding=True):
    '''Load a CSV document encoded with the given encoding and check its
    first two rows.

    With pass_encoding=False no encoding is passed to load(), so the reader
    has to detect it.
    '''
    # characters the encoding cannot represent are replaced when encoding
    # the fixture; the last line has fewer columns than the others
    with load('''\
username,email,password,prix
"John à Doé",john.doe@example.com,abcd1234,"10,24 €"
"J'ai du bon tabac dans ma tabatière",john.doe@example.com,abcd1234,"10,24 €"
"L'écran est tombé dans la forêt avec un œuf et un nœud à noël. John à Doé",john.doe@example.com,abcd1234,"10,24 €"
"John à Doé",john.doe@example.com,"10,24 €"'''.encode(encoding, 'replace'),
              encoding=encoding if pass_encoding else None) as tabfile:
        def reencode(rows):
            # apply the same lossy round trip to the expected values
            for row in rows:
                yield [cell.encode(encoding, 'replace').decode(encoding) for cell in row]

        assert list(tabfile)[:2] == list(reencode([
            ['username', 'email', 'password', 'prix'],
            ['John à Doé', 'john.doe@example.com', 'abcd1234', '10,24 €'],
        ])), 'received encoding %s detected encoding %s' % (encoding, tabfile.encoding)
        assert tabfile.sheets == [0]
@pytest.mark.skipif(charamel is None, reason='charamel is missing')
@pytest.mark.parametrize('encoding', ['utf_8', 'utf_8_sig', 'latin1', 'cp1252', 'iso-8859-15'])
def test_charamel_encoding_detection(encoding):
    """Re-run ``test_encoding`` without telling ``load`` the encoding.

    Skipped when the optional ``charamel`` module could not be imported.
    """
    test_encoding(encoding=encoding, pass_encoding=False)
@pytest.mark.parametrize('encoding', ['utf-8', 'utf-8-sig', 'latin1', 'cp1252', 'iso8859-15'])
@pytest.mark.parametrize('quotechar', ['"', '\'', '#'])
@pytest.mark.parametrize('delimiter', [',', ';', '\t'])
def test_dialect_detection(delimiter, quotechar, encoding):
    """Serialize rows with a given delimiter/quotechar and load them back.

    ``load`` is only told the encoding; the delimiter and quote character
    are not passed to it.
    """
    import csv
    import io

    expected = [
        ['username', 'email', 'password'],
        ['John à Doé', 'john.doe@example.com', 'abcd1234'],
    ]
    buffer = io.StringIO()
    csv.writer(buffer, delimiter=delimiter, quotechar=quotechar).writerows(expected)
    payload = buffer.getvalue().encode(encoding)
    with load(payload, encoding=encoding) as tabfile:
        assert list(tabfile) == expected
def test_massive(tmp_path):
    """Check that reading a 300000-line CSV does not grow peak memory."""
    def peak_rss():
        return resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss

    csv_path = tmp_path / 'massive.csv'
    with csv_path.open('w') as out:
        for _ in range(300000):
            out.write('1,2,3,4\n')
    # baseline is taken once the fixture file exists, before any parsing
    baseline = peak_rss()
    with csv_path.open('rb') as fh:
        # the encoding is forced: per the original author's note, charamel
        # uses a lot of memory
        with load(fh, encoding='utf-8', delimiter=',') as tabfile:
            for index, _row in enumerate(tabfile):
                # ru_maxrss is in kilobytes on Linux, so the allowed growth
                # while parsing the 300000 lines is about 1 MB
                assert peak_rss() - baseline < 1000, 'row %s' % index
def test_writer_typed(tmp_path):
    """Write ints, a date and aware/naive datetimes as CSV and check the text."""
    path = tmp_path / 'target.csv'
    # a plain string is passed on interpreters older than 3.6, where
    # presumably open() does not accept path objects
    target = path if sys.version_info >= (3, 6) else str(path)
    rows = [
        [1, 2, 3],
        [
            datetime.date(2020, 1, 1),
            datetime.datetime(2020, 1, 1, 10, 10, 10, tzinfo=pytz.utc),
            datetime.datetime(2020, 1, 1, 10, 10, 10),
        ],
    ]
    with write(target, format='csv') as writer:
        writer.writerows(rows)
    # read back only after the writer context has been closed
    content = path.read_text()
    assert content == '1,2,3\n2020-01-01,2020-01-01T10:10:10+0000,2020-01-01T10:10:10\n'

158
tests/test_ods.py Normal file
View File

@ -0,0 +1,158 @@
# tabularfile - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
import io
import resource
from tabularfile import load, write, TabularFileError
from tabularfile.ods import LinkedValue
import pytest
def test_test1_ods():
    """Read both sheets of test1.ods untyped: every cell comes back as str."""
    ods_path = 'tests/data/test1.ods'
    sheet_names = ['Feuille1', 'Feuille2']
    second_sheet = [
        ['123'],
        ['20/06/20', '', '', '', 'efef'],
        ['20/06/20', '', '', '', '123'],
        ['je suis content'] * 6,
        [],
        ['', '', '', 'https://www.entrouvert.com/'],
        [],
        [],
        [],
        ['', '', '', '', '1312'],
    ]

    # the first sheet yields a single empty row
    with load(ods_path, sheet=0) as tabfile:
        first_sheet = list(tabfile)
        assert first_sheet == [[]]
        assert tabfile.sheets == sheet_names

    with load(ods_path, sheet=1) as tabfile:
        content = list(tabfile)
        assert content == second_sheet
        assert tabfile.sheets == sheet_names
def test_test1_ods_typed():
    """Read the second sheet of test1.ods with ``typed=True``.

    Numeric cells are expected as numbers and date cells as
    ``datetime.date`` values; text and empty cells stay strings.
    """
    day = datetime.date(2020, 6, 20)
    expected = [
        [123],
        [day, '', '', '', 'efef'],
        [day, '', '', '', 123],
        ['je suis content'] * 6,
        [],
        ['', '', '', 'https://www.entrouvert.com/'],
        [],
        [],
        [],
        ['', '', '', '', 1312],
    ]
    with load('tests/data/test1.ods', sheet=1, typed=True) as tabfile:
        content = list(tabfile)
        assert content == expected
# Typed sample rows written and read back by test_writer: three data rows,
# a row of six identical strings, five empty rows, then a final data row.
ROWS = [
    [123],
    [datetime.date(2020, 6, 20), '', '', '', 'efef'],
    [datetime.date(2020, 6, 20), '', '', '', 123],
    ['je suis content'] * 6,
    # a fresh list per empty row, so no two rows share the same object
    *([] for _ in range(5)),
    ['', '', '', '', 1312],
]
def test_test2_with_span():
    """Iterating a sheet that contains merged cells must raise.

    The sheet names are still readable before iteration starts.
    """
    with load('tests/data/test2_with_span.ods', sheet=1, typed=True) as tabfile:
        assert tabfile.sheets == ['Feuille1', 'Feuille2']
        expect_failure = pytest.raises(TabularFileError, match='fusioned cells are unsupported')
        with expect_failure:
            list(tabfile)
def test_writer():
    """Round-trip ROWS through the default writer and a typed load."""
    buffer = io.BytesIO()
    with write(buffer) as writer:
        writer.writerows(ROWS)
    # the bytes are taken only after the writer context has exited
    document = buffer.getvalue()
    with load(document, typed=True) as tabfile:
        reloaded = list(tabfile)
        assert reloaded == ROWS
def test_writer_cell_writer():
    """Write rows cell by cell, some with an ``href``, and read them back.

    With ``xlink=True`` the cells written with an ``href`` are expected back
    as ``LinkedValue`` instances; the other cells come back as plain values.
    """
    first_day = datetime.date(2019, 12, 1)
    second_day = datetime.date(2020, 12, 1)
    summary_url = 'https://example.com/summary/2020/12/01/'
    # one entry per row; each cell is (value, keyword arguments for write_cell)
    table = [
        [('date', {}), ('count', {})],
        [(first_day, {'href': summary_url}), (123, {'href': 'http://example.com'})],
        [(second_day, {}), (156, {})],
    ]

    output = io.BytesIO()
    with write(output) as writer:
        for cells in table:
            # a new cell_writer context is entered for every row
            with writer.cell_writer as write_cell:
                for value, options in cells:
                    write_cell(value, **options)

    with load(output.getvalue(), typed=True, xlink=True) as tabfile:
        rows = list(tabfile)
    assert rows == [
        ['date', 'count'],
        [
            LinkedValue(first_day, summary_url),
            LinkedValue(123.0, 'http://example.com'),
        ],
        [second_day, 156.0],
    ]
# Memory regression test for the writer: write 100000 rows to an ODS file on
# disk and check that the thread's peak RSS stays within 1000 ru_maxrss units
# of the baseline.
def test_massive_write(tmp_path):
# baseline peak resident set size, taken before anything is written
memory = resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss
with (tmp_path / 'massive.ods').open('wb') as fh:
with write(fh) as writer:
# the rows are produced by a generator expression, not a prebuilt list
writer.writerows([1, 2, 3, 4] for i in range(100000))
# NOTE(review): ru_maxrss is reported in kilobytes on Linux, so this bound
# is presumably about 1 MB of growth - confirm
assert resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss - memory < 1000
def test_massive_read(tmp_path):
    """Check that reading back a 100000-row ODS file keeps peak memory flat."""
    def peak_rss():
        return resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss

    ods_path = tmp_path / 'massive.ods'
    # build the fixture file first; its memory cost is not part of the check
    with ods_path.open('wb') as out, write(out) as writer:
        writer.writerows([1, 2, 3, 4] for i in range(100000))

    with ods_path.open('rb') as fh, load(fh) as tabfile:
        # baseline is taken after the document has been opened
        baseline = peak_rss()
        for index, _row in enumerate(tabfile):
            # ru_maxrss is in kilobytes on Linux, so the allowed growth
            # while parsing the 100000 lines is about 4 MB
            assert peak_rss() - baseline < 4000, 'row %s' % index

25
tox.ini Normal file
View File

@ -0,0 +1,25 @@
[tox]
toxworkdir = {env:TMPDIR:/tmp}/tox-{env:USER}/tabularfile/
envlist = py3,py{36,37,38}-charamel,py3-isodate,py3-dateutil
skip_missing_interpreters = true
[testenv]
deps =
pytest
pytest-cov
charamel: charamel
isodate: isodate
dateutil: python-dateutil
pytz
setenv =
JUNIT={tty::-o junit_suite_name={envname} --junit-xml=junit-{envname}.xml}
COVERAGE={tty::--cov=tabularfile --cov-branch --cov-append --cov-report xml --cov-report html}
TESTS=tests
charamel: TESTS=tests/test_csv.py -k charamel
isodate: TESTS=tests/test_ods.py -k ods_typed
dateutil: TESTS=tests/test_ods.py -k ods_typed
commands =
pytest {tty:--sw --durations=30:} {env:JUNIT:} {env:COVERAGE:} {posargs:{env:TESTS:tests}}
[pytest]
junit_family=xunit2