This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
tabularfile/tests/test_csv.py

110 lines
4.0 KiB
Python

# tabularfile - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
import datetime
import resource
try:
import charamel
except ImportError:
charamel = None
import pytz
import pytest
from tabularfile import load, write
@pytest.mark.parametrize('encoding', ['utf_8', 'utf_8_sig', 'latin1', 'cp1252', 'iso-8859-15'])
def test_encoding(encoding, pass_encoding=True):
    """Check that CSV bytes encoded with ``encoding`` load as the expected rows.

    The sample text is encoded with ``errors='replace'``; the expected rows go
    through the same encode/decode round trip before being compared with the
    first two rows produced by ``load``.

    When ``pass_encoding`` is false, ``encoding=None`` is handed to ``load``
    instead of the codec name.
    """
    payload = '''\
username,email,password,prix
"John à Doé",john.doe@example.com,abcd1234,"10,24 €"
"J'ai du bon tabac dans ma tabatière",john.doe@example.com,abcd1234,"10,24 €"
"L'écran est tombé dans la forêt avec un œuf et un nœud à noël. John à Doé",john.doe@example.com,abcd1234,"10,24 €"
"John à Doé",john.doe@example.com,"10,24 €"'''.encode(encoding, 'replace')
    declared_encoding = encoding if pass_encoding else None

    def roundtrip(text):
        # Apply the same lossy conversion that was applied to the payload.
        return text.encode(encoding, 'replace').decode(encoding)

    expected = [
        [roundtrip(cell) for cell in row]
        for row in (
            ['username', 'email', 'password', 'prix'],
            ['John à Doé', 'john.doe@example.com', 'abcd1234', '10,24 €'],
        )
    ]

    with load(payload, encoding=declared_encoding) as tabfile:
        # Only the header and the first data row are compared.
        assert list(tabfile)[:2] == expected, (
            'received encoding %s detected encoding %s' % (encoding, tabfile.encoding)
        )
        assert tabfile.sheets == [0]
@pytest.mark.skipif(charamel is None, reason='charamel is missing')
@pytest.mark.parametrize('encoding', ['utf_8', 'utf_8_sig', 'latin1', 'cp1252', 'iso-8859-15'])
def test_charamel_encoding_detection(encoding):
    """Run the ``test_encoding`` checks without telling ``load`` the encoding.

    Skipped when the optional ``charamel`` module could not be imported.
    """
    test_encoding(encoding=encoding, pass_encoding=False)
@pytest.mark.parametrize('encoding', ['utf-8', 'utf-8-sig', 'latin1', 'cp1252', 'iso8859-15'])
@pytest.mark.parametrize('quotechar', ['"', '\'', '#'])
@pytest.mark.parametrize('delimiter', [',', ';', '\t'])
def test_dialect_detection(delimiter, quotechar, encoding):
    """Check that rows written with a given CSV dialect are read back unchanged.

    Two rows are serialized with the standard ``csv`` writer using
    ``delimiter`` and ``quotechar``, encoded with ``encoding``, and handed to
    ``load`` together with the encoding only (the dialect is not passed).
    """
    import csv
    import io

    expected_rows = [
        ['username', 'email', 'password'],
        ['John à Doé', 'john.doe@example.com', 'abcd1234'],
    ]

    buffer = io.StringIO()
    csv.writer(buffer, delimiter=delimiter, quotechar=quotechar).writerows(expected_rows)
    payload = buffer.getvalue().encode(encoding)

    with load(payload, encoding=encoding) as tabfile:
        parsed_rows = list(tabfile)
        assert parsed_rows == expected_rows
def test_massive(tmp_path):
    """Check that iterating over a large CSV file does not make memory grow.

    A 300000-line CSV file is written under ``tmp_path`` and then read back row
    by row; after every row the growth of the current thread's peak resident
    set size since the start of the read is checked.

    The test is skipped where ``resource.RUSAGE_THREAD`` does not exist.
    """
    # RUSAGE_THREAD is not provided on every platform; skip instead of
    # failing with AttributeError there.
    if not hasattr(resource, 'RUSAGE_THREAD'):
        pytest.skip('resource.RUSAGE_THREAD is not available on this platform')

    path = tmp_path / 'massive.csv'
    with path.open('w') as fh:
        for _ in range(300000):
            fh.write('1,2,3,4\n')

    def peak_rss():
        # Peak resident set size of the current thread so far.
        return resource.getrusage(resource.RUSAGE_THREAD).ru_maxrss

    baseline = peak_rss()
    with path.open('rb') as fh:
        # charamel uses a lot of memory, so we force the encoding
        with load(fh, encoding='utf-8', delimiter=',') as tabfile:
            for i, _row in enumerate(tabfile):
                # ru_maxrss is reported in kilobytes on Linux, so the peak may
                # grow by less than 1000 kB (about 1 MB) while parsing the
                # 300000 lines.
                assert peak_rss() - baseline < 1000, 'row %s' % i
def test_writer_typed(tmp_path):
    """Check how the CSV writer serializes integers, dates and datetimes.

    One row of integers and one row holding a date, a UTC-aware datetime and a
    naive datetime are written to ``target.csv``; the resulting file content
    is compared with the expected text.
    """
    path = tmp_path / 'target.csv'
    # Before Python 3.6 the destination is passed as a plain string.
    destination = path if sys.version_info >= (3, 6) else str(path)

    rows = [
        [1, 2, 3],
        [
            datetime.date(2020, 1, 1),
            datetime.datetime(2020, 1, 1, 10, 10, 10, tzinfo=pytz.utc),
            datetime.datetime(2020, 1, 1, 10, 10, 10),
        ],
    ]
    with write(destination, format='csv') as writer:
        writer.writerows(rows)

    expected = (
        '1,2,3\n'
        '2020-01-01,2020-01-01T10:10:10+0000,2020-01-01T10:10:10\n'
    )
    with path.open() as fh:
        written = fh.read()
    assert written == expected