tabularfile/tests/test_csv.py

# tabularfile - simple ods reader and writer
# Copyright (C) 2020 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys

import datetime
import resource

try:
    import charamel
except ImportError:
    charamel = None

import pytz

import pytest

from tabularfile import load, write


@pytest.mark.parametrize('encoding', ['utf_8', 'utf_8_sig', 'latin1', 'cp1252', 'iso-8859-15'])
def test_encoding(encoding, pass_encoding=True):
    """Load an encoded CSV payload and check its first two rows.

    The CSV text is encoded with ``encoding``, replacing characters the
    codec cannot represent.  ``load`` receives the encoding name when
    ``pass_encoding`` is true and ``None`` otherwise.
    """
    payload = '''\
username,email,password,prix
"John à Doé",john.doe@example.com,abcd1234,"10,24 €"
"J'ai du bon tabac dans ma tabatière",john.doe@example.com,abcd1234,"10,24 €"
"L'écran est tombé dans la forêt avec un œuf et un nœud à noël. John à Doé",john.doe@example.com,abcd1234,"10,24 €"
"John à Doé",john.doe@example.com,"10,24 €"'''.encode(encoding, 'replace')

    def roundtrip(cell):
        # Apply the same lossy encode/decode cycle the payload went through,
        # so expected cells carry the same replacement characters.
        return cell.encode(encoding, 'replace').decode(encoding)

    expected = [
        [roundtrip(cell) for cell in row]
        for row in (
            ['username', 'email', 'password', 'prix'],
            ['John à Doé', 'john.doe@example.com', 'abcd1234', '10,24 €'],
        )
    ]

    declared_encoding = encoding if pass_encoding else None
    with load(payload, encoding=declared_encoding) as tabfile:
        rows = list(tabfile)
        assert rows[:2] == expected, 'received encoding %s detected encoding %s' % (encoding, tabfile.encoding)

        assert tabfile.sheets == [0]


@pytest.mark.skipif(charamel is None, reason='charamel is missing')
@pytest.mark.parametrize('encoding', ['utf_8', 'utf_8_sig', 'latin1', 'cp1252', 'iso-8859-15'])
def test_charamel_encoding_detection(encoding):
    """Run the ``test_encoding`` scenario without passing an encoding to ``load``.

    Skipped when the optional ``charamel`` module could not be imported.
    NOTE(review): presumably ``load`` relies on charamel to detect the
    encoding in that case -- confirm against the loader implementation.
    """
    test_encoding(encoding=encoding, pass_encoding=False)


@pytest.mark.parametrize('encoding', ['utf-8', 'utf-8-sig', 'latin1', 'cp1252', 'iso8859-15'])
@pytest.mark.parametrize('quotechar', ['"', '\'', '#'])
@pytest.mark.parametrize('delimiter', [',', ';', '\t'])
def test_dialect_detection(delimiter, quotechar, encoding):
    """Serialize rows with a given CSV dialect and check ``load`` reads them back.

    The rows are written with the stdlib ``csv`` writer using ``delimiter``
    and ``quotechar``, then encoded with ``encoding``.  ``load`` is only told
    the encoding, not the delimiter or quote character.
    """
    import csv
    import io

    expected = [
        ['username', 'email', 'password'],
        ['John à Doé', 'john.doe@example.com', 'abcd1234'],
    ]

    buffer = io.StringIO()
    csv.writer(buffer, delimiter=delimiter, quotechar=quotechar).writerows(expected)
    payload = buffer.getvalue().encode(encoding)

    with load(payload, encoding=encoding) as tabfile:
        parsed = list(tabfile)
        assert parsed == expected


def test_massive(tmp_path):
    """Check that iterating a 300000-line CSV keeps the thread's peak RSS stable.

    A large CSV file is written to ``tmp_path``; the thread's peak resident
    set size is sampled before loading and again after every row, and must
    never grow by 1000 units or more (kilobytes on Linux, i.e. about 1 MB).

    The test is skipped where ``resource.RUSAGE_THREAD`` does not exist,
    since that constant is Linux-specific.
    """
    rusage_thread = getattr(resource, 'RUSAGE_THREAD', None)
    if rusage_thread is None:
        pytest.skip('resource.RUSAGE_THREAD is not available on this platform')

    csv_path = tmp_path / 'massive.csv'
    # Write line by line so that building the fixture does not itself raise
    # the peak RSS before the baseline is taken.
    with csv_path.open('w') as fh:
        for _ in range(300000):
            fh.write('1,2,3,4\n')

    memory = resource.getrusage(rusage_thread).ru_maxrss
    with csv_path.open('rb') as fh:
        # charamel uses a lot of memory, so we force the encoding
        with load(fh, encoding='utf-8', delimiter=',') as tabfile:
            for i, _row in enumerate(tabfile):
                # ru_maxrss is reported in kilobytes on Linux, so parsing the
                # 300000 lines must use less than ~1 MB of extra memory
                assert resource.getrusage(rusage_thread).ru_maxrss - memory < 1000, 'row %s' % i


def test_writer_typed(tmp_path):
    """Write typed cells to a CSV file and check the exact text produced.

    The rows hold integers, a date, a UTC-aware datetime and a naive
    datetime; the final assertion pins the text read back from the file.
    """
    path = tmp_path / 'target.csv'
    # On Python older than 3.6 the target is handed over as a plain string.
    target = path if sys.version_info >= (3, 6) else str(path)

    day = datetime.date(2020, 1, 1)
    aware = datetime.datetime(2020, 1, 1, 10, 10, 10, tzinfo=pytz.utc)
    naive = datetime.datetime(2020, 1, 1, 10, 10, 10)
    rows = [
        [1, 2, 3],
        [day, aware, naive],
    ]

    with write(target, format='csv') as writer:
        writer.writerows(rows)

    expected = '1,2,3\n2020-01-01,2020-01-01T10:10:10+0000,2020-01-01T10:10:10\n'
    with path.open() as fh:
        content = fh.read()

    assert content == expected