debian-xmlschema/xmlschema/tests/test_regex.py

403 lines
18 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato <brunato@sissa.it>
#
"""
This module runs tests on XML Schema regular expressions.
"""
from __future__ import unicode_literals
import unittest
import sys
import re
from itertools import chain
from unicodedata import category
from xmlschema.exceptions import XMLSchemaValueError, XMLSchemaRegexError
from xmlschema.compat import unicode_chr
from xmlschema.codepoints import code_point_repr, iterparse_character_group, iter_code_points, \
UnicodeSubset, build_unicode_categories, UNICODE_CATEGORIES
from xmlschema.regex import get_python_regex, XsdRegexCharGroup
class TestCodePoints(unittest.TestCase):
    """Checks on the normalization performed by ``iter_code_points``."""

    def test_iter_code_points(self):
        # Each case is (input items, keyword arguments, expected sequence).
        # The expected values show single code points and overlapping or
        # adjacent ranges being merged into (start, stop) pairs, yielded in
        # ascending order by default and descending with reverse=True.
        cases = [
            ([10, 20, 11, 12, 25, (9, 21), 21], {}, [(9, 22), 25]),
            ([10, 20, 11, 12, 25, (9, 20), 21], {}, [(9, 22), 25]),
            ({2, 120, 121, (150, 260)}, {}, [2, (120, 122), (150, 260)]),
            (
                [10, 20, (10, 22), 11, 12, 25, 8, (9, 20), 21, 22, 9, 0],
                {},
                [0, (8, 23), 25],
            ),
            ([10, 20, 11, 12, 25, (9, 21)], {'reverse': True}, [25, (9, 21)]),
            (
                [10, 20, (10, 22), 11, 12, 25, 8, (9, 20), 21, 22, 9, 0],
                {'reverse': True},
                [25, (8, 23), 0],
            ),
        ]
        for items, options, expected in cases:
            self.assertEqual(list(iter_code_points(items, **options)), expected)
class TestUnicodeSubset(unittest.TestCase):
    """Checks on construction, mutation and set algebra of ``UnicodeSubset``."""

    def test_creation(self):
        # Adjacent single code points (11, 12) are expected to collapse into
        # one range; a string argument is parsed as a character group.
        upper = sys.maxunicode + 1
        subset = UnicodeSubset([(0, 9), 11, 12, (14, 32), (33, upper)])
        self.assertEqual(subset, [(0, 9), (11, 13), (14, 32), (33, upper)])
        self.assertEqual(UnicodeSubset('0-9'), [(48, 58)])
        self.assertEqual(UnicodeSubset('0-9:'), [(48, 59)])

    def test_modify(self):
        # The statements below form one ordered scenario: every assertion
        # depends on all the add()/discard() calls that precede it.
        subset = UnicodeSubset([50, 90, 10, 90])
        self.assertEqual(subset, [10, 50, 90])

        # Values outside 0..sys.maxunicode must be rejected.
        self.assertRaises(XMLSchemaValueError, subset.add, -1)
        self.assertRaises(XMLSchemaValueError, subset.add, sys.maxunicode + 1)

        subset.add((100, 20001))
        subset.discard((100, 19001))
        self.assertEqual(subset, [10, 50, 90, (19001, 20001)])

        subset.add(0)
        subset.discard(1)
        self.assertEqual(subset, [0, 10, 50, 90, (19001, 20001)])

        subset.discard(0)
        self.assertEqual(subset, [10, 50, 90, (19001, 20001)])

        subset.discard((10, 100))
        self.assertEqual(subset, [(19001, 20001)])

        subset.add(20)
        subset.add(19)
        subset.add(30)
        subset.add([30, 33])
        subset.add(30000)
        subset.add(30001)
        self.assertEqual(subset, [(19, 21), (30, 33), (19001, 20001), (30000, 30002)])

        subset.add(22)
        subset.add(21)
        subset.add(22)
        self.assertEqual(subset, [(19, 22), 22, (30, 33), (19001, 20001), (30000, 30002)])

        subset.discard((90, 50000))
        self.assertEqual(subset, [(19, 22), 22, (30, 33)])

        subset.discard(21)
        subset.discard(19)
        self.assertEqual(subset, [20, 22, (30, 33)])

        subset.discard((0, 200))
        self.assertEqual(subset, [])

    def test_complement(self):
        upper = sys.maxunicode + 1
        subset = UnicodeSubset([50, 90, 10, 90])
        self.assertEqual(
            list(subset.complement()), [(0, 10), (11, 50), (51, 90), (91, upper)]
        )

        subset.add(11)
        self.assertEqual(
            list(subset.complement()), [(0, 10), (12, 50), (51, 90), (91, upper)]
        )

        subset.add((0, 10))
        self.assertEqual(list(subset.complement()), [(12, 50), (51, 90), (91, upper)])

        # The union of categories L, M, N and S is expected to be exactly the
        # complement of the union of categories C, P and Z.
        letters_marks_numbers_symbols = UnicodeSubset(chain(
            *[UNICODE_CATEGORIES[key].code_points for key in ('L', 'M', 'N', 'S')]
        ))
        others_punct_separators = UnicodeSubset(chain(
            *[UNICODE_CATEGORIES[key].code_points for key in ('C', 'P', 'Z')]
        ))
        complemented = UnicodeSubset(others_punct_separators.complement())
        self.assertListEqual(
            letters_marks_numbers_symbols.code_points, complemented.code_points
        )

    def test_union_and_intersection(self):
        left = UnicodeSubset([50, (90, 200), 10])
        right = UnicodeSubset([10, 51, (89, 150), 90])
        self.assertEqual(left | right, [10, (50, 52), (89, 200)])
        self.assertEqual(left & right, [10, (90, 150)])

    def test_max_and_min(self):
        # Each entry is (subset, expected minimum, expected maximum); the
        # expected maxima show that a range's stop value is not a member.
        expectations = [
            (UnicodeSubset([10, 51, (89, 151), 90]), 10, 150),
            (UnicodeSubset([0, 2, (80, 201), 10000]), 0, 10000),
            (UnicodeSubset([1]), 1, 1),
        ]
        for subset, lowest, highest in expectations:
            self.assertEqual((min(subset), max(subset)), (lowest, highest))

    def test_subtraction(self):
        subset = UnicodeSubset([0, 2, (80, 200), 10000])
        removed = {2, 120, 121, (150, 260)}
        self.assertEqual(subset - removed, [0, (80, 120), (122, 150), 10000])

    def test_code_point_repr_function(self):
        # A range ending at the backslash must be rendered with it escaped.
        span = (ord('2'), ord('\\') + 1)
        self.assertEqual(code_point_repr(span), r'2-\\')
class TestXsdRegexCharGroup(unittest.TestCase):
    """Checks on the internals of ``XsdRegexCharGroup``."""

    def test_char_group_split(self):
        # A range whose upper bound is an escaped backslash must come back
        # from the splitting regex as a single, unsplit part.
        group_source = r'2-\\'
        parts = XsdRegexCharGroup._re_char_group.split(group_source)
        self.assertListEqual(parts, [r'2-\\'])
class TestUnicodeCategories(unittest.TestCase):
    """
    Test the subsets of Unicode categories, mainly to check the loaded JSON file.
    """

    def _check_categories_partition(self, categories):
        """
        Assert that the two-letter categories of *categories* partition the
        whole Unicode code point range.

        :param categories: a mapping from category name to a collection of \
        code points, such as UNICODE_CATEGORIES or the result of \
        build_unicode_categories().
        """
        # Keys longer than one character are the base categories; the other
        # keys are skipped here so that code points are counted only once.
        self.assertEqual(
            sum(len(v) for k, v in categories.items() if len(k) > 1), sys.maxunicode + 1
        )
        self.assertEqual(min([min(s) for s in categories.values()]), 0)
        self.assertEqual(max([max(s) for s in categories.values()]), sys.maxunicode)

        base_sets = [set(v) for k, v in categories.items() if len(k) > 1]
        # Pairs are filtered by identity, not equality: two distinct
        # categories holding exactly the same code points overlap completely
        # and must be reported, which an "s != t" filter would skip.
        self.assertFalse(
            any(s.intersection(t) for s in base_sets for t in base_sets if s is not t)
        )

    def test_build_unicode_categories(self):
        # NOTE(review): the file name does not exist, presumably to force the
        # categories to be built from scratch instead of loaded - confirm
        # against build_unicode_categories().
        categories = build_unicode_categories('not_existing_file.json')
        self._check_categories_partition(categories)

    def test_unicode_categories(self):
        self._check_categories_partition(UNICODE_CATEGORIES)

    @unittest.skipIf(not ((3, 7) <= sys.version_info < (3, 8)), "Test only for Python 3.7")
    def test_unicodedata_category(self):
        # Compare every stored code point with the category reported by the
        # interpreter's unicodedata module. NOTE(review): restricted to one
        # Python version, presumably because the unicodedata database differs
        # between releases - confirm.
        for key in UNICODE_CATEGORIES:
            for cp in UNICODE_CATEGORIES[key]:
                uc = category(unicode_chr(cp))
                # A one-letter key is a major class and matches on the first
                # letter of the category reported by unicodedata.
                if key == uc or (len(key) == 1 and key == uc[0]):
                    continue
                self.fail(
                    "Wrong category %r for code point %d (should be %r)." % (uc, cp, key)
                )
class TestPatterns(unittest.TestCase):
    """
    Test of specific regex patterns and their application.

    Each test translates an XML Schema pattern with get_python_regex(), checks
    the generated Python regex source where it is pinned, and then exercises
    the compiled pattern on matching and non-matching strings.
    """

    def test_issue_079(self):
        # Do not escape special characters in character class
        regex = get_python_regex('[^\n\t]+')
        self.assertEqual(regex, '^([^\t\n]+)$')
        pattern = re.compile(regex)
        self.assertIsNone(pattern.search('first\tsecond\tthird'))
        self.assertEqual(pattern.search('first second third').group(0), 'first second third')

    def test_dot_wildcard(self):
        # The XSD dot is expected to be translated to "anything except \r and \n".
        regex = get_python_regex('.+')
        self.assertEqual(regex, '^([^\r\n]+)$')
        pattern = re.compile(regex)
        self.assertIsNone(pattern.search('line1\rline2\r'))
        self.assertIsNone(pattern.search('line1\nline2'))
        self.assertIsNone(pattern.search(''))
        self.assertIsNotNone(pattern.search('\\'))
        self.assertEqual(pattern.search('abc').group(0), 'abc')

        regex = get_python_regex('.+T.+(Z|[+-].+)')
        self.assertEqual(regex, '^([^\r\n]+T[^\r\n]+(Z|[\\+\\-][^\r\n]+))$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('12T0A3+36').group(0), '12T0A3+36')
        self.assertEqual(pattern.search('12T0A3Z').group(0), '12T0A3Z')
        self.assertIsNone(pattern.search(''))
        self.assertIsNone(pattern.search('12T0A3Z2'))

    def test_not_spaces(self):
        # A \S shortcut inside a character group, together with literal
        # quote and space characters.
        regex = get_python_regex(r"[\S' ']{1,10}")
        if sys.version_info >= (3,):
            # The generated source is pinned only on Python 3.
            self.assertEqual(regex, "^([\x00-\x08\x0b\x0c\x0e-\x1f!-\U0010ffff ']{1,10})$")
        pattern = re.compile(regex)
        self.assertIsNone(pattern.search('alpha\r'))
        self.assertEqual(pattern.search('beta').group(0), 'beta')
        self.assertEqual(pattern.search('beta\n').group(0), 'beta')  # $ matches also a \n at last position
        self.assertIsNone(pattern.search('beta\n '))
        self.assertIsNone(pattern.search(''))
        self.assertIsNone(pattern.search('over the maximum length!'))
        self.assertIsNotNone(pattern.search('\\'))
        self.assertEqual(pattern.search('abc').group(0), 'abc')

    def test_category_escape(self):
        # Unicode block escapes are expected to expand to code point ranges.
        regex = get_python_regex('\\p{IsBasicLatin}*')
        self.assertEqual(regex, '^([\x00-\x7f]*)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('').group(0), '')
        self.assertEqual(pattern.search('e').group(0), 'e')
        self.assertIsNone(pattern.search('è'))

        regex = get_python_regex('[\\p{IsBasicLatin}\\p{IsLatin-1Supplement}]*')
        self.assertEqual(regex, '^([\x00-\xff]*)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('e').group(0), 'e')
        self.assertEqual(pattern.search('è').group(0), 'è')
        self.assertIsNone(pattern.search('Ĭ'))

    def test_digit_shortcut(self):
        # \d outside a character group is expected to pass through unchanged.
        regex = get_python_regex(r'\d{1,3}\.\d{1,2}')
        self.assertEqual(regex, r'^(\d{1,3}\.\d{1,2})$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('12.40').group(0), '12.40')
        self.assertEqual(pattern.search('867.00').group(0), '867.00')
        self.assertEqual(pattern.search('867.00\n').group(0), '867.00')
        self.assertIsNone(pattern.search('867.00 '))
        self.assertIsNone(pattern.search('867.000'))
        self.assertIsNone(pattern.search('1867.0'))
        self.assertIsNone(pattern.search('a1.13'))

        regex = get_python_regex(r'[-+]?(\d+|\d+(\.\d+)?%)')
        self.assertEqual(regex, r'^([\+\-]?(\d+|\d+(\.\d+)?%))$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('78.8%').group(0), '78.8%')
        self.assertIsNone(pattern.search('867.00'))

    def test_character_class_reordering(self):
        # Members of a character group are expected to be emitted sorted by
        # code point, with special characters escaped.
        regex = get_python_regex('[A-Z ]')
        self.assertEqual(regex, '^([ A-Z])$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('A').group(0), 'A')
        self.assertEqual(pattern.search('Z').group(0), 'Z')
        self.assertEqual(pattern.search('Q').group(0), 'Q')
        self.assertEqual(pattern.search(' ').group(0), ' ')
        # Two spaces: the group matches exactly one character, so a
        # two-character input must be rejected. (A single space here would
        # contradict the assertion just above.)
        self.assertIsNone(pattern.search('  '))
        self.assertIsNone(pattern.search('AA'))

        regex = get_python_regex(r'[0-9.,DHMPRSTWYZ/:+\-]+')
        self.assertEqual(regex, r'^([\+-\-\.-:DHMPR-TWYZ]+)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('12,40').group(0), '12,40')
        self.assertEqual(pattern.search('YYYY:MM:DD').group(0), 'YYYY:MM:DD')
        self.assertIsNone(pattern.search(''))
        self.assertIsNone(pattern.search('C'))

        regex = get_python_regex('[^: \n\r\t]+')
        self.assertEqual(regex, '^([^\t\n\r :]+)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('56,41').group(0), '56,41')
        self.assertEqual(pattern.search('56,41\n').group(0), '56,41')
        self.assertIsNone(pattern.search('13:20'))

        regex = get_python_regex(r'[A-Za-z0-9_\-]+(:[A-Za-z0-9_\-]+)?')
        self.assertEqual(regex, r'^([\-0-9A-Z_a-z]+(:[\-0-9A-Z_a-z]+)?)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('fa9').group(0), 'fa9')
        self.assertEqual(pattern.search('-x_1:_tZ-\n').group(0), '-x_1:_tZ-')
        self.assertIsNone(pattern.search(''))
        self.assertIsNone(pattern.search('+78'))

        regex = get_python_regex(r'[!%\^\*@~;#,|/]')
        self.assertEqual(regex, r'^([!#%\*,/;@\^\|~])$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('#').group(0), '#')
        self.assertEqual(pattern.search('!').group(0), '!')
        self.assertEqual(pattern.search('^').group(0), '^')
        self.assertEqual(pattern.search('|').group(0), '|')
        self.assertEqual(pattern.search('*').group(0), '*')
        self.assertIsNone(pattern.search('**'))
        self.assertIsNone(pattern.search('b'))
        self.assertIsNone(pattern.search(''))

        regex = get_python_regex('[A-Za-z]+:[A-Za-z][A-Za-z0-9\\-]+')
        self.assertEqual(regex, '^([A-Za-z]+:[A-Za-z][\\-0-9A-Za-z]+)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('zk:xy-9s').group(0), 'zk:xy-9s')
        self.assertIsNone(pattern.search('xx:y'))

    def test_iterparse_character_group(self):
        # A hyphen that cannot form a range is expected as a literal '-'.
        self.assertListEqual(list(iterparse_character_group('a-c-1-4x-z-7-9')),
                             [(ord('a'), ord('c') + 1), ord('-'), (ord('1'), ord('4') + 1),
                              (ord('x'), ord('z') + 1), ord('-'), (55, 58)])
        self.assertListEqual(list(iterparse_character_group('2-\\')), [(ord('2'), ord('\\') + 1)])

    def test_occurrences_qualifiers(self):
        # Counted and optional quantifiers applied to character groups.
        regex = get_python_regex('#[0-9a-fA-F]{3}([0-9a-fA-F]{3})?')
        self.assertEqual(regex, '^(#[0-9A-Fa-f]{3}([0-9A-Fa-f]{3})?)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('#F3D').group(0), '#F3D')
        self.assertEqual(pattern.search('#F3D\n').group(0), '#F3D')
        self.assertEqual(pattern.search('#F3DA30').group(0), '#F3DA30')
        self.assertIsNone(pattern.search('#F3'))
        self.assertIsNone(pattern.search('#F3D '))
        self.assertIsNone(pattern.search('F3D'))
        self.assertIsNone(pattern.search(''))

    def test_or_operator(self):
        # Alternatives must stay inside the anchoring group so that the
        # anchors apply to every branch.
        regex = get_python_regex('0|1')
        self.assertEqual(regex, '^(0|1)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('0').group(0), '0')
        self.assertEqual(pattern.search('1').group(0), '1')
        self.assertEqual(pattern.search('1\n').group(0), '1')
        self.assertIsNone(pattern.search(''))
        self.assertIsNone(pattern.search('2'))
        self.assertIsNone(pattern.search('01'))
        self.assertIsNone(pattern.search('1\n '))

        regex = get_python_regex(r'\d+[%]|\d*\.\d+[%]')
        self.assertEqual(regex, r'^(\d+[%]|\d*\.\d+[%])$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('99%').group(0), '99%')
        self.assertEqual(pattern.search('99.9%').group(0), '99.9%')
        self.assertEqual(pattern.search('.90%').group(0), '.90%')
        self.assertIsNone(pattern.search('%'))
        self.assertIsNone(pattern.search('90.%'))

        regex = get_python_regex('([ -~]|\n|\r|\t)*')
        self.assertEqual(regex, '^(([ -~]|\n|\r|\t)*)$')
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('ciao\t-~ ').group(0), 'ciao\t-~ ')
        self.assertEqual(pattern.search('\r\r').group(0), '\r\r')
        self.assertEqual(pattern.search('\n -.abc').group(0), '\n -.abc')
        self.assertIsNone(pattern.search('à'))
        self.assertIsNone(pattern.search('\t\n à'))

    def test_character_class_shortcuts(self):
        # Only the matching behaviour is checked here; the generated regex
        # source is not pinned.
        regex = get_python_regex(r"[\i-[:]][\c-[:]]*")
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('x11').group(0), 'x11')
        self.assertIsNone(pattern.search('3a'))

        regex = get_python_regex(r"\w*")
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('aA_x7').group(0), 'aA_x7')
        self.assertIsNone(pattern.search('.'))
        self.assertIsNone(pattern.search('-'))

        regex = get_python_regex(r"\W*")
        pattern = re.compile(regex)
        self.assertIsNone(pattern.search('aA_x7'))
        self.assertEqual(pattern.search('.-').group(0), '.-')

        regex = get_python_regex(r"\d*")
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('6410').group(0), '6410')
        self.assertIsNone(pattern.search('a'))
        self.assertIsNone(pattern.search('-'))

        regex = get_python_regex(r"\D*")
        pattern = re.compile(regex)
        self.assertIsNone(pattern.search('6410'))
        self.assertEqual(pattern.search('a').group(0), 'a')
        self.assertEqual(pattern.search('-').group(0), '-')

        # Pull Request 114
        regex = get_python_regex(r"[\w]{0,5}")
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('abc').group(0), 'abc')
        self.assertIsNone(pattern.search('.'))

        regex = get_python_regex(r"[\W]{0,5}")
        pattern = re.compile(regex)
        self.assertEqual(pattern.search('.').group(0), '.')
        self.assertIsNone(pattern.search('abc'))

    def test_empty_character_group_repr(self):
        # A subtraction that leaves nothing is expected to be rendered as a
        # group that can never match; a literally empty group is an error.
        regex = get_python_regex('[a-[a-f]]')
        self.assertEqual(regex, r'^([^\w\W])$')
        self.assertRaises(XMLSchemaRegexError, get_python_regex, '[]')

    def test_character_class_range(self):
        # A trailing hyphen is a literal character, not a range operator.
        regex = get_python_regex('[bc-]')
        self.assertEqual(regex, r'^([\-bc])$')
if __name__ == '__main__':
    # Script entry point: the header helper is needed only when this module
    # is executed directly, so it is imported inside the guard.
    from xmlschema.tests import print_test_header

    print_test_header()
    # Hand control to the unittest command-line runner.
    unittest.main()