403 lines
18 KiB
Python
403 lines
18 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
|
|
# All rights reserved.
|
|
# This file is distributed under the terms of the MIT License.
|
|
# See the file 'LICENSE' in the root directory of the present
|
|
# distribution, or http://opensource.org/licenses/MIT.
|
|
#
|
|
# @author Davide Brunato <brunato@sissa.it>
|
|
#
|
|
"""
|
|
This module runs tests on XML Schema regular expressions.
|
|
"""
|
|
from __future__ import unicode_literals
|
|
import unittest
|
|
import sys
|
|
import re
|
|
from itertools import chain
|
|
from unicodedata import category
|
|
|
|
from xmlschema.exceptions import XMLSchemaValueError, XMLSchemaRegexError
|
|
from xmlschema.compat import unicode_chr
|
|
from xmlschema.codepoints import code_point_repr, iterparse_character_group, iter_code_points, \
|
|
UnicodeSubset, build_unicode_categories, UNICODE_CATEGORIES
|
|
from xmlschema.regex import get_python_regex, XsdRegexCharGroup
|
|
|
|
|
|
class TestCodePoints(unittest.TestCase):
    """
    Tests for the iter_code_points() helper.
    """

    def test_iter_code_points(self):
        """Check the sequences yielded for mixed integers and 2-tuples, in both orders."""
        # A long unsorted input with duplicates, shared by the forward and
        # the reverse checks below.
        mixed_items = [10, 20, (10, 22), 11, 12, 25, 8, (9, 20), 21, 22, 9, 0]

        forward_cases = [
            ([10, 20, 11, 12, 25, (9, 21), 21], [(9, 22), 25]),
            ([10, 20, 11, 12, 25, (9, 20), 21], [(9, 22), 25]),
            ({2, 120, 121, (150, 260)}, [2, (120, 122), (150, 260)]),
            (mixed_items, [0, (8, 23), 25]),
        ]
        for items, expected in forward_cases:
            self.assertEqual(list(iter_code_points(items)), expected)

        reverse_cases = [
            ([10, 20, 11, 12, 25, (9, 21)], [25, (9, 21)]),
            (mixed_items, [25, (8, 23), 0]),
        ]
        for items, expected in reverse_cases:
            self.assertEqual(list(iter_code_points(items, reverse=True)), expected)
|
|
|
|
|
|
class TestUnicodeSubset(unittest.TestCase):
    """
    Tests for UnicodeSubset: construction, in-place modification and set operators.
    """

    def test_creation(self):
        """Build subsets from a list of items and from character-range strings."""
        upper = sys.maxunicode + 1
        subset = UnicodeSubset([(0, 9), 11, 12, (14, 32), (33, upper)])
        self.assertEqual(subset, [(0, 9), (11, 13), (14, 32), (33, upper)])

        # '0'-'9' are code points 48-57: the expected pair has an exclusive upper bound.
        self.assertEqual(UnicodeSubset('0-9'), [(48, 58)])
        self.assertEqual(UnicodeSubset('0-9:'), [(48, 59)])

    def test_modify(self):
        """Apply a fixed sequence of add()/discard() calls, checking the content after each step."""
        subset = UnicodeSubset([50, 90, 10, 90])
        self.assertEqual(subset, [10, 50, 90])

        # Values outside 0..sys.maxunicode must be rejected.
        for invalid in (-1, sys.maxunicode + 1):
            with self.assertRaises(XMLSchemaValueError):
                subset.add(invalid)

        subset.add((100, 20001))
        subset.discard((100, 19001))
        self.assertEqual(subset, [10, 50, 90, (19001, 20001)])

        subset.add(0)
        subset.discard(1)
        self.assertEqual(subset, [0, 10, 50, 90, (19001, 20001)])

        subset.discard(0)
        self.assertEqual(subset, [10, 50, 90, (19001, 20001)])

        subset.discard((10, 100))
        self.assertEqual(subset, [(19001, 20001)])

        # The order of these calls is part of the scenario: keep it as is.
        for item in (20, 19, 30, [30, 33], 30000, 30001):
            subset.add(item)
        self.assertEqual(subset, [(19, 21), (30, 33), (19001, 20001), (30000, 30002)])

        for item in (22, 21, 22):
            subset.add(item)
        self.assertEqual(subset, [(19, 22), 22, (30, 33), (19001, 20001), (30000, 30002)])

        subset.discard((90, 50000))
        self.assertEqual(subset, [(19, 22), 22, (30, 33)])

        for item in (21, 19):
            subset.discard(item)
        self.assertEqual(subset, [20, 22, (30, 33)])

        subset.discard((0, 200))
        self.assertEqual(subset, [])

    def test_complement(self):
        """Check complement() on a small subset and on groups of Unicode categories."""
        upper = sys.maxunicode + 1
        subset = UnicodeSubset([50, 90, 10, 90])
        self.assertEqual(
            list(subset.complement()),
            [(0, 10), (11, 50), (51, 90), (91, upper)]
        )

        subset.add(11)
        self.assertEqual(
            list(subset.complement()),
            [(0, 10), (12, 50), (51, 90), (91, upper)]
        )

        subset.add((0, 10))
        self.assertEqual(list(subset.complement()), [(12, 50), (51, 90), (91, upper)])

        # The L, M, N and S categories together are expected to be exactly
        # the complement of the C, P and Z categories.
        letters_marks_numbers_symbols = UnicodeSubset(chain.from_iterable(
            UNICODE_CATEGORIES[key].code_points for key in ('L', 'M', 'N', 'S')
        ))
        others = UnicodeSubset(chain.from_iterable(
            UNICODE_CATEGORIES[key].code_points for key in ('C', 'P', 'Z')
        ))
        self.assertListEqual(
            letters_marks_numbers_symbols.code_points,
            UnicodeSubset(others.complement()).code_points
        )

    def test_union_and_intersection(self):
        """Check the | and & operators between two subsets."""
        left = UnicodeSubset([50, (90, 200), 10])
        right = UnicodeSubset([10, 51, (89, 150), 90])
        self.assertEqual(left | right, [10, (50, 52), (89, 200)])
        self.assertEqual(left & right, [10, (90, 150)])

    def test_max_and_min(self):
        """Check min() and max() applied to subsets."""
        cases = [
            (UnicodeSubset([10, 51, (89, 151), 90]), (10, 150)),
            (UnicodeSubset([0, 2, (80, 201), 10000]), (0, 10000)),
            (UnicodeSubset([1]), (1, 1)),
        ]
        for subset, bounds in cases:
            self.assertEqual((min(subset), max(subset)), bounds)

    def test_subtraction(self):
        """Check the - operator with a plain set as right operand."""
        subset = UnicodeSubset([0, 2, (80, 200), 10000])
        removed = {2, 120, 121, (150, 260)}
        self.assertEqual(subset - removed, [0, (80, 120), (122, 150), 10000])

    def test_code_point_repr_function(self):
        """Check the textual form of a range that ends with a backslash."""
        code_points = (ord('2'), ord('\\') + 1)
        self.assertEqual(code_point_repr(code_points), r'2-\\')
|
|
|
|
|
|
class TestXsdRegexCharGroup(unittest.TestCase):
    """
    Tests for the internals of XsdRegexCharGroup.
    """

    def test_char_group_split(self):
        """A range ending with an escaped backslash must come back as a single part."""
        group = r'2-\\'
        parts = XsdRegexCharGroup._re_char_group.split(group)
        self.assertListEqual(parts, [group])
|
|
|
|
|
|
class TestUnicodeCategories(unittest.TestCase):
    """
    Test the subsets of Unicode categories, mainly to check the loaded JSON file.
    """

    def _check_partition(self, categories):
        """
        Assert that the categories with multi-character keys split the whole
        code point range 0..sys.maxunicode without gaps or overlaps.

        :param categories: a mapping from category key to a sized, iterable \
        collection of code points.
        """
        self.assertEqual(
            sum(len(v) for k, v in categories.items() if len(k) > 1),
            sys.maxunicode + 1
        )
        self.assertEqual(min(min(s) for s in categories.values()), 0)
        self.assertEqual(max(max(s) for s in categories.values()), sys.maxunicode)

        base_sets = [set(v) for k, v in categories.items() if len(k) > 1]
        # Pairs are selected by position, not by content: comparing with
        # `s != t` would silently skip two distinct categories that hold
        # exactly the same code points, which is a complete overlap.
        self.assertFalse(any(
            not s.isdisjoint(t)
            for i, s in enumerate(base_sets)
            for t in base_sets[i + 1:]
        ))

    def test_build_unicode_categories(self):
        """Check the categories built when the given JSON file name does not exist."""
        # NOTE(review): presumably build_unicode_categories() falls back to
        # generating the data when the file is missing -- confirm in codepoints.
        categories = build_unicode_categories('not_existing_file.json')
        self._check_partition(categories)

    def test_unicode_categories(self):
        """Check the categories exposed by the module-level UNICODE_CATEGORIES."""
        self._check_partition(UNICODE_CATEGORIES)

    @unittest.skipIf(not ((3, 7) <= sys.version_info < (3, 8)), "Test only for Python 3.7")
    def test_unicodedata_category(self):
        """Compare every stored code point with the category given by unicodedata."""
        for key, code_points in UNICODE_CATEGORIES.items():
            for cp in code_points:
                uc = category(unicode_chr(cp))
                # A one-character key only has to match the first letter of
                # the category name returned by unicodedata.
                if key == uc or (len(key) == 1 and key == uc[0]):
                    continue
                self.fail(
                    "Wrong category %r for code point %d (should be %r)." % (uc, cp, key)
                )
|
|
|
|
|
|
class TestPatterns(unittest.TestCase):
    """
    Test of specific regex patterns and their application.
    """

    def _compile(self, xsd_pattern, expected=None):
        """
        Translate an XSD pattern with get_python_regex() and compile the result.

        :param xsd_pattern: the XML Schema regular expression to translate.
        :param expected: the expected translation; when it is `None` the \
        translated text is not checked.
        :return: the compiled Python pattern.
        """
        regex = get_python_regex(xsd_pattern)
        if expected is not None:
            self.assertEqual(regex, expected)
        return re.compile(regex)

    def _assert_accepts(self, pattern, *texts):
        """Assert that each text is found by the pattern and is matched entirely."""
        for text in texts:
            self.assertEqual(pattern.search(text).group(0), text)

    def _assert_rejects(self, pattern, *texts):
        """Assert that the pattern is not found in any of the texts."""
        for text in texts:
            self.assertIsNone(pattern.search(text))

    def test_issue_079(self):
        """Special characters inside a character class must not be escaped."""
        pattern = self._compile('[^\n\t]+', '^([^\t\n]+)$')
        self._assert_rejects(pattern, 'first\tsecond\tthird')
        self._assert_accepts(pattern, 'first second third')

    def test_dot_wildcard(self):
        """Check the translation of the dot wildcard."""
        pattern = self._compile('.+', '^([^\r\n]+)$')
        self._assert_rejects(pattern, 'line1\rline2\r', 'line1\nline2', '')
        self.assertIsNotNone(pattern.search('\\'))
        self._assert_accepts(pattern, 'abc')

        pattern = self._compile(
            '.+T.+(Z|[+-].+)', '^([^\r\n]+T[^\r\n]+(Z|[\\+\\-][^\r\n]+))$'
        )
        self._assert_accepts(pattern, '12T0A3+36', '12T0A3Z')
        self._assert_rejects(pattern, '', '12T0A3Z2')

    def test_not_spaces(self):
        """Check a character class that combines \\S with literal characters."""
        # The translated text is compared only when running on Python 3.
        if sys.version_info >= (3,):
            expected = "^([\x00-\x08\x0b\x0c\x0e-\x1f!-\U0010ffff ']{1,10})$"
        else:
            expected = None
        pattern = self._compile(r"[\S' ']{1,10}", expected)

        self._assert_rejects(pattern, 'alpha\r')
        self._assert_accepts(pattern, 'beta')
        # $ matches also a \n at last position
        self.assertEqual(pattern.search('beta\n').group(0), 'beta')
        self._assert_rejects(pattern, 'beta\n ', '', 'over the maximum length!')
        self.assertIsNotNone(pattern.search('\\'))
        self._assert_accepts(pattern, 'abc')

    def test_category_escape(self):
        """Check the \\p{Is...} block escapes, alone and inside a character class."""
        pattern = self._compile('\\p{IsBasicLatin}*', '^([\x00-\x7f]*)$')
        self._assert_accepts(pattern, '', 'e')
        self._assert_rejects(pattern, 'è')

        pattern = self._compile(
            '[\\p{IsBasicLatin}\\p{IsLatin-1Supplement}]*', '^([\x00-\xff]*)$'
        )
        self._assert_accepts(pattern, 'e', 'è')
        self._assert_rejects(pattern, 'Ĭ')

    def test_digit_shortcut(self):
        """Check patterns that use the \\d shortcut."""
        pattern = self._compile(r'\d{1,3}\.\d{1,2}', r'^(\d{1,3}\.\d{1,2})$')
        self._assert_accepts(pattern, '12.40', '867.00')
        # A single trailing newline is tolerated by $ and left out of the match.
        self.assertEqual(pattern.search('867.00\n').group(0), '867.00')
        self._assert_rejects(pattern, '867.00 ', '867.000', '1867.0', 'a1.13')

        pattern = self._compile(
            r'[-+]?(\d+|\d+(\.\d+)?%)', r'^([\+\-]?(\d+|\d+(\.\d+)?%))$'
        )
        self._assert_accepts(pattern, '78.8%')
        self._assert_rejects(pattern, '867.00')

    def test_character_class_reordering(self):
        """Check the reordered and escaped form of translated character classes."""
        pattern = self._compile('[A-Z ]', '^([ A-Z])$')
        self._assert_accepts(pattern, 'A', 'Z', 'Q', ' ')
        # Two characters cannot match a class that takes exactly one. The
        # first text must be TWO spaces: a single space is accepted above.
        self._assert_rejects(pattern, '  ', 'AA')

        pattern = self._compile(
            r'[0-9.,DHMPRSTWYZ/:+\-]+', r'^([\+-\-\.-:DHMPR-TWYZ]+)$'
        )
        self._assert_accepts(pattern, '12,40', 'YYYY:MM:DD')
        self._assert_rejects(pattern, '', 'C')

        pattern = self._compile('[^: \n\r\t]+', '^([^\t\n\r :]+)$')
        self._assert_accepts(pattern, '56,41')
        self.assertEqual(pattern.search('56,41\n').group(0), '56,41')
        self._assert_rejects(pattern, '13:20')

        pattern = self._compile(
            r'[A-Za-z0-9_\-]+(:[A-Za-z0-9_\-]+)?',
            r'^([\-0-9A-Z_a-z]+(:[\-0-9A-Z_a-z]+)?)$'
        )
        self._assert_accepts(pattern, 'fa9')
        self.assertEqual(pattern.search('-x_1:_tZ-\n').group(0), '-x_1:_tZ-')
        self._assert_rejects(pattern, '', '+78')

        pattern = self._compile(r'[!%\^\*@~;#,|/]', r'^([!#%\*,/;@\^\|~])$')
        self._assert_accepts(pattern, '#', '!', '^', '|', '*')
        self._assert_rejects(pattern, '**', 'b', '')

        pattern = self._compile(
            '[A-Za-z]+:[A-Za-z][A-Za-z0-9\\-]+', '^([A-Za-z]+:[A-Za-z][\\-0-9A-Za-z]+)$'
        )
        self._assert_accepts(pattern, 'zk:xy-9s')
        self._assert_rejects(pattern, 'xx:y')

    def test_iterparse_character_group(self):
        """Check the items parsed from the text of a character group."""
        hyphen = ord('-')
        parsed = list(iterparse_character_group('a-c-1-4x-z-7-9'))
        self.assertListEqual(parsed, [
            (ord('a'), ord('c') + 1), hyphen, (ord('1'), ord('4') + 1),
            (ord('x'), ord('z') + 1), hyphen, (55, 58)
        ])

        parsed = list(iterparse_character_group('2-\\'))
        self.assertListEqual(parsed, [(ord('2'), ord('\\') + 1)])

    def test_occurrences_qualifiers(self):
        """Check a pattern that uses {n} and ? quantifiers."""
        pattern = self._compile(
            '#[0-9a-fA-F]{3}([0-9a-fA-F]{3})?', '^(#[0-9A-Fa-f]{3}([0-9A-Fa-f]{3})?)$'
        )
        self._assert_accepts(pattern, '#F3D')
        self.assertEqual(pattern.search('#F3D\n').group(0), '#F3D')
        self._assert_accepts(pattern, '#F3DA30')
        self._assert_rejects(pattern, '#F3', '#F3D ', 'F3D', '')

    def test_or_operator(self):
        """Check patterns that contain alternatives."""
        pattern = self._compile('0|1', '^(0|1)$')
        self._assert_accepts(pattern, '0', '1')
        self.assertEqual(pattern.search('1\n').group(0), '1')
        self._assert_rejects(pattern, '', '2', '01', '1\n ')

        pattern = self._compile(r'\d+[%]|\d*\.\d+[%]', r'^(\d+[%]|\d*\.\d+[%])$')
        self._assert_accepts(pattern, '99%', '99.9%', '.90%')
        self._assert_rejects(pattern, '%', '90.%')

        pattern = self._compile('([ -~]|\n|\r|\t)*', '^(([ -~]|\n|\r|\t)*)$')
        self._assert_accepts(pattern, 'ciao\t-~ ', '\r\r', '\n -.abc')
        self._assert_rejects(pattern, 'à', '\t\n à')

    def test_character_class_shortcuts(self):
        """Check the behaviour of the multi-character escapes, without comparing translations."""
        pattern = self._compile(r"[\i-[:]][\c-[:]]*")
        self._assert_accepts(pattern, 'x11')
        self._assert_rejects(pattern, '3a')

        pattern = self._compile(r"\w*")
        self._assert_accepts(pattern, 'aA_x7')
        self._assert_rejects(pattern, '.', '-')

        pattern = self._compile(r"\W*")
        self._assert_rejects(pattern, 'aA_x7')
        self._assert_accepts(pattern, '.-')

        pattern = self._compile(r"\d*")
        self._assert_accepts(pattern, '6410')
        self._assert_rejects(pattern, 'a', '-')

        pattern = self._compile(r"\D*")
        self._assert_rejects(pattern, '6410')
        self._assert_accepts(pattern, 'a', '-')

        # Pull Request 114
        pattern = self._compile(r"[\w]{0,5}")
        self._assert_accepts(pattern, 'abc')
        self._assert_rejects(pattern, '.')

        pattern = self._compile(r"[\W]{0,5}")
        self._assert_accepts(pattern, '.')
        self._assert_rejects(pattern, 'abc')

    def test_empty_character_group_repr(self):
        """Check the form of a class emptied by subtraction and the error for '[]'."""
        regex = get_python_regex('[a-[a-f]]')
        self.assertEqual(regex, r'^([^\w\W])$')
        with self.assertRaises(XMLSchemaRegexError):
            get_python_regex('[]')

    def test_character_class_range(self):
        """A trailing hyphen in a class is a literal character, not a range."""
        regex = get_python_regex('[bc-]')
        self.assertEqual(regex, r'^([\-bc])$')
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: the header helper is imported here because it is
    # needed only when this module is run directly.
    from xmlschema.tests import print_test_header

    print_test_header()
    unittest.main()
|