debian-xmlschema/xmlschema/regex.py

368 lines
13 KiB
Python

# -*- coding: utf-8 -*-
#
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato <brunato@sissa.it>
#
"""
Parse and translate XML Schema regular expressions to Python regex syntax.
"""
from __future__ import unicode_literals
import re
from itertools import chain
from sys import maxunicode
from .compat import PY3, unicode_type, string_base_type, MutableSet
from .exceptions import XMLSchemaValueError, XMLSchemaRegexError
from .codepoints import UnicodeSubset, UNICODE_CATEGORIES, unicode_subset
# Matches a double hyphen that is not preceded by a backslash. It is used by
# parse_character_class() to detect an invalid '--' inside a character class.
_RE_HYPHENS = re.compile(r'(?<!\\)--')

# Matches a counted quantifier at the start of the searched text:
# '{n}', '{n,}' or '{n,m}'.
_RE_QUANTIFIER = re.compile(r'{\d+(,(\d+)?)?}')

# Escape sequences that get_python_regex() refuses to translate: \UXXXXXXXX,
# \uXXXX, \xXX, \o{...}, a backslash followed by digits, and \A \Z \z \B \b \o.
# The lookbehind requires that the backslash is not itself preceded by another
# backslash.
_RE_FORBIDDEN_ESCAPES = re.compile(
    r'(?<!\\)\\(U[0-9a-fA-F]{8}|u[0-9a-fA-F]{4}|x[0-9a-fA-F]{2}|o{\d+}|\d+|A|Z|z|B|b|o)'
)

# Character-class body that get_python_regex() emits for the \i escape
# (wrapped as '[...]') and for \I (wrapped as '[^...]').
I_SHORTCUT_REPLACE = (
    ":A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF"
    "\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
)

# Character-class body that get_python_regex() emits for the \c escape
# (wrapped as '[...]') and for \C (wrapped as '[^...]').
C_SHORTCUT_REPLACE = (
    "-.0-9:A-Z_a-z\u00B7\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u037D\u037F-\u1FFF\u200C-"
    "\u200D\u203F\u2040\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
)

# Code point sets bound to the multi-character escapes in CHARACTER_ESCAPES.
# \s and \S: space, newline, tab and carriage return.
S_SHORTCUT_SET = UnicodeSubset(' \n\t\r')

# \d and \D: created empty, then its private storage is assigned directly from
# the 'Nd' category entry (this statement makes no copy of that value).
D_SHORTCUT_SET = UnicodeSubset()
D_SHORTCUT_SET._code_points = UNICODE_CATEGORIES['Nd'].code_points

# \i, \I and \c, \C: built from the same strings used for the regex output.
I_SHORTCUT_SET = UnicodeSubset(I_SHORTCUT_REPLACE)
C_SHORTCUT_SET = UnicodeSubset(C_SHORTCUT_REPLACE)

# \w and \W: built from the code points of the Unicode categories
# L, M, N and S, chained together.
W_SHORTCUT_SET = UnicodeSubset(chain(
    UNICODE_CATEGORIES['L'].code_points,
    UNICODE_CATEGORIES['M'].code_points,
    UNICODE_CATEGORIES['N'].code_points,
    UNICODE_CATEGORIES['S'].code_points
))
# Lookup table for the escapes recognized inside a character class.
# Keys are the two-character escape text (a backslash plus one character).
# A string value is the literal character the escape stands for; a
# UnicodeSubset value is the set of code points bound to a multi-character
# escape.
CHARACTER_ESCAPES = {
    # Single-character escapes
    '\\n': '\n',
    '\\r': '\r',
    '\\t': '\t',
    '\\|': '|',
    '\\.': '.',
    '\\-': '-',
    '\\^': '^',
    '\\?': '?',
    '\\*': '*',
    '\\+': '+',
    '\\{': '{',
    '\\}': '}',
    '\\(': '(',
    '\\)': ')',
    '\\[': '[',
    '\\]': ']',
    '\\\\': '\\',

    # Multi-character escapes.
    # The lower-case and upper-case forms map to the SAME set: the table does
    # not encode negation. XsdRegexCharGroup.add()/discard() look at the case
    # of the escape letter and apply the set to the positive (lower-case) or
    # to the negative (upper-case) side of the group.
    '\\s': S_SHORTCUT_SET,
    '\\S': S_SHORTCUT_SET,
    '\\d': D_SHORTCUT_SET,
    '\\D': D_SHORTCUT_SET,
    '\\i': I_SHORTCUT_SET,
    '\\I': I_SHORTCUT_SET,
    '\\c': C_SHORTCUT_SET,
    '\\C': C_SHORTCUT_SET,
    '\\w': W_SHORTCUT_SET,
    '\\W': W_SHORTCUT_SET,
}
class XsdRegexCharGroup(MutableSet):
    """
    A mutable set that models a character group of an XML Schema regex.

    The group keeps two `UnicodeSubset` instances:

      * `positive`: code points added through plain text, lower-case
        multi-character escapes and `\\p{...}` references;
      * `negative`: code points added through upper-case multi-character
        escapes and `\\P{...}` references.

    When `negative` is empty a character belongs to the group if its code
    point is in `positive`. Otherwise it belongs to the group if its code
    point is in `positive` or is not in `negative`.

    :param xsd_version: the XSD version; a value greater than '1.0' is \
    reported to `unicode_subset()` when resolving `\\p{...}`/`\\P{...}`.
    :param args: strings that are passed, in order, to :meth:`add`.
    """
    # Splits a group body into escapes (kept as separate items, thanks to the
    # capturing group) and the plain text between them. The lookbehind skips
    # an escape that directly follows '<char>-'.
    _re_char_group = re.compile(r'(?<!.-)(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')

    # Shape check for a \p{...} or \P{...} reference.
    _re_unicode_ref = re.compile(r'\\([pP]){([\w\d-]+)}')

    def __init__(self, xsd_version='1.0', *args):
        self.xsd_version = xsd_version
        self.positive = UnicodeSubset()
        self.negative = UnicodeSubset()
        for spec in args:
            self.add(spec)

    def __repr__(self):
        return '<%s at %d>' % (self.__class__.__name__, id(self))

    def __str__(self):
        # Python 2 byte-string form; rebound to __unicode__ below when PY3 is true.
        return unicode(self).encode("utf-8")

    def __unicode__(self):
        """Returns the group as a bracketed character class expression."""
        if not self.negative:
            return '[%s]' % unicode_type(self.positive)
        if not self.positive:
            return '[^%s]' % unicode_type(self.negative)

        # Both sides are populated: spell out the complement of the negative
        # side, followed by the positive side, in a single positive class.
        allowed = UnicodeSubset(self.negative.complement())
        return '[%s%s]' % (unicode_type(allowed), unicode_type(self.positive))

    if PY3:
        __str__ = __unicode__

    def __contains__(self, char):
        if not self.negative:
            return ord(char) in self.positive
        code_point = ord(char)
        return code_point not in self.negative or code_point in self.positive

    def __iter__(self):
        if not self.negative:
            return iter(sorted(self.positive))
        return (
            code_point for code_point in range(maxunicode + 1)
            if code_point in self.positive or code_point not in self.negative
        )

    def __len__(self):
        # Sum of the sizes of the two stored subsets.
        return sum(len(subset) for subset in (self.positive, self.negative))

    # Operators override
    def __isub__(self, other):
        """In-place subtraction of another group, as used for class subtraction."""
        excluded = other.negative
        if self.negative:
            self.positive |= (excluded - self.negative)
            if excluded:
                self.negative.clear()
        elif excluded:
            self.positive &= excluded
        self.positive -= other.positive
        return self

    def add(self, s):
        """
        Adds the content described by a character group body to the group.

        :param s: the text of a character group: plain text is handed to \
        `positive.update()`, recognized escapes are resolved through \
        `CHARACTER_ESCAPES` or `unicode_subset()`.
        :raises XMLSchemaValueError: if an item starting with `\\p` or `\\P` \
        is not a well-formed Unicode subset reference.
        """
        for token in self._re_char_group.split(s):
            if token in CHARACTER_ESCAPES:
                target = CHARACTER_ESCAPES[token]
                if isinstance(target, string_base_type):
                    self.positive.update(target)
                elif token[-1].islower():
                    self.positive |= target
                else:
                    self.negative |= target
                continue

            is_positive_ref = token.startswith('\\p')
            if is_positive_ref or token.startswith('\\P'):
                if self._re_unicode_ref.search(token) is None:
                    raise XMLSchemaValueError("wrong Unicode subset specification %r" % token)
                # token[3:-1] is the name between the braces.
                subset = unicode_subset(token[3:-1], self.xsd_version > '1.0')
                if is_positive_ref:
                    self.positive |= subset
                else:
                    self.negative |= subset
            else:
                self.positive.update(token)

    def discard(self, s):
        """
        Removes the content described by a character group body from the group.

        Mirrors :meth:`add`: each item is removed from the side of the group
        to which :meth:`add` would have added it.

        :param s: the text of a character group.
        :raises XMLSchemaValueError: if an item starting with `\\p` or `\\P` \
        is not a well-formed Unicode subset reference.
        """
        for token in self._re_char_group.split(s):
            if token in CHARACTER_ESCAPES:
                target = CHARACTER_ESCAPES[token]
                if isinstance(target, string_base_type):
                    self.positive.difference_update(target)
                elif token[-1].islower():
                    self.positive -= target
                else:
                    self.negative -= target
                continue

            is_positive_ref = token.startswith('\\p')
            if is_positive_ref or token.startswith('\\P'):
                if self._re_unicode_ref.search(token) is None:
                    raise XMLSchemaValueError("wrong Unicode subset specification %r" % token)
                subset = unicode_subset(token[3:-1], self.xsd_version > '1.0')
                if is_positive_ref:
                    self.positive -= subset
                else:
                    self.negative -= subset
            else:
                self.positive.difference_update(token)

    def clear(self):
        """Empties both sides of the group."""
        for subset in (self.positive, self.negative):
            subset.clear()

    def complement(self):
        """Swaps the positive and the negative side of the group, in place."""
        previous_positive = self.positive
        self.positive = self.negative
        self.negative = previous_positive
def parse_character_class(xml_regex, class_pos, xsd_version='1.0'):
    """
    Parses a character class of an XML Schema regular expression.

    A class subtraction ('[...-[...]]') is parsed recursively and the nested
    group is subtracted from the outer one.

    :param xml_regex: the source XML Schema regular expression.
    :param class_pos: the position of the character class in the source string, \
    must coincide with a '[' character.
    :param xsd_version: the version of the XML Schema processor ('1.0' or '1.1') \
    that called the regular expression parsing. The same version is used for \
    a subtracted character class.
    :return: an `XsdRegexCharGroup` instance and the position of the ']' character \
    that closes the character class (the caller has to step over it).
    :raises XMLSchemaRegexError: if `class_pos` is not on a '[', if the class is \
    empty, contains an unescaped '[' or an invalid '--' range, or if a class \
    subtraction is not followed by the closing ']'.
    :raises IndexError: if the source string ends before the class is closed; \
    `get_python_regex()` converts this to an `XMLSchemaRegexError`.
    """
    if xml_regex[class_pos] != '[':
        raise XMLSchemaRegexError('not a character class at position %d: %r' % (class_pos, xml_regex))

    pos = class_pos + 1
    if xml_regex[pos] == '^':
        pos += 1
        negative = True
    else:
        negative = False

    # Scan up to the closing ']' or up to the '-[' that starts a subtraction.
    group_pos = pos
    while True:
        if xml_regex[pos] == '[':
            raise XMLSchemaRegexError("'[' is invalid in a character class: %r" % xml_regex)
        elif xml_regex[pos] == '\\':
            # Skip the backslash together with the escaped character.
            pos += 2
        elif xml_regex[pos] == ']' or xml_regex[pos:pos + 2] == '-[':
            if pos == group_pos:
                raise XMLSchemaRegexError(
                    "empty character class at position %d: %r" % (class_pos, xml_regex)
                )
            if _RE_HYPHENS.search(xml_regex[group_pos:pos]) and pos - group_pos > 2:
                raise XMLSchemaRegexError(
                    "invalid character range '--' at position %d: %r" % (class_pos, xml_regex)
                )

            char_group = XsdRegexCharGroup(xsd_version, xml_regex[group_pos:pos])
            if negative:
                char_group.complement()
            break
        else:
            pos += 1

    if xml_regex[pos] != ']':
        # Parse a group subtraction: 'pos' is on the '-' of '-[', step onto '['.
        pos += 1
        # The subtracted class must be parsed with the same XSD version as the
        # outer class, otherwise its \p{...} references are always resolved
        # with the '1.0' default.
        subtracted_group, pos = parse_character_class(xml_regex, pos, xsd_version)
        # The recursive call returns the position of the inner ']': the next
        # character has to be the ']' that closes the outer class.
        pos += 1
        if xml_regex[pos] != ']':
            raise XMLSchemaRegexError(
                "unterminated character group at position %d: %r" % (class_pos, xml_regex)
            )
        char_group -= subtracted_group

    return char_group, pos
def get_python_regex(xml_regex, xsd_version='1.0'):
    """
    Builds a Python regular expression from an XML Schema regular expression.

    The result is wrapped as '^(' ... ')$'. The source is scanned one
    character at a time: '.' becomes '[^\\r\\n]', literal '^' and '$' are
    escaped, character classes are rendered through `parse_character_class()`,
    the escapes \\i, \\I, \\c, \\C, \\p{...} and \\P{...} are expanded to
    explicit character classes, and any other escape is copied unchanged.

    :param xml_regex: the source XML Schema regular expression.
    :param xsd_version: the version of the XML Schema processor ('1.0' or '1.1') \
    that called the regular expression parsing.
    :return: the translated pattern, as a string.
    :raises XMLSchemaRegexError: for a forbidden escape sequence, a misplaced \
    or malformed quantifier, an unterminated or invalid character class, a \
    '(?' extension, an unexpected ']', unbalanced parentheses or a truncated \
    Unicode block escape.
    """
    chunks = ['^(']
    index = 0
    length = len(xml_regex)
    open_groups = 0

    forbidden = _RE_FORBIDDEN_ESCAPES.search(xml_regex)
    if forbidden:
        raise XMLSchemaRegexError(
            "not allowed escape sequence %r at position %d: %r" % (forbidden.group(), forbidden.start(), xml_regex)
        )

    # Single-character quantifiers, and the expansions of the name escapes.
    repeaters = ('?', '+', '*')
    name_classes = {
        'i': '[%s]' % I_SHORTCUT_REPLACE,
        'I': '[^%s]' % I_SHORTCUT_REPLACE,
        'c': '[%s]' % C_SHORTCUT_REPLACE,
        'C': '[^%s]' % C_SHORTCUT_REPLACE,
    }

    while index < length:
        char = xml_regex[index]

        if char == '.':
            chunks.append('[^\r\n]')

        elif char in ('^', '$'):
            # Written out as escaped literals in the Python pattern.
            chunks.append(r'\%s' % char)

        elif char == '[':
            try:
                char_group, index = parse_character_class(xml_regex, index, xsd_version)
            except IndexError:
                # The parser ran past the end of the source string.
                raise XMLSchemaRegexError(
                    "unterminated character group at position %d: %r" % (index, xml_regex)
                )
            else:
                group_text = unicode_type(char_group)
                if group_text == '[^]':
                    chunks.append(r'[\w\W]')
                elif group_text == '[]':
                    chunks.append(r'[^\w\W]')
                else:
                    chunks.append(group_text)

        elif char == '{':
            if index == 0:
                raise XMLSchemaRegexError("unexpected quantifier %r at position %d: %r" % (char, index, xml_regex))
            counted = _RE_QUANTIFIER.match(xml_regex[index:])
            if counted is None:
                raise XMLSchemaRegexError("invalid quantifier %r at position %d: %r" % (char, index, xml_regex))
            quantifier = counted.group()
            chunks.append(quantifier)
            index += len(quantifier)
            if index < length and xml_regex[index] in repeaters:
                raise XMLSchemaRegexError(
                    "unexpected meta character %r at position %d: %r" % (xml_regex[index], index, xml_regex)
                )
            # 'index' is already on the next character: skip the final increment.
            continue

        elif char == '(':
            if xml_regex[index:index + 2] == '(?':
                raise XMLSchemaRegexError("'(?...)' extension notation is not allowed: %r" % xml_regex)
            open_groups += 1
            chunks.append(char)

        elif char == ']':
            raise XMLSchemaRegexError("unexpected meta character %r at position %d: %r" % (char, index, xml_regex))

        elif char == ')':
            if open_groups == 0:
                raise XMLSchemaRegexError("unbalanced parenthesis ')' at position %d: %r" % (index, xml_regex))
            open_groups -= 1
            chunks.append(char)

        elif char in repeaters:
            if index == 0:
                raise XMLSchemaRegexError("unexpected quantifier %r at position %d: %r" % (char, index, xml_regex))
            elif index < length - 1 and xml_regex[index + 1] in repeaters + ('{',):
                raise XMLSchemaRegexError(
                    "unexpected meta character %r at position %d: %r" % (xml_regex[index + 1], index + 1, xml_regex)
                )
            chunks.append(char)

        elif char == '\\':
            # Move onto the escaped character, if there is one.
            index += 1
            if index >= length:
                chunks.append('\\')
            elif xml_regex[index] in name_classes:
                chunks.append(name_classes[xml_regex[index]])
            elif xml_regex[index] in 'pP':
                escape_start = index - 1
                try:
                    if xml_regex[index + 1] != '{':
                        raise XMLSchemaValueError("a '{' expected, found %r." % xml_regex[index + 1])
                    while xml_regex[index] != '}':
                        index += 1
                except (IndexError, ValueError):
                    raise XMLSchemaRegexError(
                        "truncated unicode block escape at position %d: %r" % (escape_start, xml_regex))

                # The name lies between '\p{' and the '}' that 'index' is on.
                subset = unicode_subset(xml_regex[escape_start + 3:index], xsd_version > '1.0')
                if xml_regex[escape_start + 1] == 'p':
                    chunks.append('[%s]' % subset)
                else:
                    chunks.append('[^%s]' % subset)
            else:
                chunks.append('\\%s' % xml_regex[index])

        else:
            chunks.append(char)

        index += 1

    if open_groups > 0:
        raise XMLSchemaRegexError("unterminated subpattern in expression: %r" % xml_regex)
    chunks.append(r')$')
    return ''.join(chunks)