368 lines
13 KiB
Python
368 lines
13 KiB
Python
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
|
|
# All rights reserved.
|
|
# This file is distributed under the terms of the MIT License.
|
|
# See the file 'LICENSE' in the root directory of the present
|
|
# distribution, or http://opensource.org/licenses/MIT.
|
|
#
|
|
# @author Davide Brunato <brunato@sissa.it>
|
|
#
|
|
"""
|
|
Parse and translate XML Schema regular expressions to Python regex syntax.
|
|
"""
|
|
from __future__ import unicode_literals
|
|
import re
|
|
from itertools import chain
|
|
from sys import maxunicode
|
|
|
|
from .compat import PY3, unicode_type, string_base_type, MutableSet
|
|
from .exceptions import XMLSchemaValueError, XMLSchemaRegexError
|
|
from .codepoints import UnicodeSubset, UNICODE_CATEGORIES, unicode_subset
|
|
|
|
_RE_HYPHENS = re.compile(r'(?<!\\)--')
|
|
_RE_QUANTIFIER = re.compile(r'{\d+(,(\d+)?)?}')
|
|
_RE_FORBIDDEN_ESCAPES = re.compile(
|
|
r'(?<!\\)\\(U[0-9a-fA-F]{8}|u[0-9a-fA-F]{4}|x[0-9a-fA-F]{2}|o{\d+}|\d+|A|Z|z|B|b|o)'
|
|
)
|
|
|
|
|
|
I_SHORTCUT_REPLACE = (
|
|
":A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF"
|
|
"\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
|
|
)
|
|
|
|
C_SHORTCUT_REPLACE = (
|
|
"-.0-9:A-Z_a-z\u00B7\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u037D\u037F-\u1FFF\u200C-"
|
|
"\u200D\u203F\u2040\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
|
|
)
|
|
|
|
S_SHORTCUT_SET = UnicodeSubset(' \n\t\r')
|
|
D_SHORTCUT_SET = UnicodeSubset()
|
|
D_SHORTCUT_SET._code_points = UNICODE_CATEGORIES['Nd'].code_points
|
|
I_SHORTCUT_SET = UnicodeSubset(I_SHORTCUT_REPLACE)
|
|
C_SHORTCUT_SET = UnicodeSubset(C_SHORTCUT_REPLACE)
|
|
W_SHORTCUT_SET = UnicodeSubset(chain(
|
|
UNICODE_CATEGORIES['L'].code_points,
|
|
UNICODE_CATEGORIES['M'].code_points,
|
|
UNICODE_CATEGORIES['N'].code_points,
|
|
UNICODE_CATEGORIES['S'].code_points
|
|
))
|
|
|
|
# Single and Multi character escapes
|
|
CHARACTER_ESCAPES = {
|
|
# Single-character escapes
|
|
'\\n': '\n',
|
|
'\\r': '\r',
|
|
'\\t': '\t',
|
|
'\\|': '|',
|
|
'\\.': '.',
|
|
'\\-': '-',
|
|
'\\^': '^',
|
|
'\\?': '?',
|
|
'\\*': '*',
|
|
'\\+': '+',
|
|
'\\{': '{',
|
|
'\\}': '}',
|
|
'\\(': '(',
|
|
'\\)': ')',
|
|
'\\[': '[',
|
|
'\\]': ']',
|
|
'\\\\': '\\',
|
|
|
|
# Multi-character escapes
|
|
'\\s': S_SHORTCUT_SET,
|
|
'\\S': S_SHORTCUT_SET,
|
|
'\\d': D_SHORTCUT_SET,
|
|
'\\D': D_SHORTCUT_SET,
|
|
'\\i': I_SHORTCUT_SET,
|
|
'\\I': I_SHORTCUT_SET,
|
|
'\\c': C_SHORTCUT_SET,
|
|
'\\C': C_SHORTCUT_SET,
|
|
'\\w': W_SHORTCUT_SET,
|
|
'\\W': W_SHORTCUT_SET,
|
|
}
|
|
|
|
|
|
class XsdRegexCharGroup(MutableSet):
|
|
"""
|
|
A set subclass to represent XML Schema regex character groups.
|
|
"""
|
|
_re_char_group = re.compile(r'(?<!.-)(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
|
|
_re_unicode_ref = re.compile(r'\\([pP]){([\w\d-]+)}')
|
|
|
|
def __init__(self, xsd_version='1.0', *args):
|
|
self.xsd_version = xsd_version
|
|
self.positive = UnicodeSubset()
|
|
self.negative = UnicodeSubset()
|
|
for char in args:
|
|
self.add(char)
|
|
|
|
def __repr__(self):
|
|
return '<%s at %d>' % (self.__class__.__name__, id(self))
|
|
|
|
def __str__(self):
|
|
return unicode(self).encode("utf-8")
|
|
|
|
def __unicode__(self):
|
|
if not self.negative:
|
|
return '[%s]' % unicode_type(self.positive)
|
|
elif not self.positive:
|
|
return '[^%s]' % unicode_type(self.negative)
|
|
else:
|
|
return '[%s%s]' % (
|
|
unicode_type(UnicodeSubset(self.negative.complement())), unicode_type(self.positive)
|
|
)
|
|
|
|
if PY3:
|
|
__str__ = __unicode__
|
|
|
|
def __contains__(self, char):
|
|
if self.negative:
|
|
return ord(char) not in self.negative or ord(char) in self.positive
|
|
return ord(char) in self.positive
|
|
|
|
def __iter__(self):
|
|
if self.negative:
|
|
return (
|
|
cp for cp in range(maxunicode + 1)
|
|
if cp in self.positive or cp not in self.negative
|
|
)
|
|
return iter(sorted(self.positive))
|
|
|
|
def __len__(self):
|
|
return len(self.positive) + len(self.negative)
|
|
|
|
# Operators override
|
|
def __isub__(self, other):
|
|
if self.negative:
|
|
self.positive |= (other.negative - self.negative)
|
|
if other.negative:
|
|
self.negative.clear()
|
|
elif other.negative:
|
|
self.positive &= other.negative
|
|
self.positive -= other.positive
|
|
return self
|
|
|
|
def add(self, s):
|
|
for part in self._re_char_group.split(s):
|
|
if part in CHARACTER_ESCAPES:
|
|
value = CHARACTER_ESCAPES[part]
|
|
if isinstance(value, string_base_type):
|
|
self.positive.update(value)
|
|
elif part[-1].islower():
|
|
self.positive |= value
|
|
else:
|
|
self.negative |= value
|
|
elif part.startswith('\\p'):
|
|
if self._re_unicode_ref.search(part) is None:
|
|
raise XMLSchemaValueError("wrong Unicode subset specification %r" % part)
|
|
self.positive |= unicode_subset(part[3:-1], self.xsd_version > '1.0')
|
|
elif part.startswith('\\P'):
|
|
if self._re_unicode_ref.search(part) is None:
|
|
raise XMLSchemaValueError("wrong Unicode subset specification %r" % part)
|
|
self.negative |= unicode_subset(part[3:-1], self.xsd_version > '1.0')
|
|
else:
|
|
self.positive.update(part)
|
|
|
|
def discard(self, s):
|
|
for part in self._re_char_group.split(s):
|
|
if part in CHARACTER_ESCAPES:
|
|
value = CHARACTER_ESCAPES[part]
|
|
if isinstance(value, string_base_type):
|
|
self.positive.difference_update(value)
|
|
elif part[-1].islower():
|
|
self.positive -= value
|
|
else:
|
|
self.negative -= value
|
|
elif part.startswith('\\p'):
|
|
if self._re_unicode_ref.search(part) is None:
|
|
raise XMLSchemaValueError("wrong Unicode subset specification %r" % part)
|
|
self.positive -= unicode_subset(part[3:-1], self.xsd_version > '1.0')
|
|
elif part.startswith('\\P'):
|
|
if self._re_unicode_ref.search(part) is None:
|
|
raise XMLSchemaValueError("wrong Unicode subset specification %r" % part)
|
|
self.negative -= unicode_subset(part[3:-1], self.xsd_version > '1.0')
|
|
else:
|
|
self.positive.difference_update(part)
|
|
|
|
def clear(self):
|
|
self.positive.clear()
|
|
self.negative.clear()
|
|
|
|
def complement(self):
|
|
self.positive, self.negative = self.negative, self.positive
|
|
|
|
|
|
def parse_character_class(xml_regex, class_pos, xsd_version='1.0'):
|
|
"""
|
|
Parses a character class of an XML Schema regular expression.
|
|
|
|
:param xml_regex: the source XML Schema regular expression.
|
|
:param class_pos: the position of the character class in the source string, \
|
|
must coincide with a '[' character.
|
|
:param xsd_version: the version of the XML Schema processor ('1.0' or '1.1') \
|
|
that called the regular expression parsing.
|
|
:return: an `XsdRegexCharGroup` instance and the first position after the character class.
|
|
"""
|
|
if xml_regex[class_pos] != '[':
|
|
raise XMLSchemaRegexError('not a character class at position %d: %r' % (class_pos, xml_regex))
|
|
|
|
pos = class_pos + 1
|
|
if xml_regex[pos] == '^':
|
|
pos += 1
|
|
negative = True
|
|
else:
|
|
negative = False
|
|
|
|
group_pos = pos
|
|
while True:
|
|
if xml_regex[pos] == '[':
|
|
raise XMLSchemaRegexError("'[' is invalid in a character class: %r" % xml_regex)
|
|
elif xml_regex[pos] == '\\':
|
|
pos += 2
|
|
elif xml_regex[pos] == ']' or xml_regex[pos:pos + 2] == '-[':
|
|
if pos == group_pos:
|
|
raise XMLSchemaRegexError(
|
|
"empty character class at position %d: %r" % (class_pos, xml_regex)
|
|
)
|
|
if _RE_HYPHENS.search(xml_regex[group_pos:pos]) and pos - group_pos > 2:
|
|
raise XMLSchemaRegexError(
|
|
"invalid character range '--' at position %d: %r" % (class_pos, xml_regex)
|
|
)
|
|
|
|
char_group = XsdRegexCharGroup(xsd_version, xml_regex[group_pos:pos])
|
|
if negative:
|
|
char_group.complement()
|
|
break
|
|
else:
|
|
pos += 1
|
|
|
|
if xml_regex[pos] != ']':
|
|
# Parse a group subtraction
|
|
pos += 1
|
|
subtracted_group, pos = parse_character_class(xml_regex, pos)
|
|
pos += 1
|
|
if xml_regex[pos] != ']':
|
|
raise XMLSchemaRegexError(
|
|
"unterminated character group at position %d: %r" % (class_pos, xml_regex)
|
|
)
|
|
char_group -= subtracted_group
|
|
|
|
return char_group, pos
|
|
|
|
|
|
def get_python_regex(xml_regex, xsd_version='1.0'):
|
|
"""
|
|
Translates an XML regex expression to a Python compatible expression.
|
|
|
|
:param xml_regex: the source XML Schema regular expression.
|
|
:param xsd_version: the version of the XML Schema processor ('1.0' or '1.1') \
|
|
that called the regular expression parsing.
|
|
"""
|
|
regex = ['^(']
|
|
pos = 0
|
|
xml_regex_len = len(xml_regex)
|
|
nested_groups = 0
|
|
|
|
match = _RE_FORBIDDEN_ESCAPES.search(xml_regex)
|
|
if match:
|
|
raise XMLSchemaRegexError(
|
|
"not allowed escape sequence %r at position %d: %r" % (match.group(), match.span()[0], xml_regex)
|
|
)
|
|
|
|
while pos < xml_regex_len:
|
|
ch = xml_regex[pos]
|
|
if ch == '.':
|
|
regex.append('[^\r\n]')
|
|
elif ch in ('^', '$'):
|
|
regex.append(r'\%s' % ch)
|
|
elif ch == '[':
|
|
try:
|
|
char_group, pos = parse_character_class(xml_regex, pos, xsd_version)
|
|
except IndexError:
|
|
raise XMLSchemaRegexError(
|
|
"unterminated character group at position %d: %r" % (pos, xml_regex)
|
|
)
|
|
else:
|
|
char_group_repr = unicode_type(char_group)
|
|
if char_group_repr == '[^]':
|
|
regex.append(r'[\w\W]')
|
|
elif char_group_repr == '[]':
|
|
regex.append(r'[^\w\W]')
|
|
else:
|
|
regex.append(char_group_repr)
|
|
|
|
elif ch == '{':
|
|
if pos == 0:
|
|
raise XMLSchemaRegexError("unexpected quantifier %r at position %d: %r" % (ch, pos, xml_regex))
|
|
match = _RE_QUANTIFIER.match(xml_regex[pos:])
|
|
if match is None:
|
|
raise XMLSchemaRegexError("invalid quantifier %r at position %d: %r" % (ch, pos, xml_regex))
|
|
regex.append(match.group())
|
|
pos += len(match.group())
|
|
if pos < xml_regex_len and xml_regex[pos] in ('?', '+', '*'):
|
|
raise XMLSchemaRegexError(
|
|
"unexpected meta character %r at position %d: %r" % (xml_regex[pos], pos, xml_regex)
|
|
)
|
|
continue
|
|
|
|
elif ch == '(':
|
|
if xml_regex[pos:pos + 2] == '(?':
|
|
raise XMLSchemaRegexError("'(?...)' extension notation is not allowed: %r" % xml_regex)
|
|
nested_groups += 1
|
|
regex.append(ch)
|
|
elif ch == ']':
|
|
raise XMLSchemaRegexError("unexpected meta character %r at position %d: %r" % (ch, pos, xml_regex))
|
|
elif ch == ')':
|
|
if nested_groups == 0:
|
|
raise XMLSchemaRegexError("unbalanced parenthesis ')' at position %d: %r" % (pos, xml_regex))
|
|
nested_groups -= 1
|
|
regex.append(ch)
|
|
elif ch in ('?', '+', '*'):
|
|
if pos == 0:
|
|
raise XMLSchemaRegexError("unexpected quantifier %r at position %d: %r" % (ch, pos, xml_regex))
|
|
elif pos < xml_regex_len - 1 and xml_regex[pos + 1] in ('?', '+', '*', '{'):
|
|
raise XMLSchemaRegexError(
|
|
"unexpected meta character %r at position %d: %r" % (xml_regex[pos + 1], pos + 1, xml_regex)
|
|
)
|
|
regex.append(ch)
|
|
elif ch == '\\':
|
|
pos += 1
|
|
if pos >= xml_regex_len:
|
|
regex.append('\\')
|
|
elif xml_regex[pos] == 'i':
|
|
regex.append('[%s]' % I_SHORTCUT_REPLACE)
|
|
elif xml_regex[pos] == 'I':
|
|
regex.append('[^%s]' % I_SHORTCUT_REPLACE)
|
|
elif xml_regex[pos] == 'c':
|
|
regex.append('[%s]' % C_SHORTCUT_REPLACE)
|
|
elif xml_regex[pos] == 'C':
|
|
regex.append('[^%s]' % C_SHORTCUT_REPLACE)
|
|
elif xml_regex[pos] in 'pP':
|
|
block_pos = pos - 1
|
|
try:
|
|
if xml_regex[pos + 1] != '{':
|
|
raise XMLSchemaValueError("a '{' expected, found %r." % xml_regex[pos + 1])
|
|
while xml_regex[pos] != '}':
|
|
pos += 1
|
|
except (IndexError, ValueError):
|
|
raise XMLSchemaRegexError(
|
|
"truncated unicode block escape at position %d: %r" % (block_pos, xml_regex))
|
|
|
|
p_shortcut_set = unicode_subset(xml_regex[block_pos + 3:pos], xsd_version > '1.0')
|
|
if xml_regex[block_pos + 1] == 'p':
|
|
regex.append('[%s]' % p_shortcut_set)
|
|
else:
|
|
regex.append('[^%s]' % p_shortcut_set)
|
|
else:
|
|
regex.append('\\%s' % xml_regex[pos])
|
|
else:
|
|
regex.append(ch)
|
|
pos += 1
|
|
|
|
if nested_groups > 0:
|
|
raise XMLSchemaRegexError("unterminated subpattern in expression: %r" % xml_regex)
|
|
regex.append(r')$')
|
|
return ''.join(regex)
|