# -*- coding: utf-8 -*-
#
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato
#
"""
This module defines Unicode character categories and blocks, defined as sets of code points.
"""
from __future__ import unicode_literals

import json
import os
from sys import maxunicode

from .compat import PY3, unicode_chr, string_base_type, Iterable, MutableSet
from .exceptions import XMLSchemaValueError, XMLSchemaTypeError, XMLSchemaRegexError

CHARACTER_GROUP_ESCAPED = {ord(c) for c in r'-|.^?*+{}()[]\\'}
"""Code Points of escaped chars in a character group."""

UCS4_MAXUNICODE = 1114111


def code_point_order(cp):
    """Ordering function for code points: an integer sorts as itself, a range by its lower bound."""
    return cp if isinstance(cp, int) else cp[0]


def code_point_reverse_order(cp):
    """Reverse ordering function: an integer sorts as itself, a range by its last included value."""
    return cp if isinstance(cp, int) else cp[1] - 1


def iter_code_points(code_points, reverse=False):
    """
    Iterates a code points sequence. Contiguous or overlapping code points
    are merged into ranges. Ranges are half-open: `(start, end)` includes
    `start` and excludes `end`.

    :param code_points: an iterable with code points and code point ranges.
    :param reverse: if `True` reverses the order of the sequence.
    :return: yields code points or code point ranges.
    """
    if reverse:
        code_points = sorted(code_points, key=code_point_reverse_order, reverse=True)
    else:
        code_points = sorted(code_points, key=code_point_order)

    start_cp = end_cp = None
    for cp in code_points:
        if isinstance(cp, int):
            cp = cp, cp + 1

        if start_cp is None:
            start_cp, end_cp = cp
            continue
        elif reverse:
            # Descending order: the pending range can only grow downwards.
            if start_cp <= cp[1]:
                start_cp = min(start_cp, cp[0])
                continue
        elif end_cp >= cp[0]:
            # Ascending order: the pending range can only grow upwards.
            end_cp = max(end_cp, cp[1])
            continue

        # The current item is detached from the pending range: emit the
        # pending one (as a plain integer if it is one code point wide).
        if end_cp > start_cp + 1:
            yield start_cp, end_cp
        else:
            yield start_cp
        start_cp, end_cp = cp

    if start_cp is not None:
        if end_cp > start_cp + 1:
            yield start_cp, end_cp
        else:
            yield start_cp


def check_code_point(cp):
    """
    Checks a code point or code point range.

    :param cp: an integer code point or a two-items sequence `(start, end)` \
    that represents the half-open range of code points from `start` to `end - 1`.
    :return: a valid code point range, as a couple for an integer argument, \
    otherwise the argument itself.
    :raises XMLSchemaValueError: if the argument is not a valid code point or range.
    """
    if isinstance(cp, int):
        if not (0 <= cp <= maxunicode):
            raise XMLSchemaValueError("not a Unicode code point: %r" % cp)
        return cp, cp + 1

    # Type checks go first, so that non-integer bounds are reported with the
    # library error instead of a TypeError raised by the comparison.
    if not isinstance(cp[0], int) or not isinstance(cp[1], int) \
            or not (0 <= cp[0] < cp[1] <= maxunicode + 1):
        # The argument is wrapped in a tuple because it can be a tuple itself.
        raise XMLSchemaValueError("not a Unicode code point range: %r" % (cp,))
    return cp


def _escape_code_point(cp):
    """Returns the character of a code point, backslash-escaped if in CHARACTER_GROUP_ESCAPED."""
    if cp in CHARACTER_GROUP_ESCAPED:
        return r'\%s' % unicode_chr(cp)
    return unicode_chr(cp)


def code_point_repr(cp):
    """
    Returns the string representation of a code point.

    :param cp: an integer or a tuple with at least two integers. Values must be \
    in interval [0, sys.maxunicode].
    :return: a single (maybe escaped) character for an integer or for a range \
    that is one code point wide, two characters for a range of two code points, \
    otherwise the `start-end` form.
    """
    if isinstance(cp, int):
        return _escape_code_point(cp)

    start_cp, end_cp = cp[0], cp[1] - 1  # Character ranges include the right bound
    start_char = _escape_code_point(start_cp)
    if end_cp == start_cp:
        # One code point wide range: do not emit the same character twice.
        return start_char

    end_char = _escape_code_point(end_cp)
    if end_cp > start_cp + 1:
        return '%s-%s' % (start_char, end_char)
    return start_char + end_char


def iterparse_character_group(s, expand_ranges=False):
    """
    Parse a regex character group part, generating a sequence of code points
    and code points ranges. An unescaped hyphen (-) that is not at the start
    or at the end is interpreted as range specifier.

    :param s: a string representing a character group part.
    :param expand_ranges: if set to `True` then expands character ranges.
    :return: yields integers or couples of integers.
    :raises XMLSchemaRegexError: for an unescaped square bracket or for a bad range.
    """
    escaped = False
    on_range = False
    char = None
    length = len(s)
    string_iter = iter(range(len(s)))
    for k in string_iter:
        if k == 0:
            char = s[0]
            if char == '\\':
                escaped = True
            elif char in r'[]' and length > 1:
                raise XMLSchemaRegexError("bad character %r at position 0" % char)
            elif expand_ranges:
                yield ord(char)
            elif length <= 2 or s[1] != '-':
                # Otherwise the character starts a range, emitted as a couple.
                yield ord(char)
        elif s[k] == '-':
            if escaped or (k == length - 1):
                char = s[k]
                yield ord(char)
                escaped = False
            elif on_range:
                # A hyphen that follows a range is a literal hyphen.
                char = s[k]
                yield ord(char)
                on_range = False
            else:
                # Parse character range
                on_range = True
                try:
                    k = next(string_iter)
                    end_char = s[k]
                    if end_char == '\\' and (k < length - 1):
                        if s[k + 1] in r'-|.^?*+{}()[]':
                            k = next(string_iter)
                            end_char = s[k]
                        elif s[k + 1] in r'sSdDiIcCwWpP':
                            msg = "bad character range '%s-\\%s' at position %d: %r" % (char, s[k + 1], k - 2, s)
                            raise XMLSchemaRegexError(msg)
                except StopIteration:
                    msg = "bad character range '%s-%s' at position %d: %r" % (char, s[-1], k - 2, s)
                    raise XMLSchemaRegexError(msg)

                if ord(char) > ord(end_char):
                    msg = "bad character range '%s-%s' at position %d: %r" % (char, end_char, k - 2, s)
                    raise XMLSchemaRegexError(msg)
                elif expand_ranges:
                    # The start character has been already yielded.
                    for cp in range(ord(char) + 1, ord(end_char) + 1):
                        yield cp
                else:
                    yield ord(char), ord(end_char) + 1

        elif s[k] in r'|.^?*+{}()':
            if escaped:
                escaped = False
            on_range = False
            char = s[k]
            yield ord(char)
        elif s[k] in r'[]':
            if not escaped and length > 1:
                raise XMLSchemaRegexError("bad character %r at position %d" % (s[k], k))
            escaped = on_range = False
            char = s[k]
            # When expanding ranges the start character is always yielded here,
            # as for position 0, because the range yields only what follows it.
            if expand_ranges or k >= length - 2 or s[k + 1] != '-':
                yield ord(char)
        elif s[k] == '\\':
            if escaped:
                escaped = on_range = False
                char = '\\'
                yield ord(char)
            else:
                escaped = True
        else:
            if escaped:
                # A backslash before a plain character is a literal backslash.
                escaped = False
                yield ord('\\')
            on_range = False
            char = s[k]
            # See the note on the square brackets branch for `expand_ranges`.
            if expand_ranges or k >= length - 2 or s[k + 1] != '-':
                yield ord(char)
    if escaped:
        yield ord('\\')


class UnicodeSubset(MutableSet):
    """
    Represent a subset of Unicode code points, implemented with an ordered list of
    integer values and ranges. It manages character ranges for adding or for
    discarding elements from a string and for a compressed representation.

    Ranges are stored as half-open couples `(start, end)`. The optional positional
    argument can be another `UnicodeSubset`, a string representing a character
    group part or an iterable of code points and code point ranges.
    """
    def __init__(self, *args, **kwargs):
        if len(args) > 1:
            raise XMLSchemaTypeError(
                '%s expected at most 1 arguments, got %d' % (self.__class__.__name__, len(args))
            )
        if kwargs:
            raise XMLSchemaTypeError(
                '%s does not take keyword arguments' % self.__class__.__name__
            )

        if not args:
            self._code_points = list()
        elif isinstance(args[0], UnicodeSubset):
            # list() instead of list.copy(), that is not available on Python 2
            self._code_points = list(args[0].code_points)
        else:
            self._code_points = list()
            self.update(args[0])

    @classmethod
    def fromlist(cls, code_points):
        """
        Builds a subset from a list of code points and ranges. The items are only
        sorted by lower bound: they are neither validated nor merged.
        """
        subset = cls()
        subset._code_points = sorted(code_points, key=code_point_order)
        return subset

    @property
    def code_points(self):
        """The internal list of code points and code point ranges (not a copy)."""
        return self._code_points

    def __repr__(self):
        return "<%s %r at %d>" % (self.__class__.__name__, str(self._code_points), id(self))

    def __str__(self):
        # Python 2 only: for Python 3 this is replaced by __unicode__ below.
        return unicode(self).encode("utf-8")

    def __unicode__(self):
        return ''.join(code_point_repr(cp) for cp in self._code_points)

    if PY3:
        __str__ = __unicode__

    def copy(self):
        return self.__copy__()

    def __copy__(self):
        return UnicodeSubset(self._code_points)

    def __reversed__(self):
        for item in reversed(self._code_points):
            if isinstance(item, int):
                yield item
            else:
                for cp in reversed(range(item[0], item[1])):
                    yield cp

    def complement(self):
        """
        Iterates the code points and the ranges that are not in the subset.

        :raises XMLSchemaValueError: if the internal list is found unordered.
        """
        last_cp = 0
        for cp in self._code_points:
            if last_cp > maxunicode:
                break
            elif isinstance(cp, int):
                cp = cp, cp + 1

            diff = cp[0] - last_cp
            if diff > 2:
                yield last_cp, cp[0]
            elif diff == 2:
                yield last_cp
                yield last_cp + 1
            elif diff == 1:
                yield last_cp
            elif diff != 0:
                raise XMLSchemaValueError("instance code points unordered")
            last_cp = cp[1]

        if last_cp < maxunicode:
            yield last_cp, maxunicode + 1
        elif last_cp == maxunicode:
            yield maxunicode

    def iter_characters(self):
        """Iterates the subset yielding characters instead of code points."""
        # unicode_chr is the helper used by the rest of the module for
        # converting code points (the builtin chr is byte-limited on Python 2).
        return map(unicode_chr, self.__iter__())

    #
    # MutableSet's abstract methods implementation
    def __contains__(self, value):
        if not isinstance(value, int):
            try:
                value = ord(value)
            except TypeError:
                # The value is wrapped in a tuple because it can be a tuple itself.
                raise XMLSchemaTypeError(
                    "%r: argument must be a code point or a character." % (value,)
                )

        # Linear scan that relies on the ordering of the internal list.
        for cp in self._code_points:
            if not isinstance(cp, int):
                if cp[0] > value:
                    return False
                elif cp[1] <= value:
                    continue
                else:
                    return True
            elif cp > value:
                return False
            elif cp == value:
                return True
        return False

    def __iter__(self):
        for cp in self._code_points:
            if isinstance(cp, int):
                yield cp
            else:
                for k in range(*cp):
                    yield k

    def __len__(self):
        # Sum of the items' widths: the same value obtained counting the
        # iterated code points, without expanding the ranges.
        return sum(
            1 if isinstance(cp, int) else len(range(*cp)) for cp in self._code_points
        )

    def update(self, *others):
        """
        Adds to the subset the code points of the arguments. A string argument is
        parsed as a character group part, other arguments are processed as iterables
        of code points and code point ranges.
        """
        for value in others:
            if isinstance(value, string_base_type):
                for cp in iter_code_points(iterparse_character_group(value), reverse=True):
                    self.add(cp)
            else:
                for cp in iter_code_points(value, reverse=True):
                    self.add(cp)

    def add(self, value):
        """
        Adds a code point or a code point range to the subset.

        :param value: an integer or a couple of integers `(start, end)`.
        :raises XMLSchemaValueError: if the argument is not a valid code point or range.
        """
        start_value, end_value = check_code_point(value)
        code_points = self._code_points
        last_index = len(code_points) - 1
        for k, cp in enumerate(code_points):
            if isinstance(cp, int):
                cp = cp, cp + 1

            if end_value < cp[0]:
                # Detached and lower than the current item: insert before it
                code_points.insert(k, value if isinstance(value, int) else tuple(value))
            elif start_value > cp[1]:
                # Detached and higher than the current item: try the next one
                continue
            elif end_value > cp[1]:
                # Extends the current item upwards, at most until the next item
                if k == last_index:
                    code_points[k] = min(cp[0], start_value), end_value
                else:
                    next_cp = code_points[k + 1]
                    higher_bound = next_cp if isinstance(next_cp, int) else next_cp[0]
                    if end_value <= higher_bound:
                        code_points[k] = min(cp[0], start_value), end_value
                    else:
                        code_points[k] = min(cp[0], start_value), higher_bound
                        start_value = higher_bound
                        continue
            elif start_value < cp[0]:
                # Extends the current item downwards
                code_points[k] = start_value, cp[1]
            break
        else:
            # Higher than every item of the list (or the list is empty)
            self._code_points.append(tuple(value) if isinstance(value, list) else value)

    def difference_update(self, *others):
        """
        Removes from the subset the code points of the arguments. A string argument
        is parsed as a character group part, other arguments are processed as
        iterables of code points and code point ranges.
        """
        for value in others:
            if isinstance(value, string_base_type):
                for cp in iter_code_points(iterparse_character_group(value), reverse=True):
                    self.discard(cp)
            else:
                for cp in iter_code_points(value, reverse=True):
                    self.discard(cp)

    def discard(self, value):
        """
        Removes a code point or a code point range from the subset. Code points
        that are not in the subset are ignored.

        :param value: an integer or a couple of integers `(start, end)`.
        :raises XMLSchemaValueError: if the argument is not a valid code point or range.
        """
        start_cp, end_cp = check_code_point(value)
        code_points = self._code_points
        # Backward scan, so deletions and insertions don't shift the pending indexes
        for k in reversed(range(len(code_points))):
            cp = code_points[k]
            if isinstance(cp, int):
                cp = cp, cp + 1

            if start_cp >= cp[1]:
                # The item and all the previous ones are lower than the argument
                break
            elif end_cp >= cp[1]:
                # The argument covers the upper part of the item (or all of it)
                if start_cp <= cp[0]:
                    del code_points[k]
                elif start_cp - cp[0] > 1:
                    code_points[k] = cp[0], start_cp
                else:
                    code_points[k] = cp[0]
            elif end_cp > cp[0]:
                if start_cp <= cp[0]:
                    # The argument covers only the lower part of the item
                    if cp[1] - end_cp > 1:
                        code_points[k] = end_cp, cp[1]
                    else:
                        code_points[k] = cp[1] - 1
                else:
                    # The argument is inside the item: split it in two parts
                    if cp[1] - end_cp > 1:
                        code_points.insert(k + 1, (end_cp, cp[1]))
                    else:
                        code_points.insert(k + 1, cp[1] - 1)
                    if start_cp - cp[0] > 1:
                        code_points[k] = cp[0], start_cp
                    else:
                        code_points[k] = cp[0]

    #
    # MutableSet's mixin methods override
    def clear(self):
        del self._code_points[:]

    def __eq__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            return self._code_points == other._code_points
        else:
            return self._code_points == other

    def __ior__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            other = reversed(other._code_points)
        else:
            other = iter_code_points(other, reverse=True)

        for cp in other:
            self.add(cp)
        return self

    def __isub__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            other = reversed(other._code_points)
        else:
            other = iter_code_points(other, reverse=True)

        for cp in other:
            self.discard(cp)
        return self

    def __sub__(self, other):
        obj = self.copy()
        return obj.__isub__(other)

    # NOTE(review): the reflected subtraction is an alias of __sub__, so
    # `other - subset` is evaluated as `subset - other`. Confirm it's intended.
    __rsub__ = __sub__

    def __iand__(self, other):
        for value in (self - other):
            self.discard(value)
        return self

    def __ixor__(self, other):
        if other is self:
            self.clear()
            return self
        elif not isinstance(other, Iterable):
            return NotImplemented
        elif not isinstance(other, UnicodeSubset):
            other = UnicodeSubset(other)

        for value in other:
            if value in self:
                self.discard(value)
            else:
                self.add(value)
        return self


def get_unicodedata_categories():
    """
    Extracts Unicode categories information from unicodedata library. Each category is
    represented with an ordered list containing code points and code point ranges.
    Each item is registered both in its two-letters category and in the one-letter
    major category.

    :return: a dictionary with category names as keys and lists as values.
    """
    from unicodedata import category

    categories = {k: [] for k in (
        'C', 'Cc', 'Cf', 'Cs', 'Co', 'Cn',
        'L', 'Lu', 'Ll', 'Lt', 'Lm', 'Lo',
        'M', 'Mn', 'Mc', 'Me',
        'N', 'Nd', 'Nl', 'No',
        'P', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po',
        'S', 'Sm', 'Sc', 'Sk', 'So',
        'Z', 'Zs', 'Zl', 'Zp'
    )}

    # Scans the code points collecting the runs that share the same category.
    minor_category = 'Cc'
    start_cp, next_cp = 0, 1
    for cp in range(maxunicode + 1):
        cp_category = category(unicode_chr(cp))
        if cp_category != minor_category:
            # A run is closed: store it as a range if wider than one code point
            item = (start_cp, cp) if cp > next_cp else start_cp
            categories[minor_category].append(item)
            categories[minor_category[0]].append(item)

            minor_category = cp_category
            start_cp, next_cp = cp, cp + 1

    # Stores the last run, that is closed by the end of the code space.
    item = start_cp if next_cp == maxunicode + 1 else (start_cp, maxunicode + 1)
    categories[minor_category].append(item)
    categories[minor_category[0]].append(item)

    return categories


def save_unicode_categories(filename=None):
    """
    Save Unicode categories to a JSON file.

    :param filename: the JSON file to save. If it's `None` uses the predefined \
    filename 'unicode_categories.json' and try to save in the directory of this module.
    """
    if filename is None:
        filename = os.path.join(os.path.dirname(__file__), 'unicode_categories.json')

    print("Saving Unicode categories to %r" % filename)
    # The data is built before opening the file, so a failure of the
    # extraction doesn't leave a truncated file.
    categories = get_unicodedata_categories()
    with open(filename, 'w') as fp:
        json.dump(categories, fp)


def build_unicode_categories(filename=None):
    """
    Builds the Unicode categories as `UnicodeSubset` instances. For a fast building a pre-built
    JSON file with Unicode categories data can be used. If the JSON file is missing or is not
    accessible the categories data is rebuild using `unicodedata.category()` API.

    :param filename: the name of the JSON file to load for a fast building of the categories. \
    If not provided the predefined filename 'unicode_categories.json' is used.
    :return: a dictionary that associates Unicode category names with `UnicodeSubset` instances.
    """
    if maxunicode < UCS4_MAXUNICODE:
        categories = get_unicodedata_categories()  # for Python 2.7
    else:
        if filename is None:
            filename = os.path.join(os.path.dirname(__file__), 'unicode_categories.json')
        try:
            with open(filename, 'r') as fp:
                categories = json.load(fp)
        except (IOError, SystemError, ValueError):
            categories = get_unicodedata_categories()
        else:
            # Rebuilds the data if a category of the file is empty. The check
            # has to be done on the values: the keys are never empty.
            if any(not v for v in categories.values()):
                categories = get_unicodedata_categories()

    return {k: UnicodeSubset.fromlist(v) for k, v in categories.items()}


UNICODE_CATEGORIES = build_unicode_categories()


UNICODE_BLOCKS = {
    'IsBasicLatin': UnicodeSubset('\u0000-\u007F'),
    'IsLatin-1Supplement': UnicodeSubset('\u0080-\u00FF'),
    'IsLatinExtended-A': UnicodeSubset('\u0100-\u017F'),
    'IsLatinExtended-B': UnicodeSubset('\u0180-\u024F'),
    'IsIPAExtensions': UnicodeSubset('\u0250-\u02AF'),
    'IsSpacingModifierLetters': UnicodeSubset('\u02B0-\u02FF'),
    'IsCombiningDiacriticalMarks': UnicodeSubset('\u0300-\u036F'),
    'IsGreek': UnicodeSubset('\u0370-\u03FF'),
    'IsCyrillic': UnicodeSubset('\u0400-\u04FF'),
    'IsArmenian': UnicodeSubset('\u0530-\u058F'),
    'IsHebrew': UnicodeSubset('\u0590-\u05FF'),
    'IsArabic': UnicodeSubset('\u0600-\u06FF'),
    'IsSyriac': UnicodeSubset('\u0700-\u074F'),
    'IsThaana': UnicodeSubset('\u0780-\u07BF'),
    'IsDevanagari': UnicodeSubset('\u0900-\u097F'),
    'IsBengali': UnicodeSubset('\u0980-\u09FF'),
    'IsGurmukhi': UnicodeSubset('\u0A00-\u0A7F'),
    'IsGujarati': UnicodeSubset('\u0A80-\u0AFF'),
    'IsOriya': UnicodeSubset('\u0B00-\u0B7F'),
    'IsTamil': UnicodeSubset('\u0B80-\u0BFF'),
    'IsTelugu': UnicodeSubset('\u0C00-\u0C7F'),
    'IsKannada': UnicodeSubset('\u0C80-\u0CFF'),
    'IsMalayalam': UnicodeSubset('\u0D00-\u0D7F'),
    'IsSinhala': UnicodeSubset('\u0D80-\u0DFF'),
    'IsThai': UnicodeSubset('\u0E00-\u0E7F'),
    'IsLao': UnicodeSubset('\u0E80-\u0EFF'),
    'IsTibetan': UnicodeSubset('\u0F00-\u0FFF'),
    'IsMyanmar': UnicodeSubset('\u1000-\u109F'),
    'IsGeorgian': UnicodeSubset('\u10A0-\u10FF'),
    'IsHangulJamo': UnicodeSubset('\u1100-\u11FF'),
    'IsEthiopic': UnicodeSubset('\u1200-\u137F'),
    'IsCherokee': UnicodeSubset('\u13A0-\u13FF'),
    'IsUnifiedCanadianAboriginalSyllabics': UnicodeSubset('\u1400-\u167F'),
    'IsOgham': UnicodeSubset('\u1680-\u169F'),
    'IsRunic': UnicodeSubset('\u16A0-\u16FF'),
    'IsKhmer': UnicodeSubset('\u1780-\u17FF'),
    'IsMongolian': UnicodeSubset('\u1800-\u18AF'),
    'IsLatinExtendedAdditional': UnicodeSubset('\u1E00-\u1EFF'),
    'IsGreekExtended': UnicodeSubset('\u1F00-\u1FFF'),
    'IsGeneralPunctuation': UnicodeSubset('\u2000-\u206F'),
    'IsSuperscriptsandSubscripts': UnicodeSubset('\u2070-\u209F'),
    'IsCurrencySymbols': UnicodeSubset('\u20A0-\u20CF'),
    'IsCombiningMarksforSymbols': UnicodeSubset('\u20D0-\u20FF'),
    'IsLetterlikeSymbols': UnicodeSubset('\u2100-\u214F'),
    'IsNumberForms': UnicodeSubset('\u2150-\u218F'),
    'IsArrows': UnicodeSubset('\u2190-\u21FF'),
    'IsMathematicalOperators': UnicodeSubset('\u2200-\u22FF'),
    'IsMiscellaneousTechnical': UnicodeSubset('\u2300-\u23FF'),
    'IsControlPictures': UnicodeSubset('\u2400-\u243F'),
    'IsOpticalCharacterRecognition': UnicodeSubset('\u2440-\u245F'),
    'IsEnclosedAlphanumerics': UnicodeSubset('\u2460-\u24FF'),
    'IsBoxDrawing': UnicodeSubset('\u2500-\u257F'),
    'IsBlockElements': UnicodeSubset('\u2580-\u259F'),
    'IsGeometricShapes': UnicodeSubset('\u25A0-\u25FF'),
    'IsMiscellaneousSymbols': UnicodeSubset('\u2600-\u26FF'),
    'IsDingbats': UnicodeSubset('\u2700-\u27BF'),
    'IsBraillePatterns': UnicodeSubset('\u2800-\u28FF'),
    'IsCJKRadicalsSupplement': UnicodeSubset('\u2E80-\u2EFF'),
    'IsKangxiRadicals': UnicodeSubset('\u2F00-\u2FDF'),
    'IsIdeographicDescriptionCharacters': UnicodeSubset('\u2FF0-\u2FFF'),
    'IsCJKSymbolsandPunctuation': UnicodeSubset('\u3000-\u303F'),
    'IsHiragana': UnicodeSubset('\u3040-\u309F'),
    'IsKatakana': UnicodeSubset('\u30A0-\u30FF'),
    'IsBopomofo': UnicodeSubset('\u3100-\u312F'),
    'IsHangulCompatibilityJamo': UnicodeSubset('\u3130-\u318F'),
    'IsKanbun': UnicodeSubset('\u3190-\u319F'),
    'IsBopomofoExtended': UnicodeSubset('\u31A0-\u31BF'),
    'IsEnclosedCJKLettersandMonths': UnicodeSubset('\u3200-\u32FF'),
    'IsCJKCompatibility': UnicodeSubset('\u3300-\u33FF'),
    'IsCJKUnifiedIdeographsExtensionA': UnicodeSubset('\u3400-\u4DB5'),
    'IsCJKUnifiedIdeographs': UnicodeSubset('\u4E00-\u9FFF'),
    'IsYiSyllables': UnicodeSubset('\uA000-\uA48F'),
    'IsYiRadicals': UnicodeSubset('\uA490-\uA4CF'),
    'IsHangulSyllables': UnicodeSubset('\uAC00-\uD7A3'),
    'IsHighSurrogates': UnicodeSubset('\uD800-\uDB7F'),
    'IsHighPrivateUseSurrogates': UnicodeSubset('\uDB80-\uDBFF'),
    'IsLowSurrogates': UnicodeSubset('\uDC00-\uDFFF'),
    'IsPrivateUse': UnicodeSubset('\uE000-\uF8FF'),
    'IsCJKCompatibilityIdeographs': UnicodeSubset('\uF900-\uFAFF'),
    'IsAlphabeticPresentationForms': UnicodeSubset('\uFB00-\uFB4F'),
    'IsArabicPresentationForms-A': UnicodeSubset('\uFB50-\uFDFF'),
    'IsCombiningHalfMarks': UnicodeSubset('\uFE20-\uFE2F'),
    'IsCJKCompatibilityForms': UnicodeSubset('\uFE30-\uFE4F'),
    'IsSmallFormVariants': UnicodeSubset('\uFE50-\uFE6F'),
    'IsArabicPresentationForms-B': UnicodeSubset('\uFE70-\uFEFE'),
    'IsSpecials': UnicodeSubset('\uFEFF\uFFF0-\uFFFD'),
    'IsHalfwidthandFullwidthForms': UnicodeSubset('\uFF00-\uFFEF')
}

# Blocks over the Basic Multilingual Plane are defined only for UCS4 builds.
if maxunicode == UCS4_MAXUNICODE:
    UNICODE_BLOCKS['IsPrivateUse'].update('\U000F0000-\U0010FFFD')
    UNICODE_BLOCKS.update({
        'IsOldItalic': UnicodeSubset('\U00010300-\U0001032F'),
        'IsGothic': UnicodeSubset('\U00010330-\U0001034F'),
        'IsDeseret': UnicodeSubset('\U00010400-\U0001044F'),
        'IsByzantineMusicalSymbols': UnicodeSubset('\U0001D000-\U0001D0FF'),
        'IsMusicalSymbols': UnicodeSubset('\U0001D100-\U0001D1FF'),
        'IsMathematicalAlphanumericSymbols': UnicodeSubset('\U0001D400-\U0001D7FF'),
        'IsCJKUnifiedIdeographsExtensionB': UnicodeSubset('\U00020000-\U0002A6D6'),
        'IsCJKCompatibilityIdeographsSupplement': UnicodeSubset('\U0002F800-\U0002FA1F'),
        'IsTags': UnicodeSubset('\U000E0000-\U000E007F')
    })


def unicode_subset(name, block_safe=False):
    """
    Returns the Unicode subset associated to a block or to a category name.

    :param name: the name of a Unicode block, if it starts with 'Is', \
    otherwise the name of a Unicode category.
    :param block_safe: if `True` an unknown block name doesn't raise an error \
    and a subset that contains all the code points is returned instead.
    :return: a `UnicodeSubset` instance. For known names it's the instance \
    registered in the module maps, not a copy.
    :raises XMLSchemaRegexError: if the name doesn't match any block or category.
    """
    if name.startswith('Is'):
        try:
            return UNICODE_BLOCKS[name]
        except KeyError:
            if block_safe:
                # One half-open range that covers the whole code space.
                return UnicodeSubset.fromlist([(0, maxunicode + 1)])
            raise XMLSchemaRegexError("%r doesn't match to any Unicode block." % name)
    else:
        try:
            return UNICODE_CATEGORIES[name]
        except KeyError:
            raise XMLSchemaRegexError("%r doesn't match to any Unicode category." % name)