debian-xmlschema/xmlschema/codepoints.py

# -*- coding: utf-8 -*-
#
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato <brunato@sissa.it>
#
"""
This module defines Unicode character categories and blocks, defined as sets of code points.
"""
from __future__ import unicode_literals

import json
import os
from sys import maxunicode

from .compat import PY3, unicode_chr, string_base_type, Iterable, MutableSet
from .exceptions import XMLSchemaValueError, XMLSchemaTypeError, XMLSchemaRegexError

CHARACTER_GROUP_ESCAPED = set(map(ord, r'-|.^?*+{}()[]\\'))
"""Code Points of escaped chars in a character group."""

# Highest code point of the full Unicode range (U+10FFFF).
UCS4_MAXUNICODE = 0x10FFFF


def code_point_order(cp):
    """
    Sort key for ascending order: an integer code point sorts by its own value,
    a code point range sorts by its first element.
    """
    if isinstance(cp, int):
        return cp
    return cp[0]


def code_point_reverse_order(cp):
    """
    Sort key for descending order: an integer code point sorts by its own value,
    a code point range sorts by its second element minus one.
    """
    if isinstance(cp, int):
        return cp
    return cp[1] - 1


def iter_code_points(code_points, reverse=False):
    """
    Iterates a sequence of code points, grouping overlapping or contiguous
    items into ranges.

    :param code_points: an iterable with code points and code point ranges.
    :param reverse: if `True` the items are visited in descending order.
    :return: yields integers for isolated code points and `(start, stop)` \
    couples (stop excluded) for runs of two or more code points.
    """
    if reverse:
        ordered = sorted(code_points, key=code_point_reverse_order, reverse=True)
    else:
        ordered = sorted(code_points, key=code_point_order)

    low = high = None  # bounds of the run being accumulated
    for item in ordered:
        if isinstance(item, int):
            item = (item, item + 1)

        if low is not None:
            if reverse:
                # Descending visit: the run can only grow downwards
                if low <= item[1]:
                    low = min(low, item[0])
                    continue
            elif high >= item[0]:
                # Ascending visit: the run can only grow upwards
                high = max(high, item[1])
                continue

            # The item doesn't touch the pending run: emit the run
            yield (low, high) if high > low + 1 else low

        low, high = item

    if low is not None:
        yield (low, high) if high > low + 1 else low


def check_code_point(cp):
    """
    Checks a code point or code point range.

    :param cp: an integer code point or a couple of integers `(start, stop)`, \
    where the stop value is excluded from the range.
    :return: a valid code point range. An integer is returned as the couple \
    `(cp, cp + 1)`, a range is returned unchanged.
    :raises XMLSchemaValueError: if the argument is out of the interval \
    [0, sys.maxunicode], if the range is empty or reversed, or if the bounds \
    of the range are not integers.
    """
    if isinstance(cp, int):
        if not (0 <= cp <= maxunicode):
            raise XMLSchemaValueError("not a Unicode code point: %r" % cp)
        return cp, cp + 1
    else:
        if not (0 <= cp[0] < cp[1] <= maxunicode + 1) \
                or not isinstance(cp[0], int) or not isinstance(cp[1], int):
            # The argument is wrapped in a 1-tuple because a bare tuple would be
            # unpacked as multiple format arguments, raising a TypeError.
            raise XMLSchemaValueError("not a Unicode code point range: %r" % (cp,))
        return cp


def code_point_repr(cp):
    """
    Returns the string representation of a code point or of a code point range,
    as used inside a character group. Characters included in
    `CHARACTER_GROUP_ESCAPED` are prefixed with a backslash.

    :param cp: an integer or a tuple with at least two integers. Values must be in interval [0, sys.maxunicode].
    :return: a single character for an integer or for a range of one code point, \
    two characters for a range of two code points, otherwise the first and the \
    last character of the range joined by a hyphen.
    """
    if isinstance(cp, int):
        if cp in CHARACTER_GROUP_ESCAPED:
            return r'\%s' % unicode_chr(cp)
        return unicode_chr(cp)

    start_char = code_point_repr(cp[0])

    end_cp = cp[1] - 1  # Character ranges include the right bound
    if end_cp == cp[0]:
        # A range of one code point: don't repeat the same character twice
        return start_char

    end_char = code_point_repr(end_cp)
    if end_cp > cp[0] + 1:
        return '%s-%s' % (start_char, end_char)
    else:
        return start_char + end_char


def iterparse_character_group(s, expand_ranges=False):
    """
    Parse a regex character group part, generating a sequence of code points
    and code points ranges. An unescaped hyphen (-) that is not at the start
    or at the end is interpreted as range specifier.

    :param s: a string representing a character group part.
    :param expand_ranges: if set to `True` then expands character ranges, \
    yielding each code point of a range as an integer.
    :return: yields integers or couples of integers. In a couple the second \
    value is excluded from the range.
    :raises XMLSchemaRegexError: for an unescaped square bracket in a string \
    longer than one character or for a bad character range.
    """
    escaped = False
    on_range = False
    char = None
    length = len(s)
    string_iter = iter(range(length))
    for k in string_iter:
        if k == 0:
            char = s[0]
            if char == '\\':
                escaped = True
            elif char in r'[]' and length > 1:
                raise XMLSchemaRegexError("bad character %r at position 0" % char)
            elif expand_ranges or length <= 2 or s[1] != '-':
                # When not expanding, the start of a range is yielded together
                # with its end, as a couple, by the hyphen branch.
                yield ord(char)
        elif s[k] == '-':
            if escaped or (k == length - 1):
                char = s[k]
                yield ord(char)
                escaped = False
            elif on_range:
                # A hyphen that follows a complete range is a literal
                char = s[k]
                yield ord(char)
                on_range = False
            else:
                # Parse character range
                on_range = True
                try:
                    k = next(string_iter)
                    end_char = s[k]
                    if end_char == '\\' and (k < length - 1):
                        if s[k + 1] in r'-|.^?*+{}()[]':
                            k = next(string_iter)
                            end_char = s[k]
                        elif s[k + 1] in r'sSdDiIcCwWpP':
                            msg = "bad character range '%s-\\%s' at position %d: %r" % (char, s[k + 1], k - 2, s)
                            raise XMLSchemaRegexError(msg)
                except StopIteration:
                    msg = "bad character range '%s-%s' at position %d: %r" % (char, s[-1], k - 2, s)
                    raise XMLSchemaRegexError(msg)

                if ord(char) > ord(end_char):
                    msg = "bad character range '%s-%s' at position %d: %r" % (char, end_char, k - 2, s)
                    raise XMLSchemaRegexError(msg)
                elif expand_ranges:
                    # The start of the range has been already yielded
                    for cp in range(ord(char) + 1, ord(end_char) + 1):
                        yield cp
                else:
                    yield ord(char), ord(end_char) + 1
        elif s[k] in r'|.^?*+{}()':
            if escaped:
                escaped = False
            on_range = False
            char = s[k]
            yield ord(char)
        elif s[k] in r'[]':
            if not escaped and length > 1:
                raise XMLSchemaRegexError("bad character %r at position %d" % (s[k], k))
            escaped = on_range = False
            char = s[k]
            # When expanding, the start of a range has to be yielded here because
            # the expansion in the hyphen branch begins from the next code point.
            if expand_ranges or k >= length - 2 or s[k + 1] != '-':
                yield ord(char)
        elif s[k] == '\\':
            if escaped:
                escaped = on_range = False
                char = '\\'
                yield ord(char)
            else:
                escaped = True
        else:
            if escaped:
                escaped = False
                yield ord('\\')
            on_range = False
            char = s[k]
            # When expanding, the start of a range has to be yielded here because
            # the expansion in the hyphen branch begins from the next code point.
            if expand_ranges or k >= length - 2 or s[k + 1] != '-':
                yield ord(char)
    if escaped:
        yield ord('\\')


class UnicodeSubset(MutableSet):
    """
    Represent a subset of Unicode code points, implemented with an ordered list of integer values
    and ranges. It manages character ranges for adding or for discarding elements from a string
    and for a compressed representation.

    An item of the internal list is an integer, for a single code point, or a couple
    `(start, stop)` for a range, where the stop value is excluded.
    """

    def __init__(self, *args, **kwargs):
        """
        :param args: at most one argument, that can be another `UnicodeSubset`, \
        a string representing a character group part or an iterable of code points \
        and code point ranges.
        :raises XMLSchemaTypeError: if more than one positional argument or any \
        keyword argument is provided.
        """
        if len(args) > 1:
            raise XMLSchemaTypeError(
                '%s expected at most 1 arguments, got %d' % (self.__class__.__name__, len(args))
            )
        if kwargs:
            raise XMLSchemaTypeError(
                '%s does not take keyword arguments' % self.__class__.__name__
            )

        if not args:
            self._code_points = list()
        elif isinstance(args[0], UnicodeSubset):
            # Shallow copy built with list(): list.copy() is missing in Python 2
            self._code_points = list(args[0].code_points)
        else:
            self._code_points = list()
            self.update(args[0])

    @classmethod
    def fromlist(cls, code_points):
        """
        Alternative constructor that stores the provided items sorted by their
        lower bound, without checking or merging them.
        """
        subset = cls()
        subset._code_points = sorted(code_points, key=code_point_order)
        return subset

    @property
    def code_points(self):
        """The internal list of code points and code point ranges (not a copy)."""
        return self._code_points

    def __repr__(self):
        return "<%s %r at %d>" % (self.__class__.__name__, str(self._code_points), id(self))

    def __str__(self):
        # Python 2 variant, replaced below when PY3 is true
        return unicode(self).encode("utf-8")

    def __unicode__(self):
        return ''.join(code_point_repr(cp) for cp in self._code_points)

    if PY3:
        __str__ = __unicode__

    def copy(self):
        return self.__copy__()

    def __copy__(self):
        return UnicodeSubset(self._code_points)

    def __reversed__(self):
        """Yields the code points of the subset in descending order."""
        for item in reversed(self._code_points):
            if isinstance(item, int):
                yield item
            else:
                for cp in reversed(range(item[0], item[1])):
                    yield cp

    def complement(self):
        """
        Yields the code points and the code point ranges that are not in the subset.
        Gaps of one or two code points are yielded as integers, wider gaps as ranges.

        :raises XMLSchemaValueError: if the items of the internal list overlap \
        or are unordered.
        """
        last_cp = 0
        for cp in self._code_points:
            if last_cp > maxunicode:
                break
            elif isinstance(cp, int):
                cp = cp, cp + 1

            diff = cp[0] - last_cp
            if diff > 2:
                yield last_cp, cp[0]
            elif diff == 2:
                yield last_cp
                yield last_cp + 1
            elif diff == 1:
                yield last_cp
            elif diff != 0:
                raise XMLSchemaValueError("instance code points unordered")
            last_cp = cp[1]

        if last_cp < maxunicode:
            yield last_cp, maxunicode + 1
        elif last_cp == maxunicode:
            yield maxunicode

    def iter_characters(self):
        """Returns an iterable with the characters of the subset."""
        # unicode_chr is used, like in the rest of the module, instead of chr()
        return map(unicode_chr, self.__iter__())

    #
    # MutableSet's abstract methods implementation
    def __contains__(self, value):
        if not isinstance(value, int):
            try:
                value = ord(value)
            except TypeError:
                raise XMLSchemaTypeError("%r: argument must be a code point or a character." % value)

        # The scan stops at the first item that starts after the value
        for cp in self._code_points:
            if not isinstance(cp, int):
                if cp[0] > value:
                    return False
                elif cp[1] <= value:
                    continue
                else:
                    return True
            elif cp > value:
                return False
            elif cp == value:
                return True
        return False

    def __iter__(self):
        for cp in self._code_points:
            if isinstance(cp, int):
                yield cp
            else:
                for k in range(*cp):
                    yield k

    def __len__(self):
        # Sums the size of each item instead of enumerating every code point
        k = 0
        for cp in self._code_points:
            if isinstance(cp, int):
                k += 1
            else:
                k += max(0, cp[1] - cp[0])
        return k

    def update(self, *others):
        """
        Adds to the subset the code points of each argument, that can be a string
        representing a character group part or an iterable of code points and ranges.
        """
        for value in others:
            if isinstance(value, string_base_type):
                for cp in iter_code_points(iterparse_character_group(value), reverse=True):
                    self.add(cp)
            else:
                for cp in iter_code_points(value, reverse=True):
                    self.add(cp)

    def add(self, value):
        """
        Adds a code point or a code point range to the subset.

        :param value: an integer or a couple of integers, checked with `check_code_point`.
        """
        start_value, end_value = check_code_point(value)
        code_points = self._code_points
        last_index = len(code_points) - 1
        for k, cp in enumerate(code_points):
            if isinstance(cp, int):
                cp = cp, cp + 1

            if end_value < cp[0]:
                # The value ends before this item: insert it here
                code_points.insert(k, value if isinstance(value, int) else tuple(value))
            elif start_value > cp[1]:
                # The value starts after this item: go on with the next one
                continue
            elif end_value > cp[1]:
                # The value extends this item to the right
                if k == last_index:
                    code_points[k] = min(cp[0], start_value), end_value
                else:
                    next_cp = code_points[k + 1]
                    higher_bound = next_cp if isinstance(next_cp, int) else next_cp[0]
                    if end_value <= higher_bound:
                        code_points[k] = min(cp[0], start_value), end_value
                    else:
                        # Extend up to the next item and process the remainder
                        code_points[k] = min(cp[0], start_value), higher_bound
                        start_value = higher_bound
                        continue
            elif start_value < cp[0]:
                # The value extends this item to the left
                code_points[k] = start_value, cp[1]
            break
        else:
            self._code_points.append(tuple(value) if isinstance(value, list) else value)

    def difference_update(self, *others):
        """
        Removes from the subset the code points of each argument, that can be a string
        representing a character group part or an iterable of code points and ranges.
        """
        for value in others:
            if isinstance(value, string_base_type):
                for cp in iter_code_points(iterparse_character_group(value), reverse=True):
                    self.discard(cp)
            else:
                for cp in iter_code_points(value, reverse=True):
                    self.discard(cp)

    def discard(self, value):
        """
        Removes a code point or a code point range from the subset, if present.

        :param value: an integer or a couple of integers, checked with `check_code_point`.
        """
        start_cp, end_cp = check_code_point(value)
        code_points = self._code_points
        # Backward scan, so deletions and insertions don't shift the items to be visited
        for k in reversed(range(len(code_points))):
            cp = code_points[k]
            if isinstance(cp, int):
                cp = cp, cp + 1

            if start_cp >= cp[1]:
                # This item and the previous ones end before the value
                break
            elif end_cp >= cp[1]:
                # The value covers the right part of the item, or the whole item
                if start_cp <= cp[0]:
                    del code_points[k]
                elif start_cp - cp[0] > 1:
                    code_points[k] = cp[0], start_cp
                else:
                    code_points[k] = cp[0]
            elif end_cp > cp[0]:
                if start_cp <= cp[0]:
                    # The value covers the left part of the item
                    if cp[1] - end_cp > 1:
                        code_points[k] = end_cp, cp[1]
                    else:
                        code_points[k] = cp[1] - 1
                else:
                    # The value is inside the item, that is split in two parts
                    if cp[1] - end_cp > 1:
                        code_points.insert(k + 1, (end_cp, cp[1]))
                    else:
                        code_points.insert(k + 1, cp[1] - 1)
                    if start_cp - cp[0] > 1:
                        code_points[k] = cp[0], start_cp
                    else:
                        code_points[k] = cp[0]

    #
    # MutableSet's mixin methods override
    def clear(self):
        del self._code_points[:]

    def __eq__(self, other):
        # The comparison is done on the internal lists, not on the sets of code points
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            return self._code_points == other._code_points
        else:
            return self._code_points == other

    def __ior__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            other = reversed(other._code_points)
        else:
            other = iter_code_points(other, reverse=True)

        for cp in other:
            self.add(cp)
        return self

    def __isub__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            other = reversed(other._code_points)
        else:
            other = iter_code_points(other, reverse=True)

        for cp in other:
            self.discard(cp)
        return self

    def __sub__(self, other):
        obj = self.copy()
        return obj.__isub__(other)

    def __rsub__(self, other):
        # Reflected operand: the result is `other - self`, not `self - other`
        if not isinstance(other, Iterable):
            return NotImplemented
        obj = UnicodeSubset(other)
        return obj.__isub__(self)

    def __iand__(self, other):
        # Discards from the subset the code points that are not in the other operand
        for value in (self - other):
            self.discard(value)
        return self

    def __ixor__(self, other):
        if other is self:
            self.clear()
            return self
        elif not isinstance(other, Iterable):
            return NotImplemented
        elif not isinstance(other, UnicodeSubset):
            other = UnicodeSubset(other)

        for value in other:
            if value in self:
                self.discard(value)
            else:
                self.add(value)
        return self


def get_unicodedata_categories():
    """
    Builds the Unicode categories data scanning all the code points with
    `unicodedata.category()`. Each run of consecutive code points of the same
    category is stored as an integer, if it has only one element, or as a
    `(start, stop)` couple, with the stop value excluded. Each run is registered
    both under its two-letter category and under the one-letter major category.

    :return: a dictionary with category names as keys and lists as values.
    """
    from unicodedata import category

    names = (
        'C', 'Cc', 'Cf', 'Cs', 'Co', 'Cn',
        'L', 'Lu', 'Ll', 'Lt', 'Lm', 'Lo',
        'M', 'Mn', 'Mc', 'Me',
        'N', 'Nd', 'Nl', 'No',
        'P', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po',
        'S', 'Sm', 'Sc', 'Sk', 'So',
        'Z', 'Zs', 'Zl', 'Zp'
    )
    categories = {name: [] for name in names}

    def register(minor, first, stop):
        # Stores the run [first, stop) in the minor and in the major category
        item = (first, stop) if stop - first > 1 else first
        categories[minor].append(item)
        categories[minor[0]].append(item)

    current = 'Cc'
    run_start = 0
    for cp in range(maxunicode + 1):
        found = category(unicode_chr(cp))
        if found != current:
            register(current, run_start, cp)
            current, run_start = found, cp

    # The last run is closed by the end of the code points space
    register(current, run_start, maxunicode + 1)
    return categories


def save_unicode_categories(filename=None):
    """
    Dumps to a JSON file the Unicode categories data built by
    `get_unicodedata_categories()`.

    :param filename: the path of the JSON file to write. If it's `None` the file \
    'unicode_categories.json' in the directory of this module is used.
    """
    target = filename
    if target is None:
        module_dir = os.path.dirname(__file__)
        target = os.path.join(module_dir, 'unicode_categories.json')

    print("Saving Unicode categories to %r" % target)
    with open(target, 'w') as fp:
        # The data is built after the file has been opened for writing
        json.dump(get_unicodedata_categories(), fp)


def build_unicode_categories(filename=None):
    """
    Builds the Unicode categories as `UnicodeSubset` instances. For a fast building a pre-built
    JSON file with Unicode categories data can be used. If the JSON file is missing, is not
    accessible, doesn't contain a dictionary or contains an empty category, the categories
    data is rebuilt using `unicodedata.category()` API.

    :param filename: the name of the JSON file to load for a fast building of the categories. \
    If not provided the predefined filename  'unicode_categories.json' is used.
    :return: a dictionary that associates Unicode category names with `UnicodeSubset` instances.
    """
    if maxunicode < UCS4_MAXUNICODE:
        categories = get_unicodedata_categories()  # for Python 2.7
    else:
        if filename is None:
            filename = os.path.join(os.path.dirname(__file__), 'unicode_categories.json')
        try:
            with open(filename, 'r') as fp:
                categories = json.load(fp)
        except (IOError, SystemError, ValueError):
            categories = get_unicodedata_categories()
        else:
            # The check is on the loaded lists: iterating the dictionary itself
            # yields the category names, that are never empty.
            if not isinstance(categories, dict) or any(not v for v in categories.values()):
                categories = get_unicodedata_categories()

    return {k: UnicodeSubset.fromlist(v) for k, v in categories.items()}


# Built at import time: a dictionary that maps category names to UnicodeSubset instances.
UNICODE_CATEGORIES = build_unicode_categories()


# Unicode blocks, keyed by block name with the 'Is' prefix. Each value is a
# UnicodeSubset built from a character group string.
UNICODE_BLOCKS = {
    'IsBasicLatin': UnicodeSubset('\u0000-\u007F'),
    'IsLatin-1Supplement': UnicodeSubset('\u0080-\u00FF'),
    'IsLatinExtended-A': UnicodeSubset('\u0100-\u017F'),
    'IsLatinExtended-B': UnicodeSubset('\u0180-\u024F'),
    'IsIPAExtensions': UnicodeSubset('\u0250-\u02AF'),
    'IsSpacingModifierLetters': UnicodeSubset('\u02B0-\u02FF'),
    'IsCombiningDiacriticalMarks': UnicodeSubset('\u0300-\u036F'),
    'IsGreek': UnicodeSubset('\u0370-\u03FF'),
    'IsCyrillic': UnicodeSubset('\u0400-\u04FF'),
    'IsArmenian': UnicodeSubset('\u0530-\u058F'),
    'IsHebrew': UnicodeSubset('\u0590-\u05FF'),
    'IsArabic': UnicodeSubset('\u0600-\u06FF'),
    'IsSyriac': UnicodeSubset('\u0700-\u074F'),
    'IsThaana': UnicodeSubset('\u0780-\u07BF'),
    'IsDevanagari': UnicodeSubset('\u0900-\u097F'),
    'IsBengali': UnicodeSubset('\u0980-\u09FF'),
    'IsGurmukhi': UnicodeSubset('\u0A00-\u0A7F'),
    'IsGujarati': UnicodeSubset('\u0A80-\u0AFF'),
    'IsOriya': UnicodeSubset('\u0B00-\u0B7F'),
    'IsTamil': UnicodeSubset('\u0B80-\u0BFF'),
    'IsTelugu': UnicodeSubset('\u0C00-\u0C7F'),
    'IsKannada': UnicodeSubset('\u0C80-\u0CFF'),
    'IsMalayalam': UnicodeSubset('\u0D00-\u0D7F'),
    'IsSinhala': UnicodeSubset('\u0D80-\u0DFF'),
    'IsThai': UnicodeSubset('\u0E00-\u0E7F'),
    'IsLao': UnicodeSubset('\u0E80-\u0EFF'),
    'IsTibetan': UnicodeSubset('\u0F00-\u0FFF'),
    'IsMyanmar': UnicodeSubset('\u1000-\u109F'),
    'IsGeorgian': UnicodeSubset('\u10A0-\u10FF'),
    'IsHangulJamo': UnicodeSubset('\u1100-\u11FF'),
    'IsEthiopic': UnicodeSubset('\u1200-\u137F'),
    'IsCherokee': UnicodeSubset('\u13A0-\u13FF'),
    'IsUnifiedCanadianAboriginalSyllabics': UnicodeSubset('\u1400-\u167F'),
    'IsOgham': UnicodeSubset('\u1680-\u169F'),
    'IsRunic': UnicodeSubset('\u16A0-\u16FF'),
    'IsKhmer': UnicodeSubset('\u1780-\u17FF'),
    'IsMongolian': UnicodeSubset('\u1800-\u18AF'),
    'IsLatinExtendedAdditional': UnicodeSubset('\u1E00-\u1EFF'),
    'IsGreekExtended': UnicodeSubset('\u1F00-\u1FFF'),
    'IsGeneralPunctuation': UnicodeSubset('\u2000-\u206F'),
    'IsSuperscriptsandSubscripts': UnicodeSubset('\u2070-\u209F'),
    'IsCurrencySymbols': UnicodeSubset('\u20A0-\u20CF'),
    'IsCombiningMarksforSymbols': UnicodeSubset('\u20D0-\u20FF'),
    'IsLetterlikeSymbols': UnicodeSubset('\u2100-\u214F'),
    'IsNumberForms': UnicodeSubset('\u2150-\u218F'),
    'IsArrows': UnicodeSubset('\u2190-\u21FF'),
    'IsMathematicalOperators': UnicodeSubset('\u2200-\u22FF'),
    'IsMiscellaneousTechnical': UnicodeSubset('\u2300-\u23FF'),
    'IsControlPictures': UnicodeSubset('\u2400-\u243F'),
    'IsOpticalCharacterRecognition': UnicodeSubset('\u2440-\u245F'),
    'IsEnclosedAlphanumerics': UnicodeSubset('\u2460-\u24FF'),
    'IsBoxDrawing': UnicodeSubset('\u2500-\u257F'),
    'IsBlockElements': UnicodeSubset('\u2580-\u259F'),
    'IsGeometricShapes': UnicodeSubset('\u25A0-\u25FF'),
    'IsMiscellaneousSymbols': UnicodeSubset('\u2600-\u26FF'),
    'IsDingbats': UnicodeSubset('\u2700-\u27BF'),
    'IsBraillePatterns': UnicodeSubset('\u2800-\u28FF'),
    'IsCJKRadicalsSupplement': UnicodeSubset('\u2E80-\u2EFF'),
    'IsKangxiRadicals': UnicodeSubset('\u2F00-\u2FDF'),
    'IsIdeographicDescriptionCharacters': UnicodeSubset('\u2FF0-\u2FFF'),
    'IsCJKSymbolsandPunctuation': UnicodeSubset('\u3000-\u303F'),
    'IsHiragana': UnicodeSubset('\u3040-\u309F'),
    'IsKatakana': UnicodeSubset('\u30A0-\u30FF'),
    'IsBopomofo': UnicodeSubset('\u3100-\u312F'),
    'IsHangulCompatibilityJamo': UnicodeSubset('\u3130-\u318F'),
    'IsKanbun': UnicodeSubset('\u3190-\u319F'),
    'IsBopomofoExtended': UnicodeSubset('\u31A0-\u31BF'),
    'IsEnclosedCJKLettersandMonths': UnicodeSubset('\u3200-\u32FF'),
    'IsCJKCompatibility': UnicodeSubset('\u3300-\u33FF'),
    'IsCJKUnifiedIdeographsExtensionA': UnicodeSubset('\u3400-\u4DB5'),
    'IsCJKUnifiedIdeographs': UnicodeSubset('\u4E00-\u9FFF'),
    'IsYiSyllables': UnicodeSubset('\uA000-\uA48F'),
    'IsYiRadicals': UnicodeSubset('\uA490-\uA4CF'),
    'IsHangulSyllables': UnicodeSubset('\uAC00-\uD7A3'),
    'IsHighSurrogates': UnicodeSubset('\uD800-\uDB7F'),
    'IsHighPrivateUseSurrogates': UnicodeSubset('\uDB80-\uDBFF'),
    'IsLowSurrogates': UnicodeSubset('\uDC00-\uDFFF'),
    'IsPrivateUse': UnicodeSubset('\uE000-\uF8FF'),
    'IsCJKCompatibilityIdeographs': UnicodeSubset('\uF900-\uFAFF'),
    'IsAlphabeticPresentationForms': UnicodeSubset('\uFB00-\uFB4F'),
    'IsArabicPresentationForms-A': UnicodeSubset('\uFB50-\uFDFF'),
    'IsCombiningHalfMarks': UnicodeSubset('\uFE20-\uFE2F'),
    'IsCJKCompatibilityForms': UnicodeSubset('\uFE30-\uFE4F'),
    'IsSmallFormVariants': UnicodeSubset('\uFE50-\uFE6F'),
    'IsArabicPresentationForms-B': UnicodeSubset('\uFE70-\uFEFE'),
    'IsSpecials': UnicodeSubset('\uFEFF\uFFF0-\uFFFD'),
    'IsHalfwidthandFullwidthForms': UnicodeSubset('\uFF00-\uFFEF')
}

# The following blocks lie above U+FFFF, so they are defined only when the
# interpreter's maxunicode covers the full Unicode range.
if maxunicode == UCS4_MAXUNICODE:
    UNICODE_BLOCKS['IsPrivateUse'].update('\U000F0000-\U0010FFFD')
    UNICODE_BLOCKS.update({
        'IsOldItalic': UnicodeSubset('\U00010300-\U0001032F'),
        'IsGothic': UnicodeSubset('\U00010330-\U0001034F'),
        'IsDeseret': UnicodeSubset('\U00010400-\U0001044F'),
        'IsByzantineMusicalSymbols': UnicodeSubset('\U0001D000-\U0001D0FF'),
        'IsMusicalSymbols': UnicodeSubset('\U0001D100-\U0001D1FF'),
        'IsMathematicalAlphanumericSymbols': UnicodeSubset('\U0001D400-\U0001D7FF'),
        'IsCJKUnifiedIdeographsExtensionB': UnicodeSubset('\U00020000-\U0002A6D6'),
        'IsCJKCompatibilityIdeographsSupplement': UnicodeSubset('\U0002F800-\U0002FA1F'),
        'IsTags': UnicodeSubset('\U000E0000-\U000E007F')
    })


def unicode_subset(name, block_safe=False):
    """
    Returns the Unicode subset associated with a block name or a category name.

    :param name: the name of a Unicode block, if it starts with 'Is', otherwise \
    the name of a Unicode category.
    :param block_safe: if `True` an unknown block name doesn't raise an error \
    and a subset that includes all the code points is returned instead.
    :return: a `UnicodeSubset` instance. For a known name it's the instance stored \
    in `UNICODE_BLOCKS` or in `UNICODE_CATEGORIES`, not a copy.
    :raises XMLSchemaRegexError: if the name doesn't match any block or category.
    """
    if name.startswith('Is'):
        try:
            return UNICODE_BLOCKS[name]
        except KeyError:
            if block_safe:
                # A single range item that covers the whole code points space:
                # the list [0, maxunicode] would contain only two code points.
                return UnicodeSubset.fromlist([(0, maxunicode + 1)])
            raise XMLSchemaRegexError("%r doesn't match to any Unicode block." % name)
    else:
        try:
            return UNICODE_CATEGORIES[name]
        except KeyError:
            raise XMLSchemaRegexError("%r doesn't match to any Unicode category." % name)