diff --git a/doc/pratt_api.rst b/doc/pratt_api.rst
index e218260..d496792 100644
--- a/doc/pratt_api.rst
+++ b/doc/pratt_api.rst
@@ -42,7 +42,6 @@ Parser base class
 
     Parsing methods:
 
-    .. automethod:: build_tokenizer
     .. automethod:: parse
     .. automethod:: advance
     .. automethod:: raw_advance
@@ -54,10 +53,11 @@ Parser base class
     .. automethod:: is_line_start
     .. automethod:: is_spaced
 
-    Helper methods for building effective parser classes:
+    Helper methods for building new parsers:
 
     .. automethod:: register
     .. automethod:: unregister
+    .. automethod:: duplicate
     .. automethod:: literal
     .. automethod:: nullary
     .. automethod:: prefix
@@ -65,3 +65,5 @@ Parser base class
     .. automethod:: infix
     .. automethod:: infixr
     .. automethod:: method
+    .. automethod:: build
+    .. automethod:: create_tokenizer
diff --git a/elementpath/schema_proxy.py b/elementpath/schema_proxy.py
index 6be48ab..b01cc24 100644
--- a/elementpath/schema_proxy.py
+++ b/elementpath/schema_proxy.py
@@ -144,6 +144,21 @@ class AbstractSchemaProxy(object):
         self._schema = schema
         self._base_element = base_element
 
+    def bind_parser(self, parser):
+        """
+        Binds a parser instance to the schema proxy, adding constructors for the schema's
+        atomic types. This method can be redefined in a concrete proxy to optimize schema bindings.
+
+        :param parser: a parser instance.
+        """
+        if parser.schema is not self:
+            parser.schema = self
+
+        parser.symbol_table = parser.__class__.symbol_table.copy()
+        for xsd_type in self.iter_atomic_types():
+            parser.schema_constructor(xsd_type.name)
+        parser.tokenizer = parser.create_tokenizer(parser.symbol_table)
+
     def get_context(self):
         """
         Get a context instance for static analysis phase.
@@ -152,6 +167,15 @@ class AbstractSchemaProxy(object):
         """
         return XPathSchemaContext(root=self._schema, item=self._base_element)
 
+    def find(self, path, namespaces=None):
+        """
+        Find a schema element or attribute using an XPath expression.
+
+        :param path: an XPath expression that selects an element or an attribute node.
+        :param namespaces: an optional mapping from namespace prefixes to namespace URIs.
+        :return: the first matching schema component, or ``None`` if there is no match.
+        """
+
     @abstractmethod
     def get_type(self, qname):
         """
@@ -185,12 +209,6 @@ class AbstractSchemaProxy(object):
         :returns: an object that represents an XSD element or `None`.
         """
 
-    # TODO: can make this as @abstractmethod from release v1.3.1
-    def find(self, path, namespaces=None):
-        """
-        Find the schema component using an XPath expression.
-        """
-
     @abstractmethod
     def get_substitution_group(self, qname):
         """
diff --git a/elementpath/tdop_parser.py b/elementpath/tdop_parser.py
index 6f62a95..c41fd50 100644
--- a/elementpath/tdop_parser.py
+++ b/elementpath/tdop_parser.py
@@ -20,8 +20,6 @@ from abc import ABCMeta
 from .compat import PY3, add_metaclass, MutableSequence
 from .exceptions import ElementPathSyntaxError, ElementPathNameError, ElementPathValueError, ElementPathTypeError
 
-###
-# Regex based tokenizer
 SPECIAL_SYMBOL_PATTERN = re.compile(r'\(\w+\)')
 """Compiled regular expression for matching special symbols, that are names between round brackets."""
 
@@ -50,49 +48,6 @@ def symbol_to_identifier(symbol):
 
     return ''.join(get_id_name(c) for c in symbol)
 
 
-def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
-    """
-    Returns a regex based tokenizer built from a symbol table of token classes.
-    The returned tokenizer skips extra spaces between symbols.
-
-    A regular expression is created from the symbol table of the parser using a template.
-    The symbols are inserted in the template putting the longer symbols first. Symbols and
-    their patterns can't contain spaces.
-
-    :param symbol_table: a dictionary containing the token classes of the formal language.
-    :param name_pattern: pattern to use to match names.
-    """
-    tokenizer_pattern_template = r"""
-        ('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) |  # Literals (string and numbers)
-        (%s|[%s]) |                                                      # Symbol's patterns
-        (%s) |                                                           # Names
-        (\S) |                                                           # Unexpected characters
-        \s+                                                              # Skip extra spaces
-    """
-    patterns = [
-        t.pattern for s, t in symbol_table.items()
-        if SPECIAL_SYMBOL_PATTERN.match(s) is None
-    ]
-    string_patterns = []
-    character_patterns = []
-
-    for p in patterns:
-        if ' ' in p:
-            raise ElementPathValueError('pattern %r contains spaces' % p)
-        length = len(p)
-        if length == 1 or length == 2 and p[0] == '\\':
-            character_patterns.append(p)
-        else:
-            string_patterns.append(p)
-
-    pattern = tokenizer_pattern_template % (
-        '|'.join(sorted(string_patterns, key=lambda x: -len(x))),
-        ''.join(character_patterns),
-        name_pattern
-    )
-    return re.compile(pattern, re.VERBOSE)
-
-
 #
 # Simple top down parser based on Vaughan Pratt's algorithm (Top Down Operator Precedence).
 #
@@ -358,21 +313,9 @@ class Parser(object):
     tokenizer = None
     symbol_table = {}
 
-    @classmethod
-    def build_tokenizer(cls, name_pattern='[A-Za-z0-9_]+'):
-        """
-        Builds the parser class tokenizer using the symbol related patterns.
-
-        :param name_pattern: Pattern to use to match names.
-        """
-        if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
-            unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
-            raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
-        cls.tokenizer = create_tokenizer(cls.symbol_table, name_pattern)
-
     def __init__(self):
         if self.tokenizer is None:
-            raise ValueError("Incomplete parser class %s registration." % self.__class__.__name__)
+            raise ValueError("The parser %r is not built!" % self.__class__)
         self.token = None
         self.match = None
         self.next_token = None
@@ -730,3 +673,59 @@ class Parser(object):
             setattr(token_class, func.__name__, func)
             return func
         return bind
+
+    @classmethod
+    def build(cls):
+        """
+        Builds the parser class: checks that all declared symbols are defined
+        and builds the regex tokenizer using the symbol-related patterns.
+        """
+        if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
+            unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
+            raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
+        cls.tokenizer = cls.create_tokenizer(cls.symbol_table)
+
+    build_tokenizer = build  # For backward compatibility
+
+    @staticmethod
+    def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
+        """
+        Returns a regex based tokenizer built from a symbol table of token classes.
+        The returned tokenizer skips extra spaces between symbols.
+
+        A regular expression is created from the symbol table of the parser using a template.
+        The symbols are inserted in the template with the longer symbols first. Symbols and
+        their patterns can't contain spaces.
+
+        :param symbol_table: a dictionary containing the token classes of the formal language.
+        :param name_pattern: the pattern to use for matching names.
+        """
+        tokenizer_pattern_template = r"""
+            ('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) |  # Literals (string and numbers)
+            (%s|[%s]) |                                                      # Symbol's patterns
+            (%s) |                                                           # Names
+            (\S) |                                                           # Unexpected characters
+            \s+                                                              # Skip extra spaces
+        """
+        patterns = [
+            t.pattern for s, t in symbol_table.items()
+            if SPECIAL_SYMBOL_PATTERN.match(s) is None
+        ]
+        string_patterns = []
+        character_patterns = []
+
+        for p in patterns:
+            if ' ' in p:
+                raise ElementPathValueError('pattern %r contains spaces' % p)
+            length = len(p)
+            if length == 1 or length == 2 and p[0] == '\\':
+                character_patterns.append(p)
+            else:
+                string_patterns.append(p)
+
+        pattern = tokenizer_pattern_template % (
+            '|'.join(sorted(string_patterns, key=lambda x: -len(x))),
+            ''.join(character_patterns),
+            name_pattern
+        )
+        return re.compile(pattern, re.VERBOSE)
diff --git a/elementpath/xpath1_parser.py b/elementpath/xpath1_parser.py
index 9b1658c..aba0016 100644
--- a/elementpath/xpath1_parser.py
+++ b/elementpath/xpath1_parser.py
@@ -26,6 +26,7 @@ from .xpath_nodes import AttributeNode, NamespaceNode, TypedAttribute, TypedElem
     is_etree_element, is_xpath_node, is_element_node, is_document_node, is_attribute_node, \
     is_text_node, is_comment_node, is_processing_instruction_node, node_name
 
+
 XML_NAME_CHARACTER = (u"A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF"
                       u"\u200C\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD")
 XML_NCNAME_PATTERN = u"[{0}][-.0-9\u00B7\u0300-\u036F\u203F-\u2040{0}]*".format(XML_NAME_CHARACTER)
@@ -93,9 +94,9 @@ class XPath1Parser(Parser):
         self.variables = dict(variables if variables is not None else [])
         self.strict = strict
 
-    @classmethod
-    def build_tokenizer(cls, name_pattern=XML_NCNAME_PATTERN):
-        super(XPath1Parser, cls).build_tokenizer(name_pattern)
+    @staticmethod
+    def create_tokenizer(symbol_table, name_pattern=XML_NCNAME_PATTERN):
+        return Parser.create_tokenizer(symbol_table, name_pattern)
 
     @property
     def version(self):
@@ -1282,4 +1283,4 @@ def evaluate(self, context=None):
 
 
 register('(end)')
-XPath1Parser.build_tokenizer()
+XPath1Parser.build()
diff --git a/elementpath/xpath2_constructors.py b/elementpath/xpath2_constructors.py
index f017e9c..87eccd6 100644
--- a/elementpath/xpath2_constructors.py
+++ b/elementpath/xpath2_constructors.py
@@ -601,4 +601,4 @@ def evaluate(self, context=None):
     return DateTime10(dt.year, dt.month, dt.day, tm.hour, tm.minute, tm.second, tm.microsecond, tzinfo)
 
 
-XPath2Parser.build_tokenizer()  # XPath 2.0 definitions completed, build the tokenizer.
+XPath2Parser.build()  # XPath 2.0 definitions completed, build the parser class.
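
Note on the tokenizer construction above: `create_tokenizer` interpolates the
symbol patterns into a single verbose regex, alternating multi-character
symbols longest-first and collapsing single characters into one character
class. A self-contained sketch of the same construction, with a hypothetical
symbol list standing in for a real parser's symbol table:

    import re

    # Hypothetical symbol patterns, standing in for the `pattern` attributes
    # of the token classes in a real symbol table.
    symbol_patterns = [re.escape(s) for s in ('+', '-', '*', '<', '<=', 'div', 'and')]

    string_patterns = []     # multi-character symbols, alternated longest-first
    character_patterns = []  # single characters, collapsed into a character class
    for p in symbol_patterns:
        if len(p) == 1 or len(p) == 2 and p[0] == '\\':
            character_patterns.append(p)
        else:
            string_patterns.append(p)

    pattern = r"""
        ('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) |  # literals
        (%s|[%s]) |                                                      # symbols
        ([A-Za-z0-9_]+) |                                                # names
        (\S) |                                                           # unexpected characters
        \s+                                                              # skip extra spaces
    """ % ('|'.join(sorted(string_patterns, key=lambda x: -len(x))),
           ''.join(character_patterns))

    tokenizer = re.compile(pattern, re.VERBOSE)
    for match in tokenizer.finditer('10 div 2 <= price'):
        if match.group().strip():
            print(match.lastindex, match.group())  # group index and text: "1 10", "2 div", ...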
diff --git a/elementpath/xpath2_parser.py b/elementpath/xpath2_parser.py
index 8e3681a..4c63e25 100644
--- a/elementpath/xpath2_parser.py
+++ b/elementpath/xpath2_parser.py
@@ -26,8 +26,7 @@ from .namespaces import XSD_NAMESPACE, XPATH_FUNCTIONS_NAMESPACE, \
     qname_to_prefixed, prefixed_to_qname, XSD_UNTYPED_ATOMIC
 from .datatypes import UntypedAtomic, XSD_BUILTIN_TYPES
 from .xpath_nodes import is_xpath_node
-from .tdop_parser import create_tokenizer
-from .xpath1_parser import XML_NCNAME_PATTERN, XPath1Parser
+from .xpath1_parser import XPath1Parser
 from .xpath_context import XPathSchemaContext
 from .schema_proxy import AbstractSchemaProxy
 
@@ -167,19 +166,21 @@ class XPath2Parser(XPath1Parser):
         self.function_namespace = function_namespace
 
         if schema is None:
-            self.schema = None
+            pass
         elif not isinstance(schema, AbstractSchemaProxy):
-            raise ElementPathTypeError("schema argument must be a subclass or instance of AbstractSchemaProxy!")
+            raise ElementPathTypeError("argument 'schema' must be an instance of AbstractSchemaProxy")
         else:
-            self.schema = schema
-            self.symbol_table = self.symbol_table.copy()
-            for xsd_type in self.schema.iter_atomic_types():
-                self.schema_constructor(xsd_type.name)
-            self.tokenizer = create_tokenizer(self.symbol_table, XML_NCNAME_PATTERN)
+            schema.bind_parser(self)
 
         self.base_uri = None if base_uri is None else urlparse(base_uri).geturl()
         self._compatibility_mode = compatibility_mode
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state.pop('symbol_table', None)
+        state.pop('tokenizer', None)
+        return state
+
     @property
     def version(self):
         return '2.0'
@@ -337,6 +338,9 @@ class XPath2Parser(XPath1Parser):
         except KeyError:
             raise ElementPathKeyError("unknown type %r" % type_qname)
 
+    def is_schema_bound(self):
+        return 'symbol_table' in self.__dict__
+
     def parse(self, source):
         root_token = super(XPath1Parser, self).parse(source)
         if self.schema is None:
diff --git a/elementpath/xpath_selectors.py b/elementpath/xpath_selectors.py
index 152e5ef..159fb6d 100644
--- a/elementpath/xpath_selectors.py
+++ b/elementpath/xpath_selectors.py
@@ -8,6 +8,8 @@
 #
 # @author Davide Brunato
 #
+from __future__ import unicode_literals
+
 from .xpath_context import XPathContext
 from .xpath2_parser import XPath2Parser as XPath2Parser
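
Note on the schema binding above: the proxy, not the parser, now installs the
instance-level symbol table and tokenizer, and `__getstate__` drops both so a
pickled parser falls back to the class-level attributes instead of carrying
schema-derived state. A usage sketch, where `schema_proxy` is a hypothetical
placeholder for a concrete `AbstractSchemaProxy` implementation:

    from elementpath import XPath2Parser

    # Assumption: `schema_proxy` is a concrete AbstractSchemaProxy instance
    # (e.g. an XMLSchemaProxy wrapping an XSD schema).
    parser = XPath2Parser(schema=schema_proxy)

    # bind_parser() copied the class symbol table, registered a constructor
    # token for each atomic type of the schema and rebuilt the tokenizer.
    assert parser.is_schema_bound()

    # Schema-aware parsing, with type constructors available in expressions.
    root_token = parser.parse('xs:date("2000-01-01")')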