Add __getstate__() for XPath2Parser pickling

- An XPath2Parser drops its schema binding on pickling; the binding
    has to be restored by the schema objects. This is necessary because,
    when a schema is bound to a parser, token classes are created for
    the constructors of the XSD atomic types.
  - Add bind_parser() to AbstractSchemaProxy
  - Add is_schema_bound() to XPath2Parser
  - Add staticmethod create_tokenizer() to TDOP Parser
  - Rename Parser.build_tokenizer() to build()
This commit is contained in:
Davide Brunato 2019-10-08 17:36:06 +02:00
parent 9e2e5d45e3
commit d11524f50b
7 changed files with 106 additions and 80 deletions
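A minimal sketch of the intended pickle round-trip (MySchemaProxy and my_schema
are hypothetical stand-ins for a concrete AbstractSchemaProxy implementation
and its schema, and the proxy itself is assumed to be picklable):

    import pickle
    from elementpath import XPath2Parser

    proxy = MySchemaProxy(schema=my_schema)   # hypothetical concrete proxy
    parser = XPath2Parser(schema=proxy)
    assert parser.is_schema_bound()

    data = pickle.dumps(parser)      # __getstate__() drops symbol_table and tokenizer
    restored = pickle.loads(data)
    assert not restored.is_schema_bound()

    proxy.bind_parser(restored)      # the schema side restores the binding
    assert restored.is_schema_bound()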

View File

@@ -42,7 +42,6 @@ Parser base class
Parsing methods:
.. automethod:: build_tokenizer
.. automethod:: parse
.. automethod:: advance
.. automethod:: raw_advance
@@ -54,10 +53,11 @@ Parser base class
.. automethod:: is_line_start
.. automethod:: is_spaced
Helper methods for building effective parser classes:
Helper methods for building new parsers:
.. automethod:: register
.. automethod:: unregister
.. automethod:: duplicate
.. automethod:: literal
.. automethod:: nullary
.. automethod:: prefix
@@ -65,3 +65,5 @@ Parser base class
.. automethod:: infix
.. automethod:: infixr
.. automethod:: method
.. automethod:: build
.. automethod:: create_tokenizer

View File

@@ -144,6 +144,21 @@ class AbstractSchemaProxy(object):
self._schema = schema
self._base_element = base_element
def bind_parser(self, parser):
"""
Binds a parser instance to the schema proxy, adding the schema's atomic type constructors.
This method can be redefined in a concrete proxy to optimize schema bindings.
:param parser: a parser instance.
"""
if parser.schema is not self:
parser.schema = self
parser.symbol_table = parser.__class__.symbol_table.copy()
for xsd_type in self.iter_atomic_types():
parser.schema_constructor(xsd_type.name)
parser.tokenizer = parser.create_tokenizer(parser.symbol_table)
def get_context(self):
"""
Get a context instance for static analysis phase.
@@ -152,6 +167,15 @@ class AbstractSchemaProxy(object):
"""
return XPathSchemaContext(root=self._schema, item=self._base_element)
def find(self, path, namespaces=None):
"""
Find a schema element or attribute using an XPath expression.
:param path: an XPath expression that selects an element or an attribute node.
:param namespaces: an optional mapping from namespace prefix to namespace URI.
:return: The first matching schema component, or ``None`` if there is no match.
"""
@abstractmethod
def get_type(self, qname):
"""
@@ -185,12 +209,6 @@ class AbstractSchemaProxy(object):
:returns: an object that represents an XSD element or `None`.
"""
# TODO: can make this as @abstractmethod from release v1.3.1
def find(self, path, namespaces=None):
"""
Find the schema component using an XPath expression.
"""
@abstractmethod
def get_substitution_group(self, qname):
"""

View File

@@ -20,8 +20,6 @@ from abc import ABCMeta
from .compat import PY3, add_metaclass, MutableSequence
from .exceptions import ElementPathSyntaxError, ElementPathNameError, ElementPathValueError, ElementPathTypeError
###
# Regex based tokenizer
SPECIAL_SYMBOL_PATTERN = re.compile(r'\(\w+\)')
"""Compiled regular expression for matching special symbols, that are names between round brackets."""
@@ -50,49 +48,6 @@ def symbol_to_identifier(symbol):
return ''.join(get_id_name(c) for c in symbol)
def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
"""
Returns a regex-based tokenizer built from a symbol table of token classes.
The returned tokenizer skips extra spaces between symbols.
A regular expression is created from the symbol table of the parser using a template.
The symbols are inserted in the template with the longer symbols first. Symbols and
their patterns can't contain spaces.
:param symbol_table: a dictionary containing the token classes of the formal language.
:param name_pattern: pattern to use to match names.
"""
tokenizer_pattern_template = r"""
('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) | # Literals (strings and numbers)
(%s|[%s]) | # Symbols' patterns
(%s) | # Names
(\S) | # Unexpected characters
\s+ # Skip extra spaces
"""
patterns = [
t.pattern for s, t in symbol_table.items()
if SPECIAL_SYMBOL_PATTERN.match(s) is None
]
string_patterns = []
character_patterns = []
for p in patterns:
if ' ' in p:
raise ElementPathValueError('pattern %r contains spaces' % p)
length = len(p)
if length == 1 or length == 2 and p[0] == '\\':
character_patterns.append(p)
else:
string_patterns.append(p)
pattern = tokenizer_pattern_template % (
'|'.join(sorted(string_patterns, key=lambda x: -len(x))),
''.join(character_patterns),
name_pattern
)
return re.compile(pattern, re.VERBOSE)
#
# Simple top down parser based on Vaughan Pratt's algorithm (Top Down Operator Precedence).
#
@@ -358,21 +313,9 @@ class Parser(object):
tokenizer = None
symbol_table = {}
@classmethod
def build_tokenizer(cls, name_pattern='[A-Za-z0-9_]+'):
"""
Builds the parser class tokenizer using the symbol related patterns.
:param name_pattern: Pattern to use to match names.
"""
if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
cls.tokenizer = create_tokenizer(cls.symbol_table, name_pattern)
def __init__(self):
if self.tokenizer is None:
raise ValueError("Incomplete parser class %s registration." % self.__class__.__name__)
raise ValueError("The parser %r is not built!" % self.__class__)
self.token = None
self.match = None
self.next_token = None
@@ -730,3 +673,59 @@ class Parser(object):
setattr(token_class, func.__name__, func)
return func
return bind
@classmethod
def build(cls):
"""
Builds the parser class. Checks that all declared symbols are registered
and builds the regex tokenizer using the symbol-related patterns.
"""
if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
cls.tokenizer = cls.create_tokenizer(cls.symbol_table)
build_tokenizer = build # For backward compatibility
@staticmethod
def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
"""
Returns a regex-based tokenizer built from a symbol table of token classes.
The returned tokenizer skips extra spaces between symbols.
A regular expression is created from the symbol table of the parser using a template.
The symbols are inserted in the template with the longer symbols first. Symbols and
their patterns can't contain spaces.
:param symbol_table: a dictionary containing the token classes of the formal language.
:param name_pattern: pattern to use to match names.
"""
tokenizer_pattern_template = r"""
('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) | # Literals (strings and numbers)
(%s|[%s]) | # Symbols' patterns
(%s) | # Names
(\S) | # Unexpected characters
\s+ # Skip extra spaces
"""
patterns = [
t.pattern for s, t in symbol_table.items()
if SPECIAL_SYMBOL_PATTERN.match(s) is None
]
string_patterns = []
character_patterns = []
for p in patterns:
if ' ' in p:
raise ElementPathValueError('pattern %r contains spaces' % p)
length = len(p)
if length == 1 or length == 2 and p[0] == '\\':
character_patterns.append(p)
else:
string_patterns.append(p)
pattern = tokenizer_pattern_template % (
'|'.join(sorted(string_patterns, key=lambda x: -len(x))),
''.join(character_patterns),
name_pattern
)
return re.compile(pattern, re.VERBOSE)
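A quick exercise of the new staticmethod with a toy symbol table. FakeToken
stands in for the token classes that Parser.register() would create (only its
pattern attribute is read here), and the elementpath.tdop_parser import path
follows the module layout shown in this diff:

    from collections import namedtuple
    from elementpath.tdop_parser import Parser

    FakeToken = namedtuple('FakeToken', ['pattern'])

    symbol_table = {
        '+': FakeToken(r'\+'),         # one/two-char patterns go to the character class
        '*': FakeToken(r'\*'),
        'and': FakeToken(r'\band\b'),  # longer patterns go to the alternation
        '(name)': FakeToken(None),     # special symbols are skipped by the builder
    }
    tokenizer = Parser.create_tokenizer(symbol_table)

    for literal, symbol, name, unexpected in tokenizer.findall('a + 2 and foo'):
        if literal or symbol or name or unexpected:  # skip whitespace-only matches
            print(literal or symbol or name or unexpected)
    # prints: a + 2 and foo, one token per line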

View File

@@ -26,6 +26,7 @@ from .xpath_nodes import AttributeNode, NamespaceNode, TypedAttribute, TypedElem
is_etree_element, is_xpath_node, is_element_node, is_document_node, is_attribute_node, \
is_text_node, is_comment_node, is_processing_instruction_node, node_name
XML_NAME_CHARACTER = (u"A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF"
u"\u200C\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD")
XML_NCNAME_PATTERN = u"[{0}][-.0-9\u00B7\u0300-\u036F\u203F-\u2040{0}]*".format(XML_NAME_CHARACTER)
@@ -93,9 +94,9 @@ class XPath1Parser(Parser):
self.variables = dict(variables if variables is not None else [])
self.strict = strict
@classmethod
def build_tokenizer(cls, name_pattern=XML_NCNAME_PATTERN):
super(XPath1Parser, cls).build_tokenizer(name_pattern)
@staticmethod
def create_tokenizer(symbol_table, name_pattern=XML_NCNAME_PATTERN):
return Parser.create_tokenizer(symbol_table, name_pattern)
@property
def version(self):
@@ -1282,4 +1283,4 @@ def evaluate(self, context=None):
register('(end)')
XPath1Parser.build_tokenizer()
XPath1Parser.build()
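Because build_tokenizer is kept as an alias of build() in tdop_parser.py above,
code written against the old classmethod name keeps working:

    from elementpath.xpath1_parser import XPath1Parser

    XPath1Parser.build_tokenizer()   # deprecated spelling, same effect as .build()
    assert XPath1Parser.tokenizer is not None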

View File

@@ -601,4 +601,4 @@ def evaluate(self, context=None):
return DateTime10(dt.year, dt.month, dt.day, tm.hour, tm.minute, tm.second, tm.microsecond, tzinfo)
XPath2Parser.build_tokenizer() # XPath 2.0 definitions completed, build the tokenizer.
XPath2Parser.build() # XPath 2.0 definition complete, can build the parser class.

View File

@@ -26,8 +26,7 @@ from .namespaces import XSD_NAMESPACE, XPATH_FUNCTIONS_NAMESPACE, \
qname_to_prefixed, prefixed_to_qname, XSD_UNTYPED_ATOMIC
from .datatypes import UntypedAtomic, XSD_BUILTIN_TYPES
from .xpath_nodes import is_xpath_node
from .tdop_parser import create_tokenizer
from .xpath1_parser import XML_NCNAME_PATTERN, XPath1Parser
from .xpath1_parser import XPath1Parser
from .xpath_context import XPathSchemaContext
from .schema_proxy import AbstractSchemaProxy
@@ -167,19 +166,21 @@ class XPath2Parser(XPath1Parser):
self.function_namespace = function_namespace
if schema is None:
self.schema = None
pass
elif not isinstance(schema, AbstractSchemaProxy):
raise ElementPathTypeError("schema argument must be a subclass or instance of AbstractSchemaProxy!")
raise ElementPathTypeError("argument 'schema' must be an instance of AbstractSchemaProxy")
else:
self.schema = schema
self.symbol_table = self.symbol_table.copy()
for xsd_type in self.schema.iter_atomic_types():
self.schema_constructor(xsd_type.name)
self.tokenizer = create_tokenizer(self.symbol_table, XML_NCNAME_PATTERN)
schema.bind_parser(self)
self.base_uri = None if base_uri is None else urlparse(base_uri).geturl()
self._compatibility_mode = compatibility_mode
def __getstate__(self):
state = self.__dict__.copy()
state.pop('symbol_table', None)
state.pop('tokenizer', None)
return state
@property
def version(self):
return '2.0'
@@ -337,6 +338,9 @@ class XPath2Parser(XPath1Parser):
except KeyError:
raise ElementPathKeyError("unknown type %r" % type_qname)
def is_schema_bound(self):
return 'symbol_table' in self.__dict__
def parse(self, source):
root_token = super(XPath1Parser, self).parse(source)
if self.schema is None:
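Together, __getstate__() and is_schema_bound() make the binding state explicit.
A hedged sketch (proxy stands for any concrete AbstractSchemaProxy instance,
and a no-argument XPath2Parser() construction is assumed here):

    from elementpath import XPath2Parser

    parser = XPath2Parser()               # no schema: class-level symbol table in use
    assert not parser.is_schema_bound()
    assert 'symbol_table' not in parser.__dict__

    bound = XPath2Parser(schema=proxy)    # bind_parser() shadows the class attributes
    assert bound.is_schema_bound()
    assert bound.symbol_table is not XPath2Parser.symbol_table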

View File

@@ -8,6 +8,8 @@
#
# @author Davide Brunato <brunato@sissa.it>
#
from __future__ import unicode_literals
from .xpath_context import XPathContext
from .xpath2_parser import XPath2Parser as XPath2Parser