Add __getstate__() for XPath2Parser pickling

- An XPath2Parser drops its schema binding on pickling; the binding
    has to be restored by the schema objects. This is necessary because,
    when a schema is bound to a parser, token classes are created
    for the constructors of the XSD atomic types.
  - Add bind_parser() to AbstractSchemaProxy
  - Add is_schema_bound() to XPath2Parser
  - Add staticmethod create_tokenizer() to TDOP Parser
  - Rename Parser.build_tokenizer() to build()
This commit is contained in:
Davide Brunato 2019-10-08 17:36:06 +02:00
parent 9e2e5d45e3
commit d11524f50b
7 changed files with 106 additions and 80 deletions

View File

@ -42,7 +42,6 @@ Parser base class
Parsing methods:
.. automethod:: build_tokenizer
.. automethod:: parse
.. automethod:: advance
.. automethod:: raw_advance
@ -54,10 +53,11 @@ Parser base class
.. automethod:: is_line_start
.. automethod:: is_spaced
Helper methods for building effective parser classes:
Helper methods for building new parsers:
.. automethod:: register
.. automethod:: unregister
.. automethod:: duplicate
.. automethod:: literal
.. automethod:: nullary
.. automethod:: prefix
@ -65,3 +65,5 @@ Parser base class
.. automethod:: infix
.. automethod:: infixr
.. automethod:: method
.. automethod:: build
.. automethod:: create_tokenizer

View File

@ -144,6 +144,21 @@ class AbstractSchemaProxy(object):
self._schema = schema
self._base_element = base_element
def bind_parser(self, parser):
Binds a parser instance with schema proxy adding the schema's atomic types constructors.
This method can be redefined in a concrete proxy to optimize schema bindings.
:param parser: a parser instance.
if parser.schema is not self:
parser.schema = self
parser.symbol_table = parser.__class__.symbol_table.copy()
for xsd_type in self.iter_atomic_types():
parser.tokenizer = parser.create_tokenizer(parser.symbol_table)
def get_context(self):
Get a context instance for static analysis phase.
@ -152,6 +167,15 @@ class AbstractSchemaProxy(object):
return XPathSchemaContext(root=self._schema, item=self._base_element)
def find(self, path, namespaces=None):
Find a schema element or attribute using an XPath expression.
:param path: an XPath expression that selects an element or an attribute node.
:param namespaces: an optional mapping from namespace prefix to namespace URI.
:return: The first matching schema component, or ``None`` if there is no match.
def get_type(self, qname):
@ -185,12 +209,6 @@ class AbstractSchemaProxy(object):
:returns: an object that represents an XSD element or `None`.
# TODO: can make this as @abstractmethod from release v1.3.1
def find(self, path, namespaces=None):
Find the schema component using an XPath expression.
def get_substitution_group(self, qname):

View File

@ -20,8 +20,6 @@ from abc import ABCMeta
from .compat import PY3, add_metaclass, MutableSequence
from .exceptions import ElementPathSyntaxError, ElementPathNameError, ElementPathValueError, ElementPathTypeError
# Regex based tokenizer
SPECIAL_SYMBOL_PATTERN = re.compile(r'\(\w+\)')
"""Compiled regular expression for matching special symbols, that are names between round brackets."""
@ -50,49 +48,6 @@ def symbol_to_identifier(symbol):
return ''.join(get_id_name(c) for c in symbol)
def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
Returns a regex based tokenizer built from a symbol table of token classes.
The returned tokenizer skips extra spaces between symbols.
A regular expression is created from the symbol table of the parser using a template.
The symbols are inserted in the template putting the longer symbols first. Symbols and
their patterns can't contain spaces.
:param symbol_table: a dictionary containing the token classes of the formal language.
:param name_pattern: pattern to use to match names.
tokenizer_pattern_template = r"""
('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) | # Literals (string and numbers)
(%s|[%s]) | # Symbol's patterns
(%s) | # Names
(\S) | # Unexpected characters
\s+ # Skip extra spaces
patterns = [
t.pattern for s, t in symbol_table.items()
if SPECIAL_SYMBOL_PATTERN.match(s) is None
string_patterns = []
character_patterns = []
for p in patterns:
if ' ' in p:
raise ElementPathValueError('pattern %r contains spaces' % p)
length = len(p)
if length == 1 or length == 2 and p[0] == '\\':
pattern = tokenizer_pattern_template % (
'|'.join(sorted(string_patterns, key=lambda x: -len(x))),
return re.compile(pattern, re.VERBOSE)
# Simple top down parser based on Vaughan Pratt's algorithm (Top Down Operator Precedence).
@ -358,21 +313,9 @@ class Parser(object):
tokenizer = None
symbol_table = {}
def build_tokenizer(cls, name_pattern='[A-Za-z0-9_]+'):
Builds the parser class tokenizer using the symbol related patterns.
:param name_pattern: Pattern to use to match names.
if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
cls.tokenizer = create_tokenizer(cls.symbol_table, name_pattern)
def __init__(self):
if self.tokenizer is None:
raise ValueError("Incomplete parser class %s registration." % self.__class__.__name__)
raise ValueError("The parser %r is not built!" % self.__class__)
self.token = None
self.match = None
self.next_token = None
@ -730,3 +673,59 @@ class Parser(object):
setattr(token_class, func.__name__, func)
return func
return bind
def build(cls):
Builds the parser class. Checks if all declared symbols are defined
and builds the regex tokenizer using the symbol related patterns.
if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
cls.tokenizer = cls.create_tokenizer(cls.symbol_table)
build_tokenizer = build # For backward compatibility
def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
Returns a regex based tokenizer built from a symbol table of token classes.
The returned tokenizer skips extra spaces between symbols.
A regular expression is created from the symbol table of the parser using a template.
The symbols are inserted in the template putting the longer symbols first. Symbols and
their patterns can't contain spaces.
:param symbol_table: a dictionary containing the token classes of the formal language.
:param name_pattern: pattern to use to match names.
tokenizer_pattern_template = r"""
('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) | # Literals (string and numbers)
(%s|[%s]) | # Symbol's patterns
(%s) | # Names
(\S) | # Unexpected characters
\s+ # Skip extra spaces
patterns = [
t.pattern for s, t in symbol_table.items()
if SPECIAL_SYMBOL_PATTERN.match(s) is None
string_patterns = []
character_patterns = []
for p in patterns:
if ' ' in p:
raise ElementPathValueError('pattern %r contains spaces' % p)
length = len(p)
if length == 1 or length == 2 and p[0] == '\\':
pattern = tokenizer_pattern_template % (
'|'.join(sorted(string_patterns, key=lambda x: -len(x))),
return re.compile(pattern, re.VERBOSE)

View File

@ -26,6 +26,7 @@ from .xpath_nodes import AttributeNode, NamespaceNode, TypedAttribute, TypedElem
is_etree_element, is_xpath_node, is_element_node, is_document_node, is_attribute_node, \
is_text_node, is_comment_node, is_processing_instruction_node, node_name
XML_NAME_CHARACTER = (u"A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF"
XML_NCNAME_PATTERN = u"[{0}][-.0-9\u00B7\u0300-\u036F\u203F-\u2040{0}]*".format(XML_NAME_CHARACTER)
@ -93,9 +94,9 @@ class XPath1Parser(Parser):
self.variables = dict(variables if variables is not None else [])
self.strict = strict
def build_tokenizer(cls, name_pattern=XML_NCNAME_PATTERN):
super(XPath1Parser, cls).build_tokenizer(name_pattern)
def create_tokenizer(symbol_table, name_pattern=XML_NCNAME_PATTERN):
return Parser.create_tokenizer(symbol_table, name_pattern)
def version(self):
@ -1282,4 +1283,4 @@ def evaluate(self, context=None):

View File

@ -601,4 +601,4 @@ def evaluate(self, context=None):
return DateTime10(dt.year, dt.month,, tm.hour, tm.minute, tm.second, tm.microsecond, tzinfo)
XPath2Parser.build_tokenizer() # XPath 2.0 definitions completed, build the tokenizer. # XPath 2.0 definition complete, can build the parser class.

View File

@ -26,8 +26,7 @@ from .namespaces import XSD_NAMESPACE, XPATH_FUNCTIONS_NAMESPACE, \
qname_to_prefixed, prefixed_to_qname, XSD_UNTYPED_ATOMIC
from .datatypes import UntypedAtomic, XSD_BUILTIN_TYPES
from .xpath_nodes import is_xpath_node
from .tdop_parser import create_tokenizer
from .xpath1_parser import XML_NCNAME_PATTERN, XPath1Parser
from .xpath1_parser import XPath1Parser
from .xpath_context import XPathSchemaContext
from .schema_proxy import AbstractSchemaProxy
@ -167,19 +166,21 @@ class XPath2Parser(XPath1Parser):
self.function_namespace = function_namespace
if schema is None:
self.schema = None
elif not isinstance(schema, AbstractSchemaProxy):
raise ElementPathTypeError("schema argument must be a subclass or instance of AbstractSchemaProxy!")
raise ElementPathTypeError("argument 'schema' must be an instance of AbstractSchemaProxy")
self.schema = schema
self.symbol_table = self.symbol_table.copy()
for xsd_type in self.schema.iter_atomic_types():
self.tokenizer = create_tokenizer(self.symbol_table, XML_NCNAME_PATTERN)
self.base_uri = None if base_uri is None else urlparse(base_uri).geturl()
self._compatibility_mode = compatibility_mode
def __getstate__(self):
state = self.__dict__.copy()
state.pop('symbol_table', None)
state.pop('tokenizer', None)
return state
def version(self):
return '2.0'
@ -337,6 +338,9 @@ class XPath2Parser(XPath1Parser):
except KeyError:
raise ElementPathKeyError("unknown type %r" % type_qname)
def is_schema_bound(self):
return 'symbol_table' in self.__dict__
def parse(self, source):
root_token = super(XPath1Parser, self).parse(source)
if self.schema is None:

View File

@ -8,6 +8,8 @@
# @author Davide Brunato <>
from __future__ import unicode_literals
from .xpath_context import XPathContext
from .xpath2_parser import XPath2Parser as XPath2Parser