Add __getstate__() for XPath2Parser pickling
An XPath2Parser drops its schema binding on pickling; the binding has to be
restored by the schema objects. This is necessary because, when a schema is
bound to a parser, token classes are created for the XSD atomic type
constructors, and such dynamically created classes cannot be pickled.

- Add bind_parser() to AbstractSchemaProxy
- Add is_schema_bound() to XPath2Parser
- Add staticmethod create_tokenizer() to the TDOP Parser class
- Rename Parser.build_tokenizer() to build()
parent 9e2e5d45e3
commit d11524f50b
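After this change a parser that has no schema binding survives a pickle round
trip out of the box, because the instance dictionary no longer needs to carry
the symbol table or the tokenizer. A minimal sketch of the round trip,
assuming a version of elementpath with this commit applied:

    import pickle
    from elementpath import XPath2Parser

    parser = XPath2Parser()
    clone = pickle.loads(pickle.dumps(parser))  # __getstate__() drops 'symbol_table' and 'tokenizer'
    print(clone.parse('2 + 3').evaluate())      # -> 5, the class-level tokenizer is reused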
@@ -42,7 +42,6 @@ Parser base class

 Parsing methods:

-.. automethod:: build_tokenizer
 .. automethod:: parse
 .. automethod:: advance
 .. automethod:: raw_advance

@@ -54,10 +53,11 @@ Parser base class
 .. automethod:: is_line_start
 .. automethod:: is_spaced

-Helper methods for building effective parser classes:
+Helper methods for building new parsers:

 .. automethod:: register
 .. automethod:: unregister
 .. automethod:: duplicate
 .. automethod:: literal
 .. automethod:: nullary
 .. automethod:: prefix

@@ -65,3 +65,5 @@ Parser base class
 .. automethod:: infix
 .. automethod:: infixr
 .. automethod:: method
+.. automethod:: build
+.. automethod:: create_tokenizer
@@ -144,6 +144,21 @@ class AbstractSchemaProxy(object):
         self._schema = schema
         self._base_element = base_element

+    def bind_parser(self, parser):
+        """
+        Binds a parser instance with the schema proxy, adding the schema's atomic type constructors.
+        This method can be redefined in a concrete proxy to optimize schema bindings.
+
+        :param parser: a parser instance.
+        """
+        if parser.schema is not self:
+            parser.schema = self
+
+        parser.symbol_table = parser.__class__.symbol_table.copy()
+        for xsd_type in self.iter_atomic_types():
+            parser.schema_constructor(xsd_type.name)
+        parser.tokenizer = parser.create_tokenizer(parser.symbol_table)
+
     def get_context(self):
         """
         Get a context instance for static analysis phase.

@@ -152,6 +167,15 @@ class AbstractSchemaProxy(object):
         """
         return XPathSchemaContext(root=self._schema, item=self._base_element)

+    def find(self, path, namespaces=None):
+        """
+        Find a schema element or attribute using an XPath expression.
+
+        :param path: an XPath expression that selects an element or an attribute node.
+        :param namespaces: an optional mapping from namespace prefixes to namespace URIs.
+        :return: the first matching schema component, or ``None`` if there is no match.
+        """
+
     @abstractmethod
     def get_type(self, qname):
         """

@@ -185,12 +209,6 @@ class AbstractSchemaProxy(object):
         :returns: an object that represents an XSD element or `None`.
         """

-    # TODO: can make this as @abstractmethod from release v1.3.1
-    def find(self, path, namespaces=None):
-        """
-        Find the schema component using an XPath expression.
-        """
-
     @abstractmethod
     def get_substitution_group(self, qname):
         """
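bind_parser() is the hook that schema objects use to (re)establish a binding,
for example after a parser has been unpickled. Since the docstring invites
overriding it for faster bindings, here is a hedged sketch of one possible
optimization: a proxy that builds the bound symbol table and tokenizer once
and reuses them. The cache attributes are hypothetical and the abstract
methods are omitted, so this is an illustration, not code from the library:

    from elementpath.schema_proxy import AbstractSchemaProxy

    class CachingSchemaProxy(AbstractSchemaProxy):
        # Abstract methods (get_type, iter_atomic_types, ...) omitted for brevity.
        _cached_symbol_table = None   # hypothetical cache attributes
        _cached_tokenizer = None

        def bind_parser(self, parser):
            if parser.schema is not self:
                parser.schema = self
            if self._cached_symbol_table is None:
                # First binding: do the full rebuild, then memoize the results.
                super(CachingSchemaProxy, self).bind_parser(parser)
                self._cached_symbol_table = parser.symbol_table
                self._cached_tokenizer = parser.tokenizer
            else:
                parser.symbol_table = self._cached_symbol_table
                parser.tokenizer = self._cached_tokenizer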
@@ -20,8 +20,6 @@ from abc import ABCMeta
 from .compat import PY3, add_metaclass, MutableSequence
 from .exceptions import ElementPathSyntaxError, ElementPathNameError, ElementPathValueError, ElementPathTypeError


-###
-# Regex based tokenizer
 SPECIAL_SYMBOL_PATTERN = re.compile(r'\(\w+\)')
 """Compiled regular expression for matching special symbols, that are names between round brackets."""

@@ -50,49 +48,6 @@ def symbol_to_identifier(symbol):
     return ''.join(get_id_name(c) for c in symbol)


-def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
-    """
-    Returns a regex based tokenizer built from a symbol table of token classes.
-    The returned tokenizer skips extra spaces between symbols.
-
-    A regular expression is created from the symbol table of the parser using a template.
-    The symbols are inserted in the template putting the longer symbols first. Symbols and
-    their patterns can't contain spaces.
-
-    :param symbol_table: a dictionary containing the token classes of the formal language.
-    :param name_pattern: pattern to use to match names.
-    """
-    tokenizer_pattern_template = r"""
-        ('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) |   # Literals (string and numbers)
-        (%s|[%s]) |                                                        # Symbol's patterns
-        (%s) |                                                             # Names
-        (\S) |                                                             # Unexpected characters
-        \s+                                                                # Skip extra spaces
-    """
-    patterns = [
-        t.pattern for s, t in symbol_table.items()
-        if SPECIAL_SYMBOL_PATTERN.match(s) is None
-    ]
-    string_patterns = []
-    character_patterns = []
-
-    for p in patterns:
-        if ' ' in p:
-            raise ElementPathValueError('pattern %r contains spaces' % p)
-        length = len(p)
-        if length == 1 or length == 2 and p[0] == '\\':
-            character_patterns.append(p)
-        else:
-            string_patterns.append(p)
-
-    pattern = tokenizer_pattern_template % (
-        '|'.join(sorted(string_patterns, key=lambda x: -len(x))),
-        ''.join(character_patterns),
-        name_pattern
-    )
-    return re.compile(pattern, re.VERBOSE)
-
-
 #
 # Simple top down parser based on Vaughan Pratt's algorithm (Top Down Operator Precedence).
 #
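The module-level create_tokenizer() above moves unchanged into the Parser
class as a @staticmethod (see the hunks below). To see what the generated
regex does, here is a hedged sketch with a toy symbol table: real symbol
tables map symbols to token classes, but only a `pattern` attribute matters
to this builder, so a namedtuple stands in for them (an assumption made for
brevity):

    import re
    from collections import namedtuple
    from elementpath.tdop_parser import Parser

    FakeToken = namedtuple('FakeToken', ['pattern'])
    symbol_table = {
        '+': FakeToken(r'\+'),      # short escaped pattern -> character class branch
        '<=': FakeToken(r'<='),     # longer pattern -> string alternation branch
        '(name)': FakeToken(None),  # special symbols are filtered out by the builder
    }

    tokenizer = Parser.create_tokenizer(symbol_table)
    for match in tokenizer.finditer('a <= b + 10'):
        if any(g is not None for g in match.groups()):  # skip the whitespace branch
            print(match.groups())  # one group (literal, symbol, name, unexpected) is set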
@@ -358,21 +313,9 @@ class Parser(object):
     tokenizer = None
     symbol_table = {}

-    @classmethod
-    def build_tokenizer(cls, name_pattern='[A-Za-z0-9_]+'):
-        """
-        Builds the parser class tokenizer using the symbol related patterns.
-
-        :param name_pattern: pattern to use to match names.
-        """
-        if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
-            unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
-            raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
-        cls.tokenizer = create_tokenizer(cls.symbol_table, name_pattern)
-
     def __init__(self):
         if self.tokenizer is None:
-            raise ValueError("Incomplete parser class %s registration." % self.__class__.__name__)
+            raise ValueError("The parser %r is not built!" % self.__class__)
         self.token = None
         self.match = None
         self.next_token = None
@@ -730,3 +673,59 @@ class Parser(object):
             setattr(token_class, func.__name__, func)
             return func
         return bind
+
+    @classmethod
+    def build(cls):
+        """
+        Builds the parser class: checks that all declared symbols are defined
+        and builds the regex tokenizer using the symbol related patterns.
+        """
+        if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
+            unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
+            raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
+        cls.tokenizer = cls.create_tokenizer(cls.symbol_table)
+
+    build_tokenizer = build  # For backward compatibility
+
+    @staticmethod
+    def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
+        """
+        Returns a regex based tokenizer built from a symbol table of token classes.
+        The returned tokenizer skips extra spaces between symbols.
+
+        A regular expression is created from the symbol table of the parser using a template.
+        The symbols are inserted in the template putting the longer symbols first. Symbols and
+        their patterns can't contain spaces.
+
+        :param symbol_table: a dictionary containing the token classes of the formal language.
+        :param name_pattern: pattern to use to match names.
+        """
+        tokenizer_pattern_template = r"""
+            ('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) |   # Literals (string and numbers)
+            (%s|[%s]) |                                                        # Symbol's patterns
+            (%s) |                                                             # Names
+            (\S) |                                                             # Unexpected characters
+            \s+                                                                # Skip extra spaces
+        """
+        patterns = [
+            t.pattern for s, t in symbol_table.items()
+            if SPECIAL_SYMBOL_PATTERN.match(s) is None
+        ]
+        string_patterns = []
+        character_patterns = []
+
+        for p in patterns:
+            if ' ' in p:
+                raise ElementPathValueError('pattern %r contains spaces' % p)
+            length = len(p)
+            if length == 1 or length == 2 and p[0] == '\\':
+                character_patterns.append(p)
+            else:
+                string_patterns.append(p)
+
+        pattern = tokenizer_pattern_template % (
+            '|'.join(sorted(string_patterns, key=lambda x: -len(x))),
+            ''.join(character_patterns),
+            name_pattern
+        )
+        return re.compile(pattern, re.VERBOSE)
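The old entry point is kept as an alias, so existing code that calls
build_tokenizer() keeps working. A quick check, assuming this commit:

    from elementpath import XPath1Parser

    XPath1Parser.build()            # new spelling
    XPath1Parser.build_tokenizer()  # backward-compatible alias, same classmethod
    assert XPath1Parser.tokenizer is not None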
@@ -26,6 +26,7 @@ from .xpath_nodes import AttributeNode, NamespaceNode, TypedAttribute, TypedElem
     is_etree_element, is_xpath_node, is_element_node, is_document_node, is_attribute_node, \
     is_text_node, is_comment_node, is_processing_instruction_node, node_name

+
 XML_NAME_CHARACTER = (u"A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF"
                       u"\u200C\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD")
 XML_NCNAME_PATTERN = u"[{0}][-.0-9\u00B7\u0300-\u036F\u203F-\u2040{0}]*".format(XML_NAME_CHARACTER)

@@ -93,9 +94,9 @@ class XPath1Parser(Parser):
         self.variables = dict(variables if variables is not None else [])
         self.strict = strict

-    @classmethod
-    def build_tokenizer(cls, name_pattern=XML_NCNAME_PATTERN):
-        super(XPath1Parser, cls).build_tokenizer(name_pattern)
+    @staticmethod
+    def create_tokenizer(symbol_table, name_pattern=XML_NCNAME_PATTERN):
+        return Parser.create_tokenizer(symbol_table, name_pattern)

     @property
     def version(self):
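The override only swaps the default name pattern for the XML NCName one, so
XPath parsers tokenize Unicode names correctly. A small hedged illustration
of what that pattern accepts, assuming this module layout:

    import re
    from elementpath.xpath1_parser import XML_NCNAME_PATTERN

    name_re = re.compile(XML_NCNAME_PATTERN)
    print(bool(name_re.match('données')))  # True: Unicode letters are allowed
    print(bool(name_re.match('1abc')))     # False: an NCName can't start with a digit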
@@ -1282,4 +1283,4 @@ def evaluate(self, context=None):


 register('(end)')
-XPath1Parser.build_tokenizer()
+XPath1Parser.build()
@@ -601,4 +601,4 @@ def evaluate(self, context=None):
     return DateTime10(dt.year, dt.month, dt.day, tm.hour, tm.minute, tm.second, tm.microsecond, tzinfo)


-XPath2Parser.build_tokenizer()  # XPath 2.0 definitions completed, build the tokenizer.
+XPath2Parser.build()  # XPath 2.0 definitions complete, the parser class can be built.
@@ -26,8 +26,7 @@ from .namespaces import XSD_NAMESPACE, XPATH_FUNCTIONS_NAMESPACE, \
     qname_to_prefixed, prefixed_to_qname, XSD_UNTYPED_ATOMIC
 from .datatypes import UntypedAtomic, XSD_BUILTIN_TYPES
 from .xpath_nodes import is_xpath_node
-from .tdop_parser import create_tokenizer
-from .xpath1_parser import XML_NCNAME_PATTERN, XPath1Parser
+from .xpath1_parser import XPath1Parser
 from .xpath_context import XPathSchemaContext
 from .schema_proxy import AbstractSchemaProxy
@@ -167,19 +166,21 @@ class XPath2Parser(XPath1Parser):
         self.function_namespace = function_namespace

         if schema is None:
-            self.schema = None
+            pass
         elif not isinstance(schema, AbstractSchemaProxy):
-            raise ElementPathTypeError("schema argument must be a subclass or instance of AbstractSchemaProxy!")
+            raise ElementPathTypeError("argument 'schema' must be an instance of AbstractSchemaProxy")
         else:
             self.schema = schema
-            self.symbol_table = self.symbol_table.copy()
-            for xsd_type in self.schema.iter_atomic_types():
-                self.schema_constructor(xsd_type.name)
-            self.tokenizer = create_tokenizer(self.symbol_table, XML_NCNAME_PATTERN)
+            schema.bind_parser(self)

         self.base_uri = None if base_uri is None else urlparse(base_uri).geturl()
         self._compatibility_mode = compatibility_mode

+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state.pop('symbol_table', None)
+        state.pop('tokenizer', None)
+        return state
+
     @property
     def version(self):
         return '2.0'
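For a schema-bound parser the instance-level symbol table and tokenizer are
stripped by __getstate__(), so the binding has to be restored on the other
side of the pickle. A hedged sketch of the intended flow, where make_proxy()
is a hypothetical factory for a concrete AbstractSchemaProxy (for example the
proxy provided by the xmlschema package), and the proxy itself is assumed to
be picklable:

    import pickle
    from elementpath import XPath2Parser

    proxy = make_proxy()                 # hypothetical: yields a concrete AbstractSchemaProxy
    parser = XPath2Parser(schema=proxy)  # __init__ calls proxy.bind_parser(parser)

    restored = pickle.loads(pickle.dumps(parser))
    if not restored.is_schema_bound():
        proxy.bind_parser(restored)      # rebuild the constructors and the tokenizer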
@@ -337,6 +338,9 @@ class XPath2Parser(XPath1Parser):
         except KeyError:
             raise ElementPathKeyError("unknown type %r" % type_qname)

+    def is_schema_bound(self):
+        return 'symbol_table' in self.__dict__
+
     def parse(self, source):
         root_token = super(XPath1Parser, self).parse(source)
         if self.schema is None:
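is_schema_bound() relies on the fact that only bind_parser() copies the
symbol table into the instance __dict__; an unbound parser falls back to the
class attribute. A minimal check, assuming no schema is bound:

    from elementpath import XPath2Parser

    parser = XPath2Parser()
    print(parser.is_schema_bound())  # False: 'symbol_table' is still a class attribute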
@@ -8,6 +8,8 @@
 #
 # @author Davide Brunato <brunato@sissa.it>
 #
 from __future__ import unicode_literals

+from .xpath_context import XPathContext
+from .xpath2_parser import XPath2Parser as XPath2Parser
