debian-elementpath/elementpath/tdop_parser.py
# -*- coding: utf-8 -*-
#
# Copyright (c), 2018-2019, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato <brunato@sissa.it>
#
"""
This module contains classes and helper functions for defining Pratt parsers.
"""
import sys
import re
from unicodedata import name as unicode_name
from decimal import Decimal
from abc import ABCMeta
from .compat import PY3, add_metaclass, MutableSequence
from .exceptions import ElementPathSyntaxError, ElementPathNameError, ElementPathValueError, ElementPathTypeError
SPECIAL_SYMBOL_PATTERN = re.compile(r'\(\w+\)')
"""Compiled regular expression for matching special symbols, that are names between round brackets."""
SPACE_PATTERN = re.compile(r'\s')
def symbol_to_identifier(symbol):
"""
Converts a symbol string to an identifier (only alphanumeric and '_').
"""
def get_id_name(c):
if c in ('_', '-', ' '):
return '_'
elif c.isalnum():
return c
elif PY3:
return unicode_name(c).title().replace(' ', '')
else:
return unicode_name(unicode(c)).title().replace(' ', '')
if symbol.isalnum():
return symbol
elif SPECIAL_SYMBOL_PATTERN.match(symbol):
return symbol[1:-1]
else:
return ''.join(get_id_name(c) for c in symbol)
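# A few illustrative conversions (a sketch, following the rules above):
#
#     symbol_to_identifier('contains')   # -> 'contains'   (alphanumeric, kept as-is)
#     symbol_to_identifier('(integer)')  # -> 'integer'    (special symbol, brackets stripped)
#     symbol_to_identifier('::')         # -> 'ColonColon' (built from Unicode character names)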
#
# Simple top down parser based on Vaughan Pratt's algorithm (Top Down Operator Precedence).
#
# References:
#
# https://tdop.github.io/ (Vaughan R. Pratt's "Top Down Operator Precedence" - 1973)
# http://crockford.com/javascript/tdop/tdop.html (Douglas Crockford - 2007)
# http://effbot.org/zone/simple-top-down-parsing.htm (Fredrik Lundh - 2008)
#
# This implementation is based on a base class for tokens and a base class for parsers.
# A real parser is built with a derivation of the base parser class followed by the
# registrations of token classes for the symbols of the language.
#
# A parser can be extended by derivation, copying the reusable token classes and
# defining the additional ones. See the files xpath1_parser.py and xpath2_parser.py
# for a full implementation example of a real parser.
#
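# A minimal sketch of how a concrete parser is assembled (hypothetical symbols,
# for illustration only; note also that ParserMeta permits a single parser class
# definition per module, so a real parser must live in its own module):
#
#     class CalcParser(Parser):
#         SYMBOLS = {'(integer)', '(end)', '+', '*', 'div'}
#
#     CalcParser.literal('(integer)')
#     CalcParser.register('(end)')
#     CalcParser.infix('+', bp=40)
#     CalcParser.infix('*', bp=50)
#     CalcParser.infix('div', bp=50)
#     CalcParser.build()
#
#     CalcParser().parse('1 + 2 * 3').tree  # -> '(+ (1) (* (2) (3)))'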
class MultiLabel(object):
"""
    Helper class for defining multi-value labels for tokens, useful when a symbol
    has more than one role. A label of this type compares equal to each of its values.
Example:
label = MultiLabel('function', 'operator')
label == 'symbol' # False
label == 'function' # True
label == 'operator' # True
"""
def __init__(self, *values):
self.values = values
def __eq__(self, other):
return any(other == v for v in self.values)
def __ne__(self, other):
return all(other != v for v in self.values)
def __str__(self):
return '_'.join(self.values)
def __unicode__(self):
return u'_'.join(self.values)
if PY3:
__str__ = __unicode__
class Token(MutableSequence):
"""
Token base class for defining a parser based on Pratt's method.
    Each token instance is a list-like object. The number of a token's items is
    the arity of the represented operator, and the items are its operands.
    Nullary operators are used for symbols, names and literals. Tokens with items
    represent the other operators (unary, binary and so on).
    Each token class has a *symbol*, an lbp (left binding power) value and an rbp
    (right binding power) value, used in the sense described by Pratt's method.
    This implementation of Pratt tokens includes two extra attributes, *pattern*
    and *label*, that can be used to simplify the parsing of symbols in a concrete parser.
:param parser: The parser instance that creates the token instance.
    :param value: the token value. If not provided, defaults to the token symbol.
:cvar symbol: the symbol of the token class.
:cvar lbp: Pratt's left binding power, defaults to 0.
:cvar rbp: Pratt's right binding power, defaults to 0.
    :cvar pattern: the regex pattern used for the token class. Defaults to the escaped symbol. \
    Can be customized to match more detailed conditions (e.g. a function with its left round bracket), \
    in order to simplify the related code.
    :cvar label: defines the typology of the token class. Its value is used in representations of the \
    token instance and can be used to restrict code choices without more complicated analysis. The label \
    value can be set as needed by the parser implementation (e.g. 'function', 'axis', 'constructor' are \
    used by the XPath parsers). In the base parser class it defaults to 'symbol', with 'literal' and \
    'operator' as possible alternatives. If set to a tuple of values the token class label is transformed \
    to a multi-value label, which means the token class can cover multiple roles (e.g. as an XPath \
    function or axis). In those cases the definitive role is defined at parse time (by the nud and/or \
    led methods) after the token instance creation.
"""
symbol = None # the token identifier, key in the token table.
lbp = 0 # left binding power
rbp = 0 # right binding power
pattern = None # the token regex pattern, for building the tokenizer.
label = 'symbol' # optional label
def __init__(self, parser, value=None):
self.parser = parser
self.value = value if value is not None else self.symbol
self._operands = []
def __getitem__(self, i):
return self._operands[i]
def __setitem__(self, i, item):
self._operands[i] = item
def __delitem__(self, i):
del self._operands[i]
def __len__(self):
return len(self._operands)
def insert(self, i, item):
self._operands.insert(i, item)
def __str__(self):
symbol = self.symbol
if SPECIAL_SYMBOL_PATTERN.match(symbol) is not None:
return '%r %s' % (self.value, symbol[1:-1])
else:
return '%r %s' % (symbol, self.label)
def __repr__(self):
symbol, value = self.symbol, self.value
if value != symbol:
return u'%s(value=%r)' % (self.__class__.__name__, value)
else:
return u'%s()' % self.__class__.__name__
    def __eq__(self, other):
        return self.symbol == other.symbol and self.value == other.value
@property
def arity(self):
return len(self)
@property
def tree(self):
"""Returns a tree representation string."""
symbol, length = self.symbol, len(self)
if symbol == '(name)':
return u'(%s)' % self.value
elif SPECIAL_SYMBOL_PATTERN.match(symbol) is not None:
return u'(%r)' % self.value
elif symbol == '(':
return '()' if not self else self[0].tree
elif not length:
return u'(%s)' % symbol
else:
return u'(%s %s)' % (symbol, ' '.join(item.tree for item in self))
@property
def source(self):
"""Returns the source representation string."""
symbol = self.symbol
if symbol == '(name)':
return self.value
elif symbol == '(decimal)':
return str(self.value)
elif SPECIAL_SYMBOL_PATTERN.match(symbol) is not None:
return repr(self.value)
else:
length = len(self)
if not length:
return symbol
elif length == 1:
return u'%s %s' % (symbol, self[0].source)
elif length == 2:
return u'%s %s %s' % (self[0].source, symbol, self[1].source)
else:
return u'%s %s' % (symbol, ' '.join(item.source for item in self))
def nud(self):
"""Pratt's null denotation method"""
self.wrong_syntax()
def led(self, left):
"""Pratt's left denotation method"""
self.wrong_syntax()
def evaluate(self, *args, **kwargs):
"""Evaluation method"""
def iter(self):
"""Returns a generator for iterating the token's tree."""
for t in self[:1]:
for token in t.iter():
yield token
yield self
for t in self[1:]:
for token in t.iter():
yield token
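    # Note: iter() yields tokens in infix order, e.g. for a tree (+ (1) (2))
    # it yields the '1' token, then the '+' token, then the '2' token.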
def expected(self, *symbols):
if symbols and self.symbol not in symbols:
self.wrong_syntax()
def unexpected(self, *symbols):
if not symbols or self.symbol in symbols:
self.wrong_syntax()
def wrong_syntax(self, message=None):
symbol = self.value if SPECIAL_SYMBOL_PATTERN.match(self.symbol) is not None else self.symbol
line_column = 'line %d, column %d' % self.parser.position
token = self.parser.token
if token is not None and symbol != token.symbol:
msg = "symbol %r after %s at %s" % (symbol, token, line_column)
else:
msg = "symbol %r at %s" % (symbol, line_column)
if message:
raise ElementPathSyntaxError('%s: %s' % (msg, message), self)
else:
raise ElementPathSyntaxError('unexpected %s.' % msg, self)
def wrong_value(self, message='unknown error'):
raise ElementPathValueError(message, self)
def wrong_type(self, message='unknown error'):
raise ElementPathTypeError(message, self)
class ParserMeta(type):
def __new__(mcs, name, bases, namespace):
cls = super(ParserMeta, mcs).__new__(mcs, name, bases, namespace)
        # Avoid more than one parser class definition per module
        for v in sys.modules[cls.__module__].__dict__.values():
if isinstance(v, ParserMeta) and v.__module__ == cls.__module__:
raise RuntimeError("Multiple parser class definitions per module are not permitted: %r" % cls)
# Checks and initializes class attributes
if not hasattr(cls, 'token_base_class'):
cls.token_base_class = Token
if 'tokenizer' not in namespace:
cls.tokenizer = None
if 'SYMBOLS' not in namespace:
cls.SYMBOLS = set()
for base_class in bases:
if hasattr(base_class, 'SYMBOLS'):
cls.SYMBOLS.update(base_class.SYMBOLS)
break
if 'symbol_table' not in namespace:
cls.symbol_table = {}
for base_class in bases:
if hasattr(base_class, 'symbol_table'):
cls.symbol_table.update(base_class.symbol_table)
break
return cls
def __init__(cls, name, bases, namespace):
super(ParserMeta, cls).__init__(name, bases, namespace)
@add_metaclass(ParserMeta)
class Parser(object):
"""
Parser class for implementing a Top Down Operator Precedence parser.
    :cvar SYMBOLS: the symbols of the definable tokens for the parser. In the base class it's an \
    immutable set that contains the symbols for special tokens (literals, names and the end token). \
    It has to be extended in a concrete parser by adding all the symbols of the language.
:cvar symbol_table: a dictionary that stores the token classes defined for the language.
:type symbol_table: dict
:cvar token_base_class: the base class for creating language's token classes.
:type token_base_class: Token
:cvar tokenizer: the language tokenizer compiled regexp.
"""
SYMBOLS = frozenset(('(string)', '(float)', '(decimal)', '(integer)', '(name)', '(end)'))
token_base_class = Token
tokenizer = None
symbol_table = {}
def __init__(self):
if self.tokenizer is None:
raise ValueError("The parser %r is not built!" % self.__class__)
self.token = None
self.match = None
self.next_token = None
self.next_match = None
self.tokens = iter(())
self.source = ''
def __eq__(self, other):
if self.token_base_class != other.token_base_class:
return False
elif self.SYMBOLS != other.SYMBOLS:
return False
elif self.symbol_table != other.symbol_table:
return False
else:
return True
def parse(self, source):
"""
        Parses source code of the formal language. This is the main method that has to be
        called on a parser instance.
        :param source: the source string.
        :return: the root of the token tree that parses the source.
"""
try:
self.source = source
self.tokens = iter(self.tokenizer.finditer(source))
self.advance()
root_token = self.expression()
if self.next_token.symbol != '(end)':
self.next_token.unexpected()
return root_token
finally:
self.tokens = iter(())
self.token = None
self.match = None
self.next_token = None
self.next_match = None
def advance(self, *symbols):
"""
The Pratt's function for advancing to next token.
        :param symbols: optional arguments tuple. If not empty, one of the provided \
        symbols is expected. If the next token's symbol differs, the parser raises a \
        syntax error.
:return: The next token instance.
"""
if self.next_token is not None:
if self.next_token.symbol == '(end)':
if self.token is None:
raise ElementPathSyntaxError("source is empty.")
else:
raise ElementPathSyntaxError("unexpected end of source after %s." % self.token)
elif symbols:
self.next_token.expected(*symbols)
self.token = self.next_token
self.match = self.next_match
while True:
try:
self.next_match = next(self.tokens)
except StopIteration:
self.next_token = self.symbol_table['(end)'](self)
break
else:
literal, symbol, name, unexpected = self.next_match.groups()
if symbol is not None:
symbol = symbol.strip()
try:
self.next_token = self.symbol_table[symbol](self)
except KeyError:
raise ElementPathSyntaxError("unknown symbol %r." % symbol)
break
elif literal is not None:
if literal[0] in '\'"':
self.next_token = self.symbol_table['(string)'](self, literal.strip("'\""))
elif 'e' in literal or 'E' in literal:
self.next_token = self.symbol_table['(float)'](self, float(literal))
elif '.' in literal:
self.next_token = self.symbol_table['(decimal)'](self, Decimal(literal))
else:
self.next_token = self.symbol_table['(integer)'](self, int(literal))
break
elif name is not None:
self.next_token = self.symbol_table['(name)'](self, name)
break
elif unexpected is not None:
raise ElementPathSyntaxError(
"unexpected symbol %r at %s." % (unexpected, 'line %d, column %d' % self.position)
)
elif str(self.next_match.group()).strip():
raise RuntimeError(
"Unexpected matching %r: not compatible tokenizer." % self.next_match.group()
)
return self.next_token
def raw_advance(self, *stop_symbols):
"""
        Advances until one of the symbols is found or the end of source is reached,
        returning the raw source text that precedes it. Useful for the raw parsing of
        comments and references enclosed between specific symbols. This is an extension
        provided by this implementation.
        :param stop_symbols: the symbols that stop the advance.
        :return: the source chunk between the initial position and the first stop symbol.
"""
if not stop_symbols:
            raise ElementPathValueError("at least one stop symbol is required!", self.next_token)
elif getattr(self.next_token, 'symbol', None) == '(end)':
if self.token is None:
raise ElementPathSyntaxError("source is empty.", self.next_token)
else:
raise ElementPathSyntaxError("unexpected end of source after %s." % self.token, self.next_token)
self.token = self.next_token
self.match = self.next_match
source_chunk = []
while True:
try:
self.next_match = next(self.tokens)
except StopIteration:
self.next_token = self.symbol_table['(end)'](self)
break
else:
symbol = self.next_match.group(2)
if symbol is not None:
symbol = symbol.strip()
if symbol not in stop_symbols:
source_chunk.append(symbol)
else:
try:
self.next_token = self.symbol_table[symbol](self)
except KeyError:
raise ElementPathSyntaxError("unknown symbol %r." % symbol)
break
else:
source_chunk.append(self.next_match.group())
return ''.join(source_chunk)
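    # A usage sketch (hypothetical delimiters): once an opening '(:' token has
    # been consumed, parser.raw_advance(':)') returns the raw text up to the
    # matching ':)' and leaves the ':)' token as the parser's next_token.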
def expression(self, rbp=0):
"""
        Pratt's function for parsing an expression. It calls token.nud() and then
        advances while the right binding power is less than the left binding power
        of the next token, invoking the led() method on each following token.
        :param rbp: right binding power for the expression.
        :return: left token.
"""
token = self.next_token
self.advance()
left = token.nud()
while rbp < self.next_token.lbp:
token = self.next_token
self.advance()
left = token.led(left)
return left
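    # For example, with the hypothetical infix('+', bp=40) and infix('*', bp=50)
    # registrations sketched above, expression(0) parses '1 + 2 * 3' as
    # (+ (1) (* (2) (3))): after '+' is consumed, expression(40) keeps advancing
    # because 40 < lbp('*') == 50, so '*' binds '2' and '3' first.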
@property
def position(self):
"""Property that returns the current line and column indexes."""
if self.match is None:
return 1, 0
token_index = self.match.span()[0]
line = self.source[:token_index].count('\n') + 1
if line == 1:
return line, token_index + 1
else:
return line, token_index - self.source[:token_index].rindex('\n')
def is_source_start(self):
"""
        Returns `True` if the parser is positioned at the start of the source, ignoring spaces.
"""
if self.match is None:
return True
return not bool(self.source[0:self.match.span()[0]].strip())
def is_line_start(self):
"""
        Returns `True` if the parser is positioned at the start of a source line, ignoring spaces.
"""
if self.match is None:
return True
token_index = self.match.span()[0]
        line_start = self.source[0:token_index].rfind('\n') + 1
return not bool(self.source[line_start:token_index].strip())
def is_spaced(self, before=True, after=True):
"""
        Returns `True` if the source has an extra space (whitespace, tab or newline)
        immediately before or after the current position of the parser.
        :param before: if `True`, also consider the extra spaces before the current token symbol.
        :param after: if `True`, also consider the extra spaces after the current token symbol.
"""
if self.match is None:
return False
start, end = self.match.span()
if before and start > 0 and self.source[start - 1] in ' \t\n':
return True
try:
return after and self.source[end] in ' \t\n'
except IndexError:
return False
@classmethod
def register(cls, symbol, **kwargs):
"""
Register/update a token class in the symbol table.
        :param symbol: the identifier symbol for a new class or an existing token class.
:param kwargs: Optional attributes/methods for the token class.
:return: A token class.
"""
def symbol_escape(s):
s = re.escape(s)
            s = s.replace(r'\ ', r'\s+')
if s.isalpha():
s = r'\b%s\b' % s
elif s[-2:] == r'\(':
s = r'%s\s*%s' % (s[:-2], s[-2:])
elif s[-4:] == r'\:\:':
s = r'%s\s*%s' % (s[:-4], s[-4:])
return s
try:
try:
if ' ' in symbol:
raise ElementPathValueError("%r: a symbol can't contains whitespaces." % symbol)
except TypeError:
assert isinstance(symbol, type) and issubclass(symbol, Token), \
"A %r subclass requested, not %r." % (Token, symbol)
symbol, token_class = symbol.symbol, symbol
assert symbol in cls.symbol_table and cls.symbol_table[symbol] is token_class, \
"Token class %r is not registered." % token_class
else:
token_class = cls.symbol_table[symbol]
except KeyError:
# Register a new symbol and create a new custom class. The new class
            # name is registered at the parser class's module level.
if symbol not in cls.SYMBOLS:
raise ElementPathNameError('%r is not a symbol of the parser %r.' % (symbol, cls))
kwargs['symbol'] = symbol
if 'pattern' not in kwargs:
pattern = symbol_escape(symbol) if len(symbol) > 1 else re.escape(symbol)
kwargs['pattern'] = pattern
label = kwargs.get('label', 'symbol')
if isinstance(label, tuple):
label = kwargs['label'] = MultiLabel(*label)
token_class_name = str("_%s_%s_token" % (symbol_to_identifier(symbol), label))
kwargs.update({
'__module__': cls.__module__,
'__qualname__': token_class_name,
'__return__': None
})
token_class = ABCMeta(token_class_name, (cls.token_base_class,), kwargs)
cls.symbol_table[symbol] = token_class
MutableSequence.register(token_class)
setattr(sys.modules[cls.__module__], token_class_name, token_class)
else:
for key, value in kwargs.items():
if key == 'lbp' and value > token_class.lbp:
token_class.lbp = value
elif key == 'rbp' and value > token_class.rbp:
token_class.rbp = value
elif callable(value):
setattr(token_class, key, value)
return token_class
@classmethod
def unregister(cls, symbol):
"""Unregister a token class from the symbol table."""
del cls.symbol_table[symbol.strip()]
@classmethod
def duplicate(cls, symbol, new_symbol, **kwargs):
"""Duplicate a token class with a new symbol."""
token_class = cls.symbol_table[symbol]
new_token_class = cls.register(new_symbol, **kwargs)
for key, value in token_class.__dict__.items():
if key in kwargs or key in ('symbol', 'pattern') or key.startswith('_'):
continue
setattr(new_token_class, key, value)
return new_token_class
@classmethod
def literal(cls, symbol, bp=0):
"""Register a token for a symbol that represents a *literal*."""
def nud(self):
return self
def evaluate(self, *args, **kwargs):
return self.value
return cls.register(symbol, label='literal', lbp=bp, evaluate=evaluate, nud=nud)
@classmethod
def nullary(cls, symbol, bp=0):
"""Register a token for a symbol that represents a *nullary* operator."""
def nud(self):
return self
return cls.register(symbol, label='operator', lbp=bp, nud=nud)
@classmethod
def prefix(cls, symbol, bp=0):
"""Register a token for a symbol that represents a *prefix* unary operator."""
def nud(self):
self[:] = self.parser.expression(rbp=bp),
return self
return cls.register(symbol, label='operator', lbp=bp, rbp=bp, nud=nud)
@classmethod
def postfix(cls, symbol, bp=0):
"""Register a token for a symbol that represents a *postfix* unary operator."""
def led(self, left):
self[:] = left,
return self
return cls.register(symbol, label='operator', lbp=bp, rbp=bp, led=led)
@classmethod
def infix(cls, symbol, bp=0):
"""Register a token for a symbol that represents an *infix* binary operator."""
def led(self, left):
self[:] = left, self.parser.expression(rbp=bp)
return self
return cls.register(symbol, label='operator', lbp=bp, rbp=bp, led=led)
@classmethod
def infixr(cls, symbol, bp=0):
"""Register a token for a symbol that represents an *infixr* binary operator."""
def led(self, left):
self[:] = left, self.parser.expression(rbp=bp - 1)
return self
return cls.register(symbol, label='operator', lbp=bp, rbp=bp - 1, led=led)
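    # The lbp/rbp asymmetry (rbp=bp-1) makes infixr operators right-associative:
    # e.g. a hypothetical infixr('**', bp=60) parses '2 ** 3 ** 4' as
    # (** (2) (** (3) (4))), because expression(59) still accepts the next '**'.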
@classmethod
def method(cls, symbol, bp=0):
"""
Register a token for a symbol that represents a custom operator or redefine
a method for an existing token.
"""
token_class = cls.register(symbol, label='operator', lbp=bp, rbp=bp)
def bind(func):
assert callable(getattr(token_class, func.__name__, None)), \
"The name %r does not match with a callable of %r." % (func.__name__, token_class)
setattr(token_class, func.__name__, func)
return func
return bind
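    # A usage sketch (assuming a hypothetical '%' symbol declared in the parser's
    # SYMBOLS): the returned decorator rebinds an existing callable, typically
    # nud() or led(), on the registered token class:
    #
    #     @CalcParser.method('%', bp=45)
    #     def led(self, left):
    #         self[:] = left, self.parser.expression(rbp=45)
    #         return self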
@classmethod
def build(cls):
"""
        Builds the parser class. Checks that all declared symbols are defined
        and builds the regex tokenizer using the symbol-related patterns.
"""
if not cls.SYMBOLS.issubset(cls.symbol_table.keys()):
unregistered = [s for s in cls.SYMBOLS if s not in cls.symbol_table]
raise ValueError("The parser %r has unregistered symbols: %r" % (cls, unregistered))
cls.tokenizer = cls.create_tokenizer(cls.symbol_table)
build_tokenizer = build # For backward compatibility
@staticmethod
def create_tokenizer(symbol_table, name_pattern='[A-Za-z0-9_]+'):
"""
        Returns a regex-based tokenizer built from a symbol table of token classes.
        The returned tokenizer skips extra spaces between symbols.
        A regular expression is created from the symbol table of the parser using a
        template. The symbols are inserted in the template with the longer symbols
        first. Symbols and their patterns can't contain spaces.
:param symbol_table: a dictionary containing the token classes of the formal language.
:param name_pattern: pattern to use to match names.
"""
tokenizer_pattern_template = r"""
('[^']*' | "[^"]*" | (?:\d+|\.\d+)(?:\.\d*)?(?:[Ee][+-]?\d+)?) | # Literals (string and numbers)
(%s|[%s]) | # Symbol's patterns
(%s) | # Names
(\S) | # Unexpected characters
\s+ # Skip extra spaces
"""
patterns = [
t.pattern for s, t in symbol_table.items()
if SPECIAL_SYMBOL_PATTERN.match(s) is None
]
string_patterns = []
character_patterns = []
for p in patterns:
if ' ' in p:
raise ElementPathValueError('pattern %r contains spaces' % p)
length = len(p)
            if length == 1 or (length == 2 and p[0] == '\\'):
character_patterns.append(p)
else:
string_patterns.append(p)
pattern = tokenizer_pattern_template % (
'|'.join(sorted(string_patterns, key=lambda x: -len(x))),
''.join(character_patterns),
name_pattern
)
return re.compile(pattern, re.VERBOSE)
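# Each non-space match produced by a built tokenizer exposes four groups, which
# Parser.advance() consumes as (literal, symbol, name, unexpected). A sketch,
# assuming the hypothetical CalcParser above has been built:
#
#     [m.groups() for m in CalcParser.tokenizer.finditer('1 + foo') if m.group().strip()]
#     # -> [('1', None, None, None), (None, '+', None, None), (None, None, 'foo', None)]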