# -*- coding: utf-8 -*- # # Copyright (c), 2016-2019, SISSA (International School for Advanced Studies). # All rights reserved. # This file is distributed under the terms of the MIT License. # See the file 'LICENSE' in the root directory of the present # distribution, or http://opensource.org/licenses/MIT. # # @author Davide Brunato # """ This module contains ElementTree setup and helpers for xmlschema package. """ from __future__ import unicode_literals import sys import importlib import re from collections import Counter try: import lxml.etree as lxml_etree except ImportError: lxml_etree = None from .compat import PY3 from .exceptions import XMLSchemaTypeError from .namespaces import XSLT_NAMESPACE, HFP_NAMESPACE, VC_NAMESPACE, get_namespace from .qnames import get_qname, qname_to_prefixed ### # Programmatic import of xml.etree.ElementTree # # In Python 3 the pure python implementation is overwritten by the C module API, # so use a programmatic re-import to obtain the pure Python module, necessary for # defining a safer XMLParser. # if not PY3: # Python 2.7: nothing have to be done because it's not overridden by C implementation ElementTree = PyElementTree = importlib.import_module('xml.etree.ElementTree') elif '_elementtree' in sys.modules: # Temporary remove the loaded modules ElementTree = sys.modules.pop('xml.etree.ElementTree', None) _cmod = sys.modules.pop('_elementtree') # Load the pure Python module sys.modules['_elementtree'] = None PyElementTree = importlib.import_module('xml.etree.ElementTree') # Restore original modules sys.modules['_elementtree'] = _cmod if ElementTree is not None: sys.modules['xml.etree.ElementTree'] = ElementTree else: ElementTree = PyElementTree else: # Load the pure Python module sys.modules['_elementtree'] = None PyElementTree = importlib.import_module('xml.etree.ElementTree') # Remove the pure Python module from imported modules del sys.modules['xml.etree.ElementTree'] del sys.modules['_elementtree'] # Load the C optimized ElementTree module ElementTree = importlib.import_module('xml.etree.ElementTree') # ElementTree APIs etree_element = ElementTree.Element etree_register_namespace = ElementTree.register_namespace ParseError = ElementTree.ParseError etree_register_namespace('xslt', XSLT_NAMESPACE) etree_register_namespace('hfp', HFP_NAMESPACE) etree_register_namespace('vc', VC_NAMESPACE) # Pure Python ElementTree APIs py_etree_element = PyElementTree.Element py_etree_register_namespace = ElementTree.register_namespace py_etree_register_namespace('xslt', XSLT_NAMESPACE) py_etree_register_namespace('hfp', HFP_NAMESPACE) py_etree_register_namespace('vc', VC_NAMESPACE) # Lxml APIs if lxml_etree is not None: lxml_etree_element = lxml_etree.Element lxml_etree_comment = lxml_etree.Comment lxml_etree_register_namespace = lxml_etree.register_namespace lxml_etree_register_namespace('xslt', XSLT_NAMESPACE) lxml_etree_register_namespace('hfp', HFP_NAMESPACE) lxml_etree_register_namespace('vc', VC_NAMESPACE) else: lxml_etree_element = None lxml_etree_comment = None lxml_etree_register_namespace = None class SafeXMLParser(PyElementTree.XMLParser): """ An XMLParser that forbids entities processing. Drops the *html* argument that is deprecated since version 3.4. :param target: the target object called by the `feed()` method of the parser, \ that defaults to `TreeBuilder`. :param encoding: if provided, its value overrides the encoding specified in the XML file. """ def __init__(self, target=None, encoding=None): super(SafeXMLParser, self).__init__(target=target, encoding=encoding) parser = self.parser if PY3 else self._parser parser.EntityDeclHandler = self.entity_declaration parser.UnparsedEntityDeclHandler = self.unparsed_entity_declaration parser.ExternalEntityRefHandler = self.external_entity_reference def entity_declaration(self, entity_name, is_parameter_entity, value, base, system_id, public_id, notation_name): raise PyElementTree.ParseError("Entities are forbidden (entity_name={!r})".format(entity_name)) def unparsed_entity_declaration(self, entity_name, base, system_id, public_id, notation_name): raise PyElementTree.ParseError("Entities are forbidden (entity_name={!r})".format(entity_name)) def external_entity_reference(self, context, base, system_id, public_id): raise PyElementTree.ParseError( "External references are forbidden (system_id={!r}, public_id={!r})".format(system_id, public_id) ) def etree_tostring(elem, namespaces=None, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False): """ Serialize an Element tree to a string. Tab characters are replaced by whitespaces. :param elem: the Element instance. :param namespaces: is an optional mapping from namespace prefix to URI. Provided namespaces are \ registered before serialization. :param indent: the base line indentation. :param max_lines: if truncate serialization after a number of lines (default: do not truncate). :param spaces_for_tab: number of spaces for replacing tab characters (default is 4). :param xml_declaration: if set to `True` inserts the XML declaration at the head. :return: a Unicode string. """ def reindent(line): if not line: return line elif line.startswith(min_indent): return line[start:] if start >= 0 else indent[start:] + line else: return indent + line if isinstance(elem, etree_element): if namespaces: for prefix, uri in namespaces.items(): if not re.match(r'ns\d+$', prefix): etree_register_namespace(prefix, uri) tostring = ElementTree.tostring elif isinstance(elem, py_etree_element): if namespaces: for prefix, uri in namespaces.items(): if not re.match(r'ns\d+$', prefix): PyElementTree.register_namespace(prefix, uri) tostring = PyElementTree.tostring elif lxml_etree is not None: if namespaces: for prefix, uri in namespaces.items(): if prefix and not re.match(r'ns\d+$', prefix): lxml_etree_register_namespace(prefix, uri) tostring = lxml_etree.tostring else: raise XMLSchemaTypeError("cannot serialize %r: lxml library not available." % type(elem)) if PY3: xml_text = tostring(elem, encoding="unicode").replace('\t', ' ' * spaces_for_tab) else: xml_text = unicode(tostring(elem)).replace('\t', ' ' * spaces_for_tab) lines = [''] if xml_declaration else [] lines.extend(xml_text.splitlines()) while lines and not lines[-1].strip(): lines.pop(-1) last_indent = ' ' * min(k for k in range(len(lines[-1])) if lines[-1][k] != ' ') if len(lines) > 2: child_indent = ' ' * min(k for line in lines[1:-1] for k in range(len(line)) if line[k] != ' ') min_indent = min(child_indent, last_indent) else: min_indent = child_indent = last_indent start = len(min_indent) - len(indent) if max_lines is not None and len(lines) > max_lines + 2: lines = lines[:max_lines] + [child_indent + '...'] * 2 + lines[-1:] return '\n'.join(reindent(line) for line in lines) def etree_iterpath(elem, tag=None, path='.', namespaces=None, add_position=False): """ Creates an iterator for the element and its subelements that yield elements and paths. If tag is not `None` or '*', only elements whose matches tag are returned from the iterator. :param elem: the element to iterate. :param tag: tag filtering. :param path: the current path, '.' for default. :param add_position: add context position to child elements that appear multiple times. :param namespaces: is an optional mapping from namespace prefix to URI. """ if tag == "*": tag = None if tag is None or elem.tag == tag: yield elem, path if add_position: children_tags = Counter([e.tag for e in elem]) positions = Counter([t for t in children_tags if children_tags[t] > 1]) else: positions = () for child in elem: if callable(child.tag): continue # Skip lxml comments child_name = child.tag if namespaces is None else qname_to_prefixed(child.tag, namespaces) if path == '/': child_path = '/%s' % child_name elif path: child_path = '/'.join((path, child_name)) else: child_path = child_name if child.tag in positions: child_path += '[%d]' % positions[child.tag] positions[child.tag] += 1 for _child, _child_path in etree_iterpath(child, tag, child_path, namespaces): yield _child, _child_path def etree_getpath(elem, root, namespaces=None, relative=True, add_position=False): """ Returns the XPath path from *root* to descendant *elem* element. :param elem: the descendant element. :param root: the root element. :param namespaces: is an optional mapping from namespace prefix to URI. :param relative: returns a relative path. :param add_position: add context position to child elements that appear multiple times. :return: An XPath expression or `None` if *elem* is not a descendant of *root*. """ if relative: path = '.' elif namespaces: path = '/%s' % qname_to_prefixed(root.tag, namespaces) else: path = '/%s' % root.tag for e, path in etree_iterpath(root, elem.tag, path, namespaces, add_position): if e is elem: return path def etree_elements_assert_equal(elem, other, strict=True, skip_comments=True): """ Tests the equality of two XML Element trees. :param elem: the master Element tree, reference for namespace mapping. :param other: the other Element tree that has to be compared. :param strict: asserts strictly equality. `True` for default. :param skip_comments: Skip comments for e :raise: an AssertionError containing information about first difference encountered. """ _REGEX_SPACES = re.compile(r'\s+') other_elements = iter(other.iter()) namespace = '' for e1 in elem.iter(): if skip_comments and e1.tag is lxml_etree_comment: continue try: e2 = next(other_elements) except StopIteration: assert False, "Second tree ends before the first: %r." % e1 if strict or e1 is elem: assert e1.tag == e2.tag, "%r != %r: tags differ." % (e1, e2) else: namespace = get_namespace(e1.tag) or namespace assert get_qname(namespace, e1.tag) == get_qname(namespace, e1.tag), "%r != %r: tags differ." % (e1, e2) # Attributes if e1.attrib != e2.attrib: if strict: raise AssertionError("%r != %r: attribute differ: %r != %r." % (e1, e2, e1.attrib, e2.attrib)) else: assert sorted(e1.attrib.keys()) == sorted(e2.attrib.keys()), \ "%r != %r: attribute keys differ: %r != %r." % (e1, e2, e1.attrib.keys(), e2.attrib.keys()) for k in e1.attrib: a1, a2 = e1.attrib[k].strip(), e2.attrib[k].strip() if a1 != a2: try: assert float(a1) == float(a2) except (AssertionError, ValueError, TypeError): raise AssertionError( "%r != %r: attribute %r differ: %r != %r." % (e1, e2, k, a1, a2) ) # Number of children if skip_comments: nc1 = len([c for c in e1 if c.tag is not lxml_etree_comment]) nc2 = len([c for c in e2 if c.tag is not lxml_etree_comment]) else: nc1 = len(e1) nc2 = len(e2) assert nc1 == nc2, "%r != %r: children number differ: %r != %r." % (e1, e2, nc1, nc2) # Text if e1.text != e2.text: message = "%r != %r: texts differ: %r != %r." % (e1, e2, e1.text, e2.text) if strict: raise AssertionError(message) elif e1.text is None: assert not e2.text.strip(), message elif e2.text is None: assert not e1.text.strip(), message elif _REGEX_SPACES.sub(e1.text.strip(), '') != _REGEX_SPACES.sub(e2.text.strip(), ''): try: assert float(e1.text.strip()) == float(e2.text.strip()) except (AssertionError, ValueError, TypeError): raise AssertionError(message) # Tail if e1.tail != e2.tail: message = "%r != %r: tails differ: %r != %r." % (e1, e2, e1.tail, e2.tail) if strict: raise AssertionError(message) elif e1.tail is None: assert not e2.tail.strip(), message elif e2.text is None: assert not e1.tail.strip(), message else: assert e1.tail.strip() == e2.tail.strip(), message try: e2 = next(other_elements) except StopIteration: pass else: assert False, "First tree ends before the second: %r." % e2 def prune_etree(root, selector): """ Removes from an tree structure the elements that verify the selector function. The checking and eventual removals are performed using a breadth-first visit method. :param root: the root element of the tree. :param selector: the single argument function to apply on each visited node. :return: `True` if the root node verify the selector function, `None` otherwise. """ def _prune_subtree(elem): for child in elem[:]: if selector(child): elem.remove(child) for child in elem: _prune_subtree(child) if selector(root): del root[:] return True _prune_subtree(root)