debian-xmlschema/xmlschema/etree.py

378 lines
14 KiB
Python

# -*- coding: utf-8 -*-
#
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato <brunato@sissa.it>
#
"""
This module contains ElementTree setup and helpers for xmlschema package.
"""
from __future__ import unicode_literals
import sys
import importlib
import re
from collections import Counter
try:
import lxml.etree as lxml_etree
except ImportError:
lxml_etree = None
from .compat import PY3
from .exceptions import XMLSchemaTypeError
from .namespaces import XSLT_NAMESPACE, HFP_NAMESPACE, VC_NAMESPACE, get_namespace
from .qnames import get_qname, qname_to_prefixed
###
# Programmatic import of xml.etree.ElementTree
#
# In Python 3 the pure python implementation is overwritten by the C module API,
# so use a programmatic re-import to obtain the pure Python module, necessary for
# defining a safer XMLParser.
#
if not PY3:
    # Python 2.7: nothing has to be done because the module is not overridden
    # by the C implementation.
    ElementTree = PyElementTree = importlib.import_module('xml.etree.ElementTree')
elif '_elementtree' in sys.modules:
    # The C accelerator is already loaded: temporarily remove the loaded modules.
    ElementTree = sys.modules.pop('xml.etree.ElementTree', None)
    _cmod = sys.modules.pop('_elementtree')

    # A None entry in sys.modules makes "import _elementtree" fail, so the
    # re-executed xml.etree.ElementTree keeps its pure Python definitions.
    sys.modules['_elementtree'] = None
    try:
        PyElementTree = importlib.import_module('xml.etree.ElementTree')
    finally:
        # Restore the original modules, even if the import above fails.
        sys.modules['_elementtree'] = _cmod
        if ElementTree is not None:
            sys.modules['xml.etree.ElementTree'] = ElementTree
            # Importing a submodule also rebinds it as an attribute of its
            # parent package: point that attribute back to the original module,
            # otherwise "from xml.etree import ElementTree" executed later
            # would return the pure Python copy.
            _etree_pkg = sys.modules.get('xml.etree')
            if _etree_pkg is not None:
                _etree_pkg.ElementTree = ElementTree

    if ElementTree is None:
        # No previously imported module to restore: use the pure Python one.
        ElementTree = PyElementTree
else:
    # Nothing is loaded yet: load the pure Python module first, hiding the
    # C accelerator behind a None entry.
    sys.modules['_elementtree'] = None
    try:
        PyElementTree = importlib.import_module('xml.etree.ElementTree')
    finally:
        # Always drop the blocking entry, also when the import fails.
        del sys.modules['_elementtree']

    # Remove the pure Python module from the imported modules
    del sys.modules['xml.etree.ElementTree']

    # Load the regular (C optimized, when available) ElementTree module
    ElementTree = importlib.import_module('xml.etree.ElementTree')
# ElementTree APIs
etree_element = ElementTree.Element
etree_register_namespace = ElementTree.register_namespace
ParseError = ElementTree.ParseError

etree_register_namespace('xslt', XSLT_NAMESPACE)
etree_register_namespace('hfp', HFP_NAMESPACE)
etree_register_namespace('vc', VC_NAMESPACE)

# Pure Python ElementTree APIs
py_etree_element = PyElementTree.Element
# Bound to the pure Python module (not to ElementTree), so that the prefixes
# are registered in the module that serializes pure Python elements, in the
# same way etree_tostring() does with PyElementTree.register_namespace.
py_etree_register_namespace = PyElementTree.register_namespace

py_etree_register_namespace('xslt', XSLT_NAMESPACE)
py_etree_register_namespace('hfp', HFP_NAMESPACE)
py_etree_register_namespace('vc', VC_NAMESPACE)
# Lxml APIs: every alias is None when the optional lxml library is missing.
if lxml_etree is None:
    lxml_etree_element = None
    lxml_etree_comment = None
    lxml_etree_register_namespace = None
else:
    lxml_etree_element = lxml_etree.Element
    lxml_etree_comment = lxml_etree.Comment
    lxml_etree_register_namespace = lxml_etree.register_namespace

    # Same prefixes registered for the ElementTree modules.
    lxml_etree_register_namespace('xslt', XSLT_NAMESPACE)
    lxml_etree_register_namespace('hfp', HFP_NAMESPACE)
    lxml_etree_register_namespace('vc', VC_NAMESPACE)
class SafeXMLParser(PyElementTree.XMLParser):
    """
    A pure Python XMLParser that rejects entity declarations and external
    entity references, raising a ParseError as soon as one of them is found.
    The deprecated *html* argument of the base class is not accepted.

    :param target: the target object called by the `feed()` method of the parser, \
    that defaults to `TreeBuilder`.
    :param encoding: if provided, its value overrides the encoding specified in the XML file.
    """
    def __init__(self, target=None, encoding=None):
        super(SafeXMLParser, self).__init__(target=target, encoding=encoding)

        # The low-level parser is read from a different attribute on Python 2.
        if PY3:
            low_level_parser = self.parser
        else:
            low_level_parser = self._parser

        # Install the handlers that abort the parsing.
        low_level_parser.EntityDeclHandler = self.entity_declaration
        low_level_parser.UnparsedEntityDeclHandler = self.unparsed_entity_declaration
        low_level_parser.ExternalEntityRefHandler = self.external_entity_reference

    def entity_declaration(self, entity_name, is_parameter_entity, value, base, system_id, public_id, notation_name):
        """Handler for entity declarations: always raises a ParseError."""
        message = "Entities are forbidden (entity_name={!r})".format(entity_name)
        raise PyElementTree.ParseError(message)

    def unparsed_entity_declaration(self, entity_name, base, system_id, public_id, notation_name):
        """Handler for unparsed entity declarations: always raises a ParseError."""
        message = "Entities are forbidden (entity_name={!r})".format(entity_name)
        raise PyElementTree.ParseError(message)

    def external_entity_reference(self, context, base, system_id, public_id):
        """Handler for external entity references: always raises a ParseError."""
        message = "External references are forbidden (system_id={!r}, public_id={!r})".format(system_id, public_id)
        raise PyElementTree.ParseError(message)
def etree_tostring(elem, namespaces=None, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False):
    """
    Serialize an Element tree to a string. Tab characters are replaced by whitespaces.

    :param elem: the Element instance.
    :param namespaces: is an optional mapping from namespace prefix to URI. Provided namespaces are \
    registered before serialization (prefixes like 'ns0', 'ns1', ... are skipped).
    :param indent: the base line indentation.
    :param max_lines: if provided, truncates the serialization after this number of lines \
    (default: do not truncate).
    :param spaces_for_tab: number of spaces for replacing tab characters (default is 4).
    :param xml_declaration: if set to `True` inserts the XML declaration at the head.
    :return: a Unicode string.
    :raise: XMLSchemaTypeError if *elem* is not an ElementTree element and the lxml \
    library is not available.
    """
    def leading_spaces(line):
        # Number of space characters before the first non-space character.
        return len(line) - len(line.lstrip(' '))

    def reindent(line):
        # Uses min_indent and start, computed below before the first call.
        if not line:
            return line
        elif line.startswith(min_indent):
            return line[start:] if start >= 0 else indent[start:] + line
        else:
            return indent + line

    # Select the serializer matching the element implementation and register
    # the provided prefixes in the corresponding module.
    if isinstance(elem, etree_element):
        if namespaces:
            for prefix, uri in namespaces.items():
                if not re.match(r'ns\d+$', prefix):
                    etree_register_namespace(prefix, uri)
        tostring = ElementTree.tostring

    elif isinstance(elem, py_etree_element):
        if namespaces:
            for prefix, uri in namespaces.items():
                if not re.match(r'ns\d+$', prefix):
                    PyElementTree.register_namespace(prefix, uri)
        tostring = PyElementTree.tostring

    elif lxml_etree is not None:
        if namespaces:
            for prefix, uri in namespaces.items():
                if prefix and not re.match(r'ns\d+$', prefix):
                    lxml_etree_register_namespace(prefix, uri)
        tostring = lxml_etree.tostring
    else:
        raise XMLSchemaTypeError("cannot serialize %r: lxml library not available." % type(elem))

    if PY3:
        xml_text = tostring(elem, encoding="unicode").replace('\t', ' ' * spaces_for_tab)
    else:
        xml_text = unicode(tostring(elem)).replace('\t', ' ' * spaces_for_tab)

    lines = ['<?xml version="1.0" encoding="UTF-8"?>'] if xml_declaration else []
    lines.extend(xml_text.splitlines())
    while lines and not lines[-1].strip():
        lines.pop(-1)

    last_indent = ' ' * leading_spaces(lines[-1])

    # Blank interior lines carry no indentation information: ignore them, so
    # that a tree whose interior lines are all blank doesn't raise a ValueError
    # from min() on an empty sequence.
    inner_indents = [leading_spaces(line) for line in lines[1:-1] if line.strip(' ')]
    if inner_indents:
        child_indent = ' ' * min(inner_indents)
        min_indent = min(child_indent, last_indent)
    else:
        min_indent = child_indent = last_indent

    start = len(min_indent) - len(indent)

    if max_lines is not None and len(lines) > max_lines + 2:
        # Keep the head and the last line, marking the cut with two '...' lines.
        lines = lines[:max_lines] + [child_indent + '...'] * 2 + lines[-1:]

    return '\n'.join(reindent(line) for line in lines)
def etree_iterpath(elem, tag=None, path='.', namespaces=None, add_position=False):
    """
    Creates an iterator for the element and its subelements that yield elements and paths.
    If tag is not `None` or '*', only elements whose tag matches are returned from the iterator.

    :param elem: the element to iterate.
    :param tag: tag filtering.
    :param path: the current path, '.' for default.
    :param namespaces: is an optional mapping from namespace prefix to URI. If provided \
    the tags are converted to prefixed names in the paths.
    :param add_position: add context position to child elements that appear multiple times. \
    The option is applied at every level of the tree.
    """
    if tag == "*":
        tag = None
    if tag is None or elem.tag == tag:
        yield elem, path

    if add_position:
        # Counter of the next position (1-based) for each tag that is repeated
        # among the children of this element.
        children_tags = Counter(e.tag for e in elem)
        positions = Counter(t for t in children_tags if children_tags[t] > 1)
    else:
        positions = ()

    for child in elem:
        if callable(child.tag):
            continue  # Skip lxml comments

        child_name = child.tag if namespaces is None else qname_to_prefixed(child.tag, namespaces)
        if path == '/':
            child_path = '/%s' % child_name
        elif path:
            child_path = '/'.join((path, child_name))
        else:
            child_path = child_name

        if child.tag in positions:
            child_path += '[%d]' % positions[child.tag]
            positions[child.tag] += 1

        # Propagate add_position, otherwise repeated elements below the first
        # level would get ambiguous paths.
        for _child, _child_path in etree_iterpath(child, tag, child_path, namespaces, add_position):
            yield _child, _child_path
def etree_getpath(elem, root, namespaces=None, relative=True, add_position=False):
    """
    Builds the XPath path that leads from *root* to its descendant *elem*.

    :param elem: the descendant element.
    :param root: the root element.
    :param namespaces: is an optional mapping from namespace prefix to URI.
    :param relative: if `True` the path starts with '.', otherwise it starts \
    with '/' followed by the name of the root element.
    :param add_position: add context position to child elements that appear multiple times.
    :return: An XPath expression or `None` if *elem* is not a descendant of *root*.
    """
    if relative:
        start_path = '.'
    else:
        # Absolute path: use the prefixed name of the root when a namespace map is given.
        root_name = qname_to_prefixed(root.tag, namespaces) if namespaces else root.tag
        start_path = '/%s' % root_name

    # Only the elements with the same tag of elem are candidates: match by identity.
    for node, node_path in etree_iterpath(root, elem.tag, start_path, namespaces, add_position):
        if node is elem:
            return node_path
    return None
def etree_elements_assert_equal(elem, other, strict=True, skip_comments=True):
    """
    Tests the equality of two XML Element trees.

    :param elem: the master Element tree, reference for namespace mapping.
    :param other: the other Element tree that has to be compared.
    :param strict: asserts strictly equality. `True` for default. If `False` \
    surrounding whitespace is ignored for attributes, texts and tails, numeric \
    values are compared as floats and tags of non-root elements are compared \
    after completing unqualified names with the last namespace found in the \
    master tree.
    :param skip_comments: if `True` lxml comment nodes of the master tree are not \
    compared and lxml comment children are excluded from the children counts.
    :raise: an AssertionError containing information about first difference encountered.
    """
    _REGEX_SPACES = re.compile(r'\s+')

    def same_number(value1, value2):
        # True only if both strings are numbers that have the same float value.
        try:
            return float(value1) == float(value2)
        except (ValueError, TypeError):
            return False

    # Explicit raises are used instead of assert statements, so the checks are
    # still performed when Python runs with the -O option.
    other_elements = iter(other.iter())
    namespace = ''
    for e1 in elem.iter():
        if skip_comments and e1.tag is lxml_etree_comment:
            continue

        e2 = next(other_elements, None)
        if e2 is None:
            raise AssertionError("Second tree ends before the first: %r." % e1)

        if strict or e1 is elem:
            if e1.tag != e2.tag:
                raise AssertionError("%r != %r: tags differ." % (e1, e2))
        else:
            namespace = get_namespace(e1.tag) or namespace
            # Compare the tag of the master tree with the tag of the other tree
            # (not with itself).
            if get_qname(namespace, e1.tag) != get_qname(namespace, e2.tag):
                raise AssertionError("%r != %r: tags differ." % (e1, e2))

        # Attributes
        if e1.attrib != e2.attrib:
            if strict:
                raise AssertionError("%r != %r: attribute differ: %r != %r." % (e1, e2, e1.attrib, e2.attrib))
            if sorted(e1.attrib.keys()) != sorted(e2.attrib.keys()):
                raise AssertionError(
                    "%r != %r: attribute keys differ: %r != %r." % (e1, e2, e1.attrib.keys(), e2.attrib.keys())
                )
            for k in e1.attrib:
                a1, a2 = e1.attrib[k].strip(), e2.attrib[k].strip()
                if a1 != a2 and not same_number(a1, a2):
                    raise AssertionError(
                        "%r != %r: attribute %r differ: %r != %r." % (e1, e2, k, a1, a2)
                    )

        # Number of children
        if skip_comments:
            nc1 = len([c for c in e1 if c.tag is not lxml_etree_comment])
            nc2 = len([c for c in e2 if c.tag is not lxml_etree_comment])
        else:
            nc1 = len(e1)
            nc2 = len(e2)
        if nc1 != nc2:
            raise AssertionError("%r != %r: children number differ: %r != %r." % (e1, e2, nc1, nc2))

        # Text
        if e1.text != e2.text:
            message = "%r != %r: texts differ: %r != %r." % (e1, e2, e1.text, e2.text)
            if strict:
                raise AssertionError(message)
            elif e1.text is None:
                if e2.text.strip():
                    raise AssertionError(message)
            elif e2.text is None:
                if e1.text.strip():
                    raise AssertionError(message)
            else:
                text1, text2 = e1.text.strip(), e2.text.strip()
                # Pattern.sub() takes the replacement first and then the string:
                # compare the texts with all the whitespace removed.
                if _REGEX_SPACES.sub('', text1) != _REGEX_SPACES.sub('', text2) \
                        and not same_number(text1, text2):
                    raise AssertionError(message)

        # Tail
        if e1.tail != e2.tail:
            message = "%r != %r: tails differ: %r != %r." % (e1, e2, e1.tail, e2.tail)
            if strict:
                raise AssertionError(message)
            elif e1.tail is None:
                if e2.tail.strip():
                    raise AssertionError(message)
            elif e2.tail is None:
                if e1.tail.strip():
                    raise AssertionError(message)
            elif e1.tail.strip() != e2.tail.strip():
                raise AssertionError(message)

    # The other tree must not have further elements.
    e2 = next(other_elements, None)
    if e2 is not None:
        raise AssertionError("First tree ends before the second: %r." % e2)
def prune_etree(root, selector):
    """
    Removes from a tree structure the elements accepted by the selector
    function. At each visited node the matching children are removed first,
    then the visit continues on the remaining children. If the root node
    itself is accepted then all its children are removed.

    :param root: the root element of the tree.
    :param selector: the single argument function to apply on each visited node.
    :return: `True` if the root node verify the selector function, `None` otherwise.
    """
    if selector(root):
        # The root matches: empty it and report the match to the caller.
        del root[:]
        return True

    def _prune_children(node):
        # Iterate over a copy, because the children are removed during the loop.
        for item in list(node):
            if selector(item):
                node.remove(item)

        for item in node:
            _prune_children(item)

    _prune_children(root)