debian-xmlschema/xmlschema/resources.py

695 lines
28 KiB
Python

# -*- coding: utf-8 -*-
#
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato <brunato@sissa.it>
#
import os.path
import re
import codecs
from elementpath import iter_select, Selector
from .compat import (
PY3, StringIO, BytesIO, string_base_type, urlopen, urlsplit, urljoin, urlunsplit,
pathname2url, URLError, uses_relative
)
from .exceptions import XMLSchemaTypeError, XMLSchemaValueError, XMLSchemaURLError, XMLSchemaOSError
from .namespaces import get_namespace
from .qnames import XSI_SCHEMA_LOCATION, XSI_NONS_SCHEMA_LOCATION
from .etree import ElementTree, PyElementTree, SafeXMLParser, etree_tostring
# Accepted values for the 'defuse' option of XMLResource: any other value
# assigned to that attribute is rejected by XMLResource.__setattr__.
DEFUSE_MODES = ('always', 'remote', 'never')
def is_remote_url(url):
    """
    Tells if an URL refers to a non-local resource.

    :param url: an URL string or `None`.
    :return: `False` if the argument is `None` or if its scheme is empty or 'file', \
    `True` otherwise.
    """
    if url is None:
        return False
    return urlsplit(url).scheme not in ('', 'file')
def url_path_is_directory(url):
    """Returns `True` if the path component of the URL is an existing directory."""
    path = urlsplit(url).path
    return os.path.isdir(path)
def url_path_is_file(url):
    """Returns `True` if the path component of the URL is an existing regular file."""
    path = urlsplit(url).path
    return os.path.isfile(path)
def normalize_url(url, base_url=None, keep_relative=False):
    """
    Builds a normalized URL, optionally joining it with a base URL. The scheme defaults
    to 'file' and backslashes are converted to slashes. When both the base URL and the
    URL are local, paths are joined with os.path.join instead of urljoin.

    :param url: a relative or absolute URL.
    :param base_url: the reference base URL used to build the normalized URL. For \
    compatibility between "os.path.join" and "urljoin" a trailing '/' is appended to \
    non-empty paths.
    :param keep_relative: if set to `True` relative file paths are kept as they are, \
    even if the result is not strictly conformant to the URL format specification.
    :return: A normalized URL.
    """
    def with_trailing_slash(parts):
        # Rebuild the URL making sure that a non-empty path ends with a slash.
        path = parts[2]
        if path and path[-1] != '/':
            path += '/'
        return urlunsplit((parts[0], parts[1], path, parts[3], parts[4]))

    if base_url is not None:
        base_url = base_url.replace('\\', '/')
        if base_url.startswith('//'):
            # Collapse a run of leading slashes into a single one
            base_url = '/' + base_url.lstrip('/')

        base_parts = urlsplit(base_url)
        base_url = with_trailing_slash(base_parts)
        if base_parts.scheme in uses_relative:
            base_parts = urlsplit(base_url)
        else:
            # Unknown scheme, e.g. a drive letter: treat the whole base as a file path
            base_parts = urlsplit('file:///{}'.format(base_url))

        local_schemes = ('', 'file')
        if base_parts.scheme not in local_schemes or urlsplit(url).scheme not in local_schemes:
            url = urljoin(base_url, url)
        else:
            target_parts = urlsplit(url)
            if not target_parts.netloc or base_parts.netloc == target_parts.netloc:
                # Join paths only if host parts (netloc) are equal, using the os.path.join
                # instead of urljoin for path normalization.
                joined_path = os.path.normpath(os.path.join(base_parts.path, target_parts.path))
                url = urlunsplit((
                    '', base_parts.netloc, joined_path, target_parts.query, target_parts.fragment
                ))

                # Add 'file' scheme if '//' prefix is added
                if base_parts.netloc and not url.startswith(base_parts.netloc) \
                        and url.startswith('//'):
                    url = 'file:' + url

    url = url.replace('\\', '/')
    if url.startswith('//'):
        # Collapse a run of leading slashes into a single one
        url = '/' + url.lstrip('/')

    parts = urlsplit(url, scheme='file')
    scheme = parts.scheme

    if scheme not in uses_relative:
        return 'file:///{}'.format(parts.geturl())  # Eg. k:/Python/lib/....

    if scheme != 'file':
        return urlunsplit((
            scheme, parts.netloc, pathname2url(parts.path), parts.query, parts.fragment
        ))

    if os.path.isabs(parts.path):
        return parts.geturl()

    if keep_relative:
        # Can't use urlunsplit with a scheme because it converts relative paths to absolute ones.
        return 'file:{}'.format(urlunsplit(('',) + parts[1:]))

    return urlunsplit((
        scheme, parts.netloc, os.path.abspath(parts.path), parts.query, parts.fragment
    ))
def fetch_resource(location, base_url=None, timeout=30):
    """
    Fetches a resource by trying to access it. If the resource is accessible
    returns its URL, otherwise raises an error (XMLSchemaURLError).

    :param location: an URL or a file path.
    :param base_url: reference base URL for normalizing local and relative URLs.
    :param timeout: the timeout in seconds for the connection attempt in case of remote data.
    :return: a normalized URL.
    """
    if not location:
        raise XMLSchemaValueError("'location' argument must contains a not empty string.")

    url = normalize_url(location, base_url)
    try:
        resource = urlopen(url, timeout=timeout)
    except URLError as err:
        # Fallback: retry with the location normalized without the base URL. If this
        # attempt fails too, the reason of the first failure is the one reported.
        url = normalize_url(location)
        try:
            resource = urlopen(url, timeout=timeout)
        except URLError:
            raise XMLSchemaURLError(reason=err.reason)

    # The resource has been opened only for checking its accessibility
    resource.close()
    return url
def fetch_schema_locations(source, locations=None, **resource_options):
    """
    Fetches the schema URL for the root's namespace of an XML data source, together
    with the list of location hints. Raises a ValueError if no accessible schema
    location is found.

    :param source: an Element or an Element Tree with XML data or an URL or a file-like object.
    :param locations: a dictionary or dictionary items with schema location hints.
    :param resource_options: keyword arguments for providing :class:`XMLResource` class init options.
    :return: A tuple with the URL referring to the first reachable schema resource and a list \
    of dictionary items with normalized location hints.
    """
    base_url = resource_options.pop('base_url', None)
    timeout = resource_options.pop('timeout', 30)

    if isinstance(source, XMLResource):
        resource = source
    else:
        resource = XMLResource(source, base_url, timeout=timeout, **resource_options)

    base_url = resource.base_url
    namespace = resource.namespace
    locations = resource.get_locations(locations)

    # Only the hints declared for the namespace of the root element are tried
    for ns, url in locations:
        if ns != namespace:
            continue
        try:
            return fetch_resource(url, base_url, timeout), locations
        except XMLSchemaURLError:
            pass  # Not reachable: try the next hint

    raise XMLSchemaValueError("not found a schema for XML data resource %r (namespace=%r)." % (source, namespace))
def fetch_schema(source, locations=None, **resource_options):
    """
    Fetches the schema URL for the root's namespace of an XML data source.
    Raises a ValueError if no accessible schema location is found.

    :param source: an Element or an Element Tree with XML data or an URL or a file-like object.
    :param locations: a dictionary or dictionary items with schema location hints.
    :param resource_options: keyword arguments for providing :class:`XMLResource` class init options.
    :return: An URL referring to a reachable schema resource.
    """
    # The first item of the result is the schema URL, the second one the location hints
    schema_url = fetch_schema_locations(source, locations, **resource_options)[0]
    return schema_url
def fetch_namespaces(source, **resource_options):
    """
    Extracts the namespaces, with their related prefixes, from an XML data source.
    The source is wrapped into an :class:`XMLResource` instance, using a default
    timeout of 30 seconds, and the result of its *get_namespaces()* is returned.

    :param source: a string containing the XML document or file path or an URL \
    or a file-like object or an ElementTree or Element.
    :param resource_options: keyword arguments for providing :class:`XMLResource` init options.
    :return: A dictionary for mapping namespace prefixes to full URI.
    """
    resource_options.setdefault('timeout', 30)
    resource = XMLResource(source, **resource_options)
    return resource.get_namespaces()
def load_xml_resource(source, element_only=True, **resource_options):
    """
    Loads an XML data source into an Element tree, returning the root Element and, on
    request, also the XML text and the URL, if available. Usable for XML data files of
    small or medium sizes, as XSD schemas.

    :param source: an URL, a filename path or a file-like object.
    :param element_only: if True the function returns only the root Element of the tree.
    :param resource_options: keyword arguments for providing :class:`XMLResource` init options.
    :return: a tuple with three items (root Element, XML text and XML URL) or \
    only the root Element if 'element_only' argument is True.
    """
    # Unless differently requested the resource is fully loaded (not lazy)
    resource_options.setdefault('lazy', False)
    resource = XMLResource(source, **resource_options)
    if element_only:
        return resource.root

    resource.load()
    return resource.root, resource.text, resource.url
class XMLResource(object):
    """
    XML resource reader based on ElementTree and urllib.

    :param source: a string containing the XML document or file path or an URL or a file-like \
    object or an ElementTree or an Element.
    :param base_url: is an optional base URL, used for the normalization of relative paths when \
    the URL of the resource can't be obtained from the source argument.
    :param defuse: set the usage of SafeXMLParser for XML data. Can be 'always', 'remote' or 'never'. \
    Default is 'remote' that uses the safe parser only when loading remote data.
    :param timeout: the timeout in seconds for the connection attempt in case of remote data.
    :param lazy: if set to `False` the source is fully loaded into and processed from memory. \
    Default is `True` that means that only the root element of the source is loaded. This is \
    ignored if *source* is an Element or an ElementTree.
    """
    def __init__(self, source, base_url=None, defuse='remote', timeout=300, lazy=True):
        if base_url is not None and not isinstance(base_url, string_base_type):
            raise XMLSchemaValueError(u"'base_url' argument has to be a string: {!r}".format(base_url))

        self._root = self._document = self._url = self._text = None
        self._base_url = base_url
        self.defuse = defuse
        self.timeout = timeout
        self._lazy = lazy
        # Must be the last assignment: setting 'source' triggers the parsing of the
        # source (see __setattr__), that depends on the attributes set above.
        self.source = source

    def __str__(self):
        # Python 2 only: replaced by __unicode__ below when running on Python 3
        # noinspection PyCompatibility,PyUnresolvedReferences
        return unicode(self).encode("utf-8")

    def __unicode__(self):
        return self.__repr__()

    if PY3:
        __str__ = __unicode__

    def __repr__(self):
        if self._root is None:
            return u'%s()' % self.__class__.__name__
        elif self._url is None:
            return u'%s(tag=%r)' % (self.__class__.__name__, self._root.tag)
        else:
            return u'%s(tag=%r, basename=%r)' % (
                self.__class__.__name__, self._root.tag, os.path.basename(self._url)
            )

    def __setattr__(self, name, value):
        """
        Validates the public options and, when 'source' is assigned, (re)builds
        the root, document, text and URL of the resource from the new source.

        :raises XMLSchemaValueError: for an invalid 'defuse', 'timeout' or 'lazy' value.
        """
        if name == 'source':
            self._root, self._document, self._text, self._url = self._fromsource(value)
        elif name == 'defuse' and value not in DEFUSE_MODES:
            raise XMLSchemaValueError(u"'defuse' attribute: {!r} is not a defuse mode.".format(value))
        elif name == 'timeout' and (not isinstance(value, int) or value <= 0):
            raise XMLSchemaValueError(u"'timeout' attribute must be a positive integer: {!r}".format(value))
        elif name == 'lazy' and not isinstance(value, bool):
            # NOTE: __init__ stores the option as '_lazy', so this check is applied
            # only to explicit assignments to a 'lazy' attribute.
            raise XMLSchemaValueError(u"'lazy' attribute must be a boolean: {!r}".format(value))
        super(XMLResource, self).__setattr__(name, value)

    def _fromsource(self, source):
        """
        Builds the resource data from a source.

        :param source: an Element, a string (XML text or URL/path), a StringIO, another \
        file-like object or an ElementTree.
        :return: a 4-tuple (root, document, text, url), where the items that are \
        not available for the kind of source are `None`.
        :raises XMLSchemaTypeError: if the source is not of a usable type.
        """
        url, lazy = None, self._lazy
        if hasattr(source, 'tag'):
            self._lazy = False
            return source, None, None, None  # Source is already an Element --> nothing to load
        elif isinstance(source, string_base_type):
            # During the parsing self._url is temporarily cleared, because the
            # 'remote' defuse mode is decided on it (text data is not remote).
            _url, self._url = self._url, None
            try:
                if lazy:
                    # check if source is a string containing a valid XML root
                    for _, root in self.iterparse(StringIO(source), events=('start',)):
                        return root, None, source, None
                else:
                    return self.fromstring(source), None, source, None
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                # A multi-line string can't be an URL or a path: propagate the error
                if '\n' in source:
                    raise
            finally:
                self._url = _url
            # Not parsable as XML text: a single-line string is processed as an URL
            url = normalize_url(source) if '\n' not in source else None

        elif isinstance(source, StringIO):
            _url, self._url = self._url, None
            try:
                if lazy:
                    for _, root in self.iterparse(source, events=('start',)):
                        return root, None, source.getvalue(), None
                else:
                    document = self.parse(source)
                    return document.getroot(), document, source.getvalue(), None
            finally:
                self._url = _url

        elif hasattr(source, 'read'):
            # source should be a file-like object
            try:
                if hasattr(source, 'url'):
                    url = source.url
                else:
                    url = normalize_url(source.name)
            except AttributeError:
                # No 'url' and no 'name': url remains None and the type error is raised below
                pass
            else:
                _url, self._url = self._url, url
                try:
                    if lazy:
                        for _, root in self.iterparse(source, events=('start',)):
                            return root, None, None, url
                    else:
                        document = self.parse(source)
                        return document.getroot(), document, None, url
                finally:
                    self._url = _url

        else:
            # Try ElementTree object at last
            try:
                root = source.getroot()
            except (AttributeError, TypeError):
                pass
            else:
                if hasattr(root, 'tag'):
                    self._lazy = False
                    return root, source, None, None

        if url is None:
            raise XMLSchemaTypeError(
                "wrong type %r for 'source' attribute: an ElementTree object or an Element instance or a "
                "string containing XML data or an URL or a file-like object is required." % type(source)
            )
        else:
            resource = urlopen(url, timeout=self.timeout)
            # Set the URL during the parsing, for applying the 'remote' defuse mode
            _url, self._url = self._url, url
            try:
                if lazy:
                    for _, root in self.iterparse(resource, events=('start',)):
                        return root, None, None, url
                else:
                    document = self.parse(resource)
                    root = document.getroot()
                    return root, document, None, url
            finally:
                self._url = _url
                resource.close()

    @property
    def root(self):
        """The XML tree root Element."""
        return self._root

    @property
    def document(self):
        """
        The ElementTree document, `None` if the instance is lazy or is not created
        from another document or from an URL.
        """
        return self._document

    @property
    def text(self):
        """The XML text source, `None` if it's not available."""
        return self._text

    @property
    def url(self):
        """The source URL, `None` if the instance is created from an Element tree or from a string."""
        return self._url

    @property
    def base_url(self):
        """
        The base URL for completing relative locations: the directory part of the
        resource URL, if any, otherwise the *base_url* provided at initialization.
        """
        return os.path.dirname(self._url) if self._url else self._base_url

    @property
    def namespace(self):
        """The namespace of the root element of the XML document, `None` if there is no root."""
        return get_namespace(self._root.tag) if self._root is not None else None

    @staticmethod
    def defusing(source):
        """
        Defuse an XML source, raising an `ElementTree.ParseError` if the source contains entity
        definitions or remote entity loading.

        :param source: a filename or file object containing XML data.
        """
        parser = SafeXMLParser(target=PyElementTree.TreeBuilder())
        try:
            # Only the first 'start' event is consumed, then the parsing is stopped
            for _, _ in PyElementTree.iterparse(source, ('start',), parser):
                break
        except PyElementTree.ParseError as err:
            raise ElementTree.ParseError(str(err))

    def parse(self, source):
        """
        An equivalent of *ElementTree.parse()* that can protect from XML entities attacks. When
        protection is applied XML data are loaded and defused before building the ElementTree instance.

        :param source: a filename or file object containing XML data.
        :returns: an ElementTree instance.
        """
        if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url):
            # The data is read once and then wrapped into two distinct in-memory
            # buffers, one for the defusing check and one for the effective parsing.
            text = source.read()
            if isinstance(text, bytes):
                self.defusing(BytesIO(text))
                return ElementTree.parse(BytesIO(text))
            else:
                self.defusing(StringIO(text))
                return ElementTree.parse(StringIO(text))
        else:
            return ElementTree.parse(source)

    def iterparse(self, source, events=None):
        """
        An equivalent of *ElementTree.iterparse()* that can protect from XML entities attacks.
        When protection is applied the iterator yields pure-Python Element instances.

        :param source: a filename or file object containing XML data.
        :param events: a list of events to report back. If omitted, only "end" events are reported.
        """
        if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url):
            parser = SafeXMLParser(target=PyElementTree.TreeBuilder())
            try:
                return PyElementTree.iterparse(source, events, parser)
            except PyElementTree.ParseError as err:
                raise ElementTree.ParseError(str(err))
        else:
            return ElementTree.iterparse(source, events)

    def fromstring(self, text):
        """
        An equivalent of *ElementTree.fromstring()* that can protect from XML entities attacks.

        :param text: a string containing XML data.
        :returns: the root Element instance.
        """
        if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url):
            self.defusing(StringIO(text))
        return ElementTree.fromstring(text)

    def tostring(self, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False):
        """Generates a string representation of the XML resource."""
        return etree_tostring(self._root, self.get_namespaces(), indent, max_lines, spaces_for_tab, xml_declaration)

    def copy(self, **kwargs):
        """Resource copy method. Change init parameters with keyword arguments."""
        obj = type(self)(
            source=self.source,
            base_url=kwargs.get('base_url', self.base_url),
            defuse=kwargs.get('defuse', self.defuse),
            timeout=kwargs.get('timeout', self.timeout),
            lazy=kwargs.get('lazy', self._lazy)
        )
        # Keep the already loaded text, if the copy has not its own
        if obj._text is None and self._text is not None:
            obj._text = self._text
        return obj

    def open(self):
        """
        Returns an opened resource reader object for the instance URL.

        :raises XMLSchemaValueError: if the resource has no URL associated.
        :raises XMLSchemaURLError: if the URL can't be opened.
        """
        if self._url is None:
            raise XMLSchemaValueError("can't open, the resource has no URL associated.")
        try:
            return urlopen(self._url, timeout=self.timeout)
        except URLError as err:
            raise XMLSchemaURLError(reason="cannot access to resource %r: %s" % (self._url, err.reason))

    def load(self):
        """
        Loads the XML text from the data source. If the data source is an Element
        the source XML text can't be retrieved.

        :raises XMLSchemaOSError: if the data can't be read from the opened resource.
        """
        if self._url is None:
            return  # Created from Element or text source --> already loaded

        resource = self.open()
        try:
            data = resource.read()
        except (OSError, IOError) as err:
            raise XMLSchemaOSError("cannot load data from %r: %s" % (self._url, err))
        finally:
            resource.close()

        try:
            self._text = data.decode('utf-8') if PY3 else data.encode('utf-8')
        except UnicodeDecodeError:
            # Not UTF-8 data: fallback to ISO-8859-1
            if PY3:
                self._text = data.decode('iso-8859-1')
            else:
                with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f:
                    self._text = f.read().encode('iso-8859-1')

    def is_lazy(self):
        """Returns `True` if the XML resource is lazy."""
        return self._lazy

    def is_loaded(self):
        """Returns `True` if the XML text of the data source is loaded."""
        return self._text is not None

    def iter(self, tag=None):
        """
        XML resource tree iterator. For a lazy resource the source is parsed again and
        each element is cleared after it has been yielded.

        :param tag: if provided only the elements with this tag are yielded.
        :raises XMLSchemaURLError: if the resource is lazy and its URL can't be opened.
        """
        if not self._lazy:
            for elem in self._root.iter(tag):
                yield elem
            return
        elif self._url is not None:
            # Use open() so that an access failure is reported in the same way
            # of load() and get_namespaces().
            resource = self.open()
        else:
            resource = StringIO(self._text)

        try:
            for event, elem in self.iterparse(resource, events=('end',)):
                if tag is None or elem.tag == tag:
                    yield elem
                elem.clear()
        finally:
            resource.close()

    def iterfind(self, path=None, namespaces=None):
        """
        XML resource tree iterfind selector. For a lazy resource the source is parsed
        again, replacing the root element of the instance with the newly parsed one.

        :param path: the selection path, if omitted the root element is selected.
        :param namespaces: an optional map from prefixes to namespace URIs, passed to the selector.
        :raises XMLSchemaURLError: if the resource is lazy and its URL can't be opened.
        """
        if not self._lazy:
            if path is None:
                yield self._root
            else:
                for e in iter_select(self._root, path, namespaces, strict=False):
                    yield e
            return
        elif self._url is not None:
            # Use open() so that an access failure is reported in the same way
            # of load() and get_namespaces().
            resource = self.open()
        else:
            self.load()
            resource = StringIO(self._text)

        try:
            if path is None:
                level = 0
                for event, elem in self.iterparse(resource, events=('start', 'end')):
                    if event == "start":
                        if level == 0:
                            # New root: release the old one and track the new element
                            self._root.clear()
                            self._root = elem
                        level += 1
                    else:
                        level -= 1
                        if level == 0:
                            yield elem
                            elem.clear()
            else:
                selector = Selector(path, namespaces, strict=False)
                level = 0
                for event, elem in self.iterparse(resource, events=('start', 'end')):
                    if event == "start":
                        if level == 0:
                            # New root: release the old one and track the new element
                            self._root.clear()
                            self._root = elem
                        level += 1
                    else:
                        level -= 1
                        if elem in selector.select(self._root):
                            yield elem
                            elem.clear()
                        elif level == 0:
                            elem.clear()
        finally:
            resource.close()

    def iter_location_hints(self):
        """
        Yields schema location hints from the XML tree, as couples (namespace, URL).
        The hints without a namespace are yielded with an empty string as namespace.
        """
        for elem in self.iter():
            try:
                locations = elem.attrib[XSI_SCHEMA_LOCATION]
            except KeyError:
                pass
            else:
                # The attribute value is a sequence of namespace/location couples
                locations = locations.split()
                for ns, url in zip(locations[0::2], locations[1::2]):
                    yield ns, url

            try:
                locations = elem.attrib[XSI_NONS_SCHEMA_LOCATION]
            except KeyError:
                pass
            else:
                for url in locations.split():
                    yield '', url

    def get_namespaces(self):
        """
        Extracts namespaces with related prefixes from the XML resource. If a duplicate
        prefix declaration is encountered then adds the namespace using a different prefix,
        but only in the case if the namespace URI is not already mapped by another prefix.

        :return: A dictionary for mapping namespace prefixes to full URI.
        """
        def update_nsmap(prefix, uri):
            if prefix not in nsmap and (prefix or not local_root):
                nsmap[prefix] = uri
            elif not any(uri == ns for ns in nsmap.values()):
                if not prefix:
                    # Derive a prefix from the trailing word of the URI, if any
                    try:
                        prefix = re.search(r'(\w+)$', uri.strip()).group()
                    except AttributeError:
                        return

                # Prefix already used: increment (or append) a numeric suffix until it's free
                while prefix in nsmap:
                    match = re.search(r'(\d+)$', prefix)
                    if match:
                        index = int(match.group()) + 1
                        prefix = prefix[:match.span()[0]] + str(index)
                    else:
                        prefix += '2'
                nsmap[prefix] = uri

        # True if the root element has no namespace
        local_root = self.root.tag[0] != '{'
        nsmap = {}

        if self._url is not None:
            resource = self.open()
            try:
                for event, node in self.iterparse(resource, events=('start-ns', 'end')):
                    if event == 'start-ns':
                        update_nsmap(*node)
                    else:
                        node.clear()
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                pass  # Best effort: keep the namespaces collected before the error
            finally:
                resource.close()
        elif isinstance(self._text, string_base_type):
            try:
                for event, node in self.iterparse(StringIO(self._text), events=('start-ns', 'end')):
                    if event == 'start-ns':
                        update_nsmap(*node)
                    else:
                        node.clear()
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                pass  # Best effort: keep the namespaces collected before the error
        else:
            # Warning: can extracts namespace information only from lxml etree structures
            try:
                for elem in self._root.iter():
                    for k, v in elem.nsmap.items():
                        update_nsmap(k if k is not None else '', v)
            except (AttributeError, TypeError):
                pass  # Not an lxml's tree or element

        return nsmap

    def get_locations(self, locations=None):
        """
        Returns a list of schema location hints. The locations are normalized using the
        base URL of the instance. The *locations* argument can be a dictionary or a list
        of namespace resources, that are inserted before the schema location hints extracted
        from the XML resource.

        :param locations: a dictionary, whose values can be an URL or a list of URLs, \
        or an iterable of (namespace, URL) couples.
        :return: a list of (namespace, normalized URL) couples.
        """
        base_url = self.base_url
        location_hints = []
        if locations is not None:
            # Only the lookup of items() decides if the argument is a dictionary, so
            # an AttributeError raised while normalizing an URL is not mistaken for
            # a "not a dictionary" condition.
            try:
                items = locations.items()
            except AttributeError:
                items = None

            if items is None:
                location_hints.extend([(ns, normalize_url(url, base_url)) for ns, url in locations])
            else:
                for ns, value in items:
                    if isinstance(value, list):
                        location_hints.extend([(ns, normalize_url(url, base_url)) for url in value])
                    else:
                        location_hints.append((ns, normalize_url(value, base_url)))

        location_hints.extend([(ns, normalize_url(url, base_url)) for ns, url in self.iter_location_hints()])
        return location_hints