# -*- coding: utf-8 -*-
#
# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato
#
import os.path
import re
import codecs
from elementpath import iter_select, Selector

from .compat import (
    PY3, StringIO, BytesIO, string_base_type, urlopen, urlsplit, urljoin, urlunsplit,
    pathname2url, URLError, uses_relative
)
from .exceptions import XMLSchemaTypeError, XMLSchemaValueError, XMLSchemaURLError, XMLSchemaOSError
from .namespaces import get_namespace
from .qnames import XSI_SCHEMA_LOCATION, XSI_NONS_SCHEMA_LOCATION
from .etree import ElementTree, PyElementTree, SafeXMLParser, etree_tostring


# Allowed values for the 'defuse' attribute of XMLResource.
DEFUSE_MODES = ('always', 'remote', 'never')


def is_remote_url(url):
    """Returns `True` if *url* is not `None` and its scheme is neither empty nor 'file'."""
    return url is not None and urlsplit(url).scheme not in ('', 'file')


def url_path_is_directory(url):
    """Returns `True` if the path component of *url* is an existing local directory."""
    return os.path.isdir(urlsplit(url).path)


def url_path_is_file(url):
    """Returns `True` if the path component of *url* is an existing local regular file."""
    return os.path.isfile(urlsplit(url).path)


def normalize_url(url, base_url=None, keep_relative=False):
    """
    Returns a normalized URL doing a join with a base URL. URL scheme defaults to 'file' and
    backslashes are replaced with slashes. For file paths the os.path.join is used instead
    of urljoin.

    :param url: a relative or absolute URL.
    :param base_url: the reference base URL used to build the normalized URL from the argument. \
    For compatibility between "os.path.join" and "urljoin" a trailing '/' is added to \
    non-empty paths.
    :param keep_relative: if set to `True` keeps relative file paths, which are not strictly \
    conformant to the URL format specification.
    :return: A normalized URL.
    """
    def add_trailing_slash(r):
        # Rebuild the URL from its split parts, appending '/' to a non-empty path
        # that does not already end with it.
        return urlunsplit((r[0], r[1], r[2] + '/' if r[2] and r[2][-1] != '/' else r[2], r[3], r[4]))

    if base_url is not None:
        base_url = base_url.replace('\\', '/')
        # Collapse leading double slashes to a single one.
        while base_url.startswith('//'):
            base_url = base_url.replace('//', '/', 1)

        base_url_parts = urlsplit(base_url)
        base_url = add_trailing_slash(base_url_parts)
        if base_url_parts.scheme not in uses_relative:
            # A scheme unknown to urljoin (Eg. a drive letter): treat the whole base as a file path.
            base_url_parts = urlsplit('file:///{}'.format(base_url))
        else:
            base_url_parts = urlsplit(base_url)

        if base_url_parts.scheme not in ('', 'file'):
            url = urljoin(base_url, url)
        else:
            url_parts = urlsplit(url)
            if url_parts.scheme not in ('', 'file'):
                url = urljoin(base_url, url)
            elif not url_parts.netloc or base_url_parts.netloc == url_parts.netloc:
                # Join paths only if host parts (netloc) are equal, using the os.path.join
                # instead of urljoin for path normalization.
                url = urlunsplit((
                    '',
                    base_url_parts.netloc,
                    os.path.normpath(os.path.join(base_url_parts.path, url_parts.path)),
                    url_parts.query,
                    url_parts.fragment,
                ))

                # Add 'file' scheme if '//' prefix is added
                if base_url_parts.netloc and not url.startswith(base_url_parts.netloc) and url.startswith('//'):
                    url = 'file:' + url

    url = url.replace('\\', '/')
    while url.startswith('//'):
        url = url.replace('//', '/', 1)

    url_parts = urlsplit(url, scheme='file')
    if url_parts.scheme not in uses_relative:
        return 'file:///{}'.format(url_parts.geturl())  # Eg. k:/Python/lib/....
    elif url_parts.scheme != 'file':
        return urlunsplit((
            url_parts.scheme,
            url_parts.netloc,
            pathname2url(url_parts.path),
            url_parts.query,
            url_parts.fragment,
        ))
    elif os.path.isabs(url_parts.path):
        return url_parts.geturl()
    elif keep_relative:
        # Can't use urlunsplit with a scheme because it converts relative paths to absolute ones.
        return 'file:{}'.format(urlunsplit(('',) + url_parts[1:]))
    else:
        return urlunsplit((
            url_parts.scheme,
            url_parts.netloc,
            os.path.abspath(url_parts.path),
            url_parts.query,
            url_parts.fragment,
        ))


def fetch_resource(location, base_url=None, timeout=30):
    """
    Fetches a resource by trying to access it. If the resource is accessible returns
    its URL, otherwise raises an error (XMLSchemaURLError).

    The location is first normalized against *base_url*. If that URL can't be opened,
    the location normalized without a base URL is tried as a fallback, but only when
    it differs from the URL already attempted.

    :param location: an URL or a file path.
    :param base_url: reference base URL for normalizing local and relative URLs.
    :param timeout: the timeout in seconds for the connection attempt in case of remote data.
    :return: a normalized URL.
    :raises XMLSchemaValueError: if *location* is empty.
    :raises XMLSchemaURLError: if the resource is not accessible; the reason is the one \
    of the first failed attempt.
    """
    if not location:
        raise XMLSchemaValueError("'location' argument must contains a not empty string.")

    url = normalize_url(location, base_url)
    try:
        resource = urlopen(url, timeout=timeout)
    except URLError as err:
        # fallback joining the path without a base URL
        alt_url = normalize_url(location)
        if alt_url == url:
            # Same URL as before (Eg. no base URL given): a second identical
            # request would only double the time spent before failing.
            raise XMLSchemaURLError(reason=err.reason)

        try:
            resource = urlopen(alt_url, timeout=timeout)
        except URLError:
            raise XMLSchemaURLError(reason=err.reason)
        else:
            resource.close()
            return alt_url
    else:
        resource.close()
        return url


def fetch_schema_locations(source, locations=None, **resource_options):
    """
    Fetches the schema URL for the source's root of an XML data source and a list of location hints.
    If an accessible schema location is not found raises a ValueError.

    :param source: an Element or an Element Tree with XML data or an URL or a file-like object \
    or an :class:`XMLResource` instance.
    :param locations: a dictionary or dictionary items with schema location hints.
    :param resource_options: keyword arguments for providing :class:`XMLResource` class init options.
    :return: A tuple with the URL referring to the first reachable schema resource, a list \
    of dictionary items with normalized location hints.
    """
    base_url = resource_options.pop('base_url', None)
    timeout = resource_options.pop('timeout', 30)
    if not isinstance(source, XMLResource):
        resource = XMLResource(source, base_url, timeout=timeout, **resource_options)
    else:
        resource = source

    base_url = resource.base_url
    namespace = resource.namespace
    locations = resource.get_locations(locations)
    # Only the hints declared for the namespace of the root element are tried.
    for ns, url in filter(lambda x: x[0] == namespace, locations):
        try:
            return fetch_resource(url, base_url, timeout), locations
        except XMLSchemaURLError:
            pass
    raise XMLSchemaValueError("not found a schema for XML data resource %r (namespace=%r)." % (source, namespace))


def fetch_schema(source, locations=None, **resource_options):
    """
    Fetches the schema URL for the source's root of an XML data source.
    If an accessible schema location is not found raises a ValueError.

    :param source: An Element or an Element Tree with XML data or an URL or a file-like object.
    :param locations: A dictionary or dictionary items with schema location hints.
    :param resource_options: keyword arguments for providing :class:`XMLResource` class init options.
    :return: An URL referring to a reachable schema resource.
    """
    return fetch_schema_locations(source, locations, **resource_options)[0]


def fetch_namespaces(source, **resource_options):
    """
    Extracts namespaces with related prefixes from the XML data source. If the source is
    an lxml's ElementTree/Element returns the nsmap attribute of the root. If a duplicate
    prefix declaration is encountered then adds the namespace using a different prefix,
    but only in the case if the namespace URI is not already mapped by another prefix.

    :param source: a string containing the XML document or file path or an url \
    or a file like object or an ElementTree or Element.
    :param resource_options: keyword arguments for providing :class:`XMLResource` init options.
    :return: A dictionary for mapping namespace prefixes to full URI.
    """
    timeout = resource_options.pop('timeout', 30)
    return XMLResource(source, timeout=timeout, **resource_options).get_namespaces()


def load_xml_resource(source, element_only=True, **resource_options):
    """
    Load XML data source into an Element tree, returning the root Element, the XML text and an
    url, if available. Usable for XML data files of small or medium sizes, as XSD schemas.

    :param source: an URL, a filename path or a file-like object.
    :param element_only: if True the function returns only the root Element of the tree.
    :param resource_options: keyword arguments for providing :class:`XMLResource` init options. \
    The 'lazy' option defaults to `False` here.
    :return: a tuple with three items (root Element, XML text and XML URL) or \
    only the root Element if 'element_only' argument is True.
    """
    lazy = resource_options.pop('lazy', False)
    source = XMLResource(source, lazy=lazy, **resource_options)
    if element_only:
        return source.root
    else:
        source.load()
        return source.root, source.text, source.url


class XMLResource(object):
    """
    XML resource reader based on ElementTree and urllib.

    :param source: a string containing the XML document or file path or an URL or a file like \
    object or an ElementTree or an Element.
    :param base_url: is an optional base URL, used for the normalization of relative paths when \
    the URL of the resource can't be obtained from the source argument.
    :param defuse: set the usage of SafeXMLParser for XML data. Can be 'always', 'remote' or 'never'. \
    Default is 'remote' that uses the defusedxml only when loading remote data.
    :param timeout: the timeout in seconds for the connection attempt in case of remote data.
    :param lazy: if set to `False` the source is fully loaded into and processed from memory. \
    Default is `True` that means that only the root element of the source is loaded. This is \
    ignored if *source* is an Element or an ElementTree.
    """
    def __init__(self, source, base_url=None, defuse='remote', timeout=300, lazy=True):
        if base_url is not None and not isinstance(base_url, string_base_type):
            raise XMLSchemaValueError(u"'base_url' argument has to be a string: {!r}".format(base_url))

        self._root = self._document = self._url = self._text = None
        self._base_url = base_url
        self.defuse = defuse
        self.timeout = timeout
        self._lazy = lazy
        # Assigned last: setting 'source' triggers the parsing of the source (see
        # __setattr__), that depends on the attributes set above.
        self.source = source

    def __str__(self):
        # noinspection PyCompatibility,PyUnresolvedReferences
        return unicode(self).encode("utf-8")

    def __unicode__(self):
        return self.__repr__()

    if PY3:
        __str__ = __unicode__

    def __repr__(self):
        if self._root is None:
            return u'%s()' % self.__class__.__name__
        elif self._url is None:
            return u'%s(tag=%r)' % (self.__class__.__name__, self._root.tag)
        else:
            return u'%s(tag=%r, basename=%r)' % (
                self.__class__.__name__, self._root.tag, os.path.basename(self._url)
            )

    def __setattr__(self, name, value):
        """
        Validates the public attributes before setting them. Setting 'source' parses
        the new source and replaces root, document, text and URL of the instance.
        """
        if name == 'source':
            self._root, self._document, self._text, self._url = self._fromsource(value)
        elif name == 'defuse' and value not in DEFUSE_MODES:
            raise XMLSchemaValueError(u"'defuse' attribute: {!r} is not a defuse mode.".format(value))
        elif name == 'timeout' and (not isinstance(value, int) or value <= 0):
            raise XMLSchemaValueError(u"'timeout' attribute must be a positive integer: {!r}".format(value))
        elif name == 'lazy' and not isinstance(value, bool):
            # NOTE(review): this class stores the flag as '_lazy', so this check is not
            # reached by the assignments visible in this class -- confirm if intended.
            raise XMLSchemaValueError(u"'lazy' attribute must be a boolean: {!r}".format(value))
        super(XMLResource, self).__setattr__(name, value)

    def _fromsource(self, source):
        """
        Parses the source argument and returns a tuple with four items: the root Element,
        the ElementTree document (or `None`), the XML text (or `None`) and the URL (or `None`).

        During the parsing of sources that are not bound to an URL the instance URL is
        temporarily changed, because the defusing policy depends on it.

        :raises XMLSchemaTypeError: if the source type is not one of the supported ones.
        """
        url, lazy = None, self._lazy
        if hasattr(source, 'tag'):
            self._lazy = False
            return source, None, None, None  # Source is already an Element --> nothing to load
        elif isinstance(source, string_base_type):
            _url, self._url = self._url, None
            try:
                if lazy:
                    # check if source is a string containing a valid XML root
                    for _, root in self.iterparse(StringIO(source), events=('start',)):
                        return root, None, source, None
                else:
                    return self.fromstring(source), None, source, None
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                # A multi-line string can't be an URL or a path, so the error is definitive.
                if '\n' in source:
                    raise
            finally:
                self._url = _url
            url = normalize_url(source) if '\n' not in source else None

        elif isinstance(source, StringIO):
            _url, self._url = self._url, None
            try:
                if lazy:
                    for _, root in self.iterparse(source, events=('start',)):
                        return root, None, source.getvalue(), None
                else:
                    document = self.parse(source)
                    return document.getroot(), document, source.getvalue(), None
            finally:
                self._url = _url

        elif hasattr(source, 'read'):
            # source should be a file-like object
            try:
                if hasattr(source, 'url'):
                    url = source.url
                else:
                    url = normalize_url(source.name)
            except AttributeError:
                pass
            else:
                _url, self._url = self._url, url
                try:
                    if lazy:
                        for _, root in self.iterparse(source, events=('start',)):
                            return root, None, None, url
                    else:
                        document = self.parse(source)
                        return document.getroot(), document, None, url
                finally:
                    self._url = _url

        else:
            # Try ElementTree object at last
            try:
                root = source.getroot()
            except (AttributeError, TypeError):
                pass
            else:
                if hasattr(root, 'tag'):
                    self._lazy = False
                    return root, source, None, None

        if url is None:
            raise XMLSchemaTypeError(
                "wrong type %r for 'source' attribute: an ElementTree object or an Element instance or a "
                "string containing XML data or an URL or a file-like object is required." % type(source)
            )
        else:
            resource = urlopen(url, timeout=self.timeout)
            _url, self._url = self._url, url
            try:
                if lazy:
                    for _, root in self.iterparse(resource, events=('start',)):
                        return root, None, None, url
                else:
                    document = self.parse(resource)
                    root = document.getroot()
                    return root, document, None, url
            finally:
                self._url = _url
                resource.close()

    @property
    def root(self):
        """The XML tree root Element."""
        return self._root

    @property
    def document(self):
        """
        The ElementTree document, `None` if the instance is lazy or is not created
        from another document or from an URL.
        """
        return self._document

    @property
    def text(self):
        """The XML text source, `None` if it's not available."""
        return self._text

    @property
    def url(self):
        """The source URL, `None` if the instance is created from an Element tree or from a string."""
        return self._url

    @property
    def base_url(self):
        """
        The base URL for completing relative locations: the directory part of the source
        URL if the instance has one, otherwise the *base_url* provided at initialization.
        """
        return os.path.dirname(self._url) if self._url else self._base_url

    @property
    def namespace(self):
        """The namespace of the XML document, `None` if the root is not available."""
        return get_namespace(self._root.tag) if self._root is not None else None

    @staticmethod
    def defusing(source):
        """
        Defuse an XML source, raising an `ElementTree.ParseError` if the source contains entity
        definitions or remote entity loading.

        :param source: a filename or file object containing XML data.
        """
        parser = SafeXMLParser(target=PyElementTree.TreeBuilder())
        try:
            # Parsing is stopped at the first 'start' event.
            for _, _ in PyElementTree.iterparse(source, ('start',), parser):
                break
        except PyElementTree.ParseError as err:
            raise ElementTree.ParseError(str(err))

    def parse(self, source):
        """
        An equivalent of *ElementTree.parse()* that can protect from XML entities attacks. When
        protection is applied XML data are loaded and defused before building the ElementTree instance.

        :param source: a filename or file object containing XML data.
        :returns: an ElementTree instance.
        """
        if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url):
            if hasattr(source, 'read'):
                text = source.read()
            else:
                # A filename, as allowed by ElementTree.parse(): read it in binary
                # mode, leaving the detection of the encoding to the XML parser.
                with open(source, 'rb') as fp:
                    text = fp.read()

            # Data are kept in memory because they have to be read twice: once
            # for the defusing check and once for building the tree.
            if isinstance(text, bytes):
                self.defusing(BytesIO(text))
                return ElementTree.parse(BytesIO(text))
            else:
                self.defusing(StringIO(text))
                return ElementTree.parse(StringIO(text))
        else:
            return ElementTree.parse(source)

    def iterparse(self, source, events=None):
        """
        An equivalent of *ElementTree.iterparse()* that can protect from XML entities attacks.
        When protection is applied the iterator yields pure-Python Element instances.

        :param source: a filename or file object containing XML data.
        :param events: a list of events to report back. If omitted, only "end" events are reported.
        """
        if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url):
            parser = SafeXMLParser(target=PyElementTree.TreeBuilder())
            try:
                # NOTE(review): only the creation of the iterator is covered by this handler;
                # errors raised later, while the caller consumes the iterator, are presumably
                # propagated as PyElementTree.ParseError -- confirm.
                return PyElementTree.iterparse(source, events, parser)
            except PyElementTree.ParseError as err:
                raise ElementTree.ParseError(str(err))
        else:
            return ElementTree.iterparse(source, events)

    def fromstring(self, text):
        """
        An equivalent of *ElementTree.fromstring()* that can protect from XML entities attacks.

        :param text: a string containing XML data.
        :returns: the root Element instance.
        """
        if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url):
            self.defusing(StringIO(text))
        return ElementTree.fromstring(text)

    def tostring(self, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False):
        """Generates a string representation of the XML resource."""
        return etree_tostring(self._root, self.get_namespaces(), indent, max_lines, spaces_for_tab, xml_declaration)

    def copy(self, **kwargs):
        """Resource copy method. Change init parameters with keyword arguments."""
        obj = type(self)(
            source=self.source,
            base_url=kwargs.get('base_url', self.base_url),
            defuse=kwargs.get('defuse', self.defuse),
            timeout=kwargs.get('timeout', self.timeout),
            lazy=kwargs.get('lazy', self._lazy)
        )
        # Keep the already loaded text, if the copy doesn't have its own.
        if obj._text is None and self._text is not None:
            obj._text = self._text
        return obj

    def open(self):
        """
        Returns an opened resource reader object for the instance URL.

        :raises XMLSchemaValueError: if the instance has no URL.
        :raises XMLSchemaURLError: if the URL can't be opened.
        """
        if self._url is None:
            raise XMLSchemaValueError("can't open, the resource has no URL associated.")
        try:
            return urlopen(self._url, timeout=self.timeout)
        except URLError as err:
            raise XMLSchemaURLError(reason="cannot access to resource %r: %s" % (self._url, err.reason))

    def load(self):
        """
        Loads the XML text from the data source. If the data source is an Element
        the source XML text can't be retrieved.

        :raises XMLSchemaOSError: if the data can't be read from the resource.
        """
        if self._url is None:
            return  # Created from Element or text source --> already loaded

        resource = self.open()
        try:
            data = resource.read()
        except (OSError, IOError) as err:
            raise XMLSchemaOSError("cannot load data from %r: %s" % (self._url, err))
        finally:
            resource.close()

        try:
            self._text = data.decode('utf-8') if PY3 else data.encode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8 data: fall back to ISO-8859-1.
            if PY3:
                self._text = data.decode('iso-8859-1')
            else:
                with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f:
                    self._text = f.read().encode('iso-8859-1')

    def is_lazy(self):
        """Returns `True` if the XML resource is lazy."""
        return self._lazy

    def is_loaded(self):
        """Returns `True` if the XML text of the data source is loaded."""
        return self._text is not None

    def iter(self, tag=None):
        """
        XML resource tree iterator. If the resource is lazy the source is parsed again
        and each element is cleared after it has been processed.

        :param tag: if not `None` only the elements with this tag are yielded.
        """
        if not self._lazy:
            for elem in self._root.iter(tag):
                yield elem
            return
        elif self._url is not None:
            resource = urlopen(self._url, timeout=self.timeout)
        else:
            resource = StringIO(self._text)

        try:
            for event, elem in self.iterparse(resource, events=('end',)):
                if tag is None or elem.tag == tag:
                    yield elem
                elem.clear()
        finally:
            resource.close()

    def iterfind(self, path=None, namespaces=None):
        """
        XML resource tree iterfind selector. If the resource is lazy the source is parsed
        again, the root of the instance is replaced by the new root element and the
        yielded elements are cleared after use.

        :param path: the selection path, if `None` the root element is yielded.
        :param namespaces: the namespaces map passed to the path selector.
        """
        if not self._lazy:
            if path is None:
                yield self._root
            else:
                for e in iter_select(self._root, path, namespaces, strict=False):
                    yield e
            return
        elif self._url is not None:
            resource = urlopen(self._url, timeout=self.timeout)
        else:
            self.load()
            resource = StringIO(self._text)

        try:
            if path is None:
                level = 0
                for event, elem in self.iterparse(resource, events=('start', 'end')):
                    if event == "start":
                        if level == 0:
                            # New root element: release the previous one.
                            self._root.clear()
                            self._root = elem
                        level += 1
                    else:
                        level -= 1
                        if level == 0:
                            yield elem
                            elem.clear()
            else:
                selector = Selector(path, namespaces, strict=False)
                level = 0
                for event, elem in self.iterparse(resource, events=('start', 'end')):
                    if event == "start":
                        if level == 0:
                            self._root.clear()
                            self._root = elem
                        level += 1
                    else:
                        level -= 1
                        if elem in selector.select(self._root):
                            yield elem
                            elem.clear()
                        elif level == 0:
                            elem.clear()
        finally:
            resource.close()

    def iter_location_hints(self):
        """
        Yields schema location hints from the XML tree, as couples of namespace URI and
        location. An empty namespace is used for no-namespace schema locations.
        """
        for elem in self.iter():
            try:
                locations = elem.attrib[XSI_SCHEMA_LOCATION]
            except KeyError:
                pass
            else:
                # The attribute value is a list of alternating namespace URIs and locations.
                locations = locations.split()
                for ns, url in zip(locations[0::2], locations[1::2]):
                    yield ns, url

            try:
                locations = elem.attrib[XSI_NONS_SCHEMA_LOCATION]
            except KeyError:
                pass
            else:
                for url in locations.split():
                    yield '', url

    def get_namespaces(self):
        """
        Extracts namespaces with related prefixes from the XML resource. If a duplicate
        prefix declaration is encountered then adds the namespace using a different prefix,
        but only in the case if the namespace URI is not already mapped by another prefix.

        :return: A dictionary for mapping namespace prefixes to full URI.
        """
        def update_nsmap(prefix, uri):
            if prefix not in nsmap and (prefix or not local_root):
                nsmap[prefix] = uri
            elif not any(uri == ns for ns in nsmap.values()):
                if not prefix:
                    # Derive a prefix from the trailing word of the URI, if any.
                    try:
                        prefix = re.search(r'(\w+)$', uri.strip()).group()
                    except AttributeError:
                        return

                # Make the prefix unique, incrementing or appending a numeric suffix.
                while prefix in nsmap:
                    match = re.search(r'(\d+)$', prefix)
                    if match:
                        index = int(match.group()) + 1
                        prefix = prefix[:match.span()[0]] + str(index)
                    else:
                        prefix += '2'
                nsmap[prefix] = uri

        # True if the tag of the root element is not namespace-qualified.
        local_root = self.root.tag[0] != '{'
        nsmap = {}

        if self._url is not None:
            resource = self.open()
            try:
                for event, node in self.iterparse(resource, events=('start-ns', 'end')):
                    if event == 'start-ns':
                        update_nsmap(*node)
                    else:
                        node.clear()
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                pass  # Best effort: keep the namespaces collected before the error
            finally:
                resource.close()
        elif isinstance(self._text, string_base_type):
            try:
                for event, node in self.iterparse(StringIO(self._text), events=('start-ns', 'end')):
                    if event == 'start-ns':
                        update_nsmap(*node)
                    else:
                        node.clear()
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                pass
        else:
            # Warning: can extracts namespace information only from lxml etree structures
            try:
                for elem in self._root.iter():
                    for k, v in elem.nsmap.items():
                        update_nsmap(k if k is not None else '', v)
            except (AttributeError, TypeError):
                pass  # Not an lxml's tree or element

        return nsmap

    def get_locations(self, locations=None):
        """
        Returns a list of schema location hints. The locations are normalized using the
        base URL of the instance. The *locations* argument can be a dictionary or a list
        of namespace resources, that are inserted before the schema location hints extracted
        from the XML resource.
        """
        base_url = self.base_url
        location_hints = []
        if locations is not None:
            try:
                for ns, value in locations.items():
                    if isinstance(value, list):
                        location_hints.extend([(ns, normalize_url(url, base_url)) for url in value])
                    else:
                        location_hints.append((ns, normalize_url(value, base_url)))
            except AttributeError:
                # Not a dictionary: should be a sequence of (namespace, location) couples.
                location_hints.extend([(ns, normalize_url(url, base_url)) for ns, url in locations])
        location_hints.extend([(ns, normalize_url(url, base_url)) for ns, url in self.iter_location_hints()])
        return location_hints