From 4085e8daa5600fd880cdc02b768dcfe3d6f91fcd Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 5 Jul 2019 00:06:34 +1000 Subject: [PATCH 01/34] Add failing test when reloading xml from zipfile content. This problem comes from the way `XMLResource` re-opens files if it wants to get more information from them. `XMLResource` is deriving the source location of file like objects from their `name` attr. When that attr doesn't correspond to a file on disk (zipfile contents, Django files) an error is raised when `XMLResource.open()` is called. --- xmlschema/tests/test_resources.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 038e05f..4022c08 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -15,6 +15,7 @@ This module runs tests concerning resources. import unittest import os import platform +import zipfile try: from pathlib import PureWindowsPath, PurePath @@ -26,7 +27,7 @@ from xmlschema import ( load_xml_resource, XMLResource, XMLSchemaURLError ) from xmlschema.tests import XMLSchemaTestCase, SKIP_REMOTE_TESTS -from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO +from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO, BytesIO from xmlschema.etree import ElementTree, PyElementTree, lxml_etree, is_etree_element, etree_element, py_etree_element @@ -383,6 +384,27 @@ class TestResources(XMLSchemaTestCase): for schema in vh_schema.maps.iter_schemas(): self.assertIsInstance(schema.root, etree_element) + def test_fid_with_name_attr(self): + """XMLResource gets correct data when passed a file like object + with a name attribute that isn't on disk. + + These file descriptors appear when working with the contents from a + zip using the zipfile module and with Django files in some + instances. 
+ """ + zipname = "not__on____disk.xml" + bytes_fid = BytesIO() + with zipfile.ZipFile(bytes_fid, 'w') as zf: + with open(self.vh_xml_file) as fid: + zf.writestr(zipname, fid.read()) + + bytes_fid.seek(0) + with zipfile.ZipFile(bytes_fid) as zf: + with zf.open(zipname) as fid: + resource = XMLResource(fid) + # This should not cause an error. + resource.load() + if __name__ == '__main__': from xmlschema.tests import print_test_header From 61e1f609fcfe1b4acb44d75c416227ef9b56474d Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 5 Jul 2019 22:23:28 +1000 Subject: [PATCH 02/34] Stop reading `name` and `url` from file object attrs These attrs shouldn't be used to reopen the file object as: - they may not reflect the original file or resource (file objects opened from a zipfile will have a name that doesn't correspond to any file on disk). - Depending on how the fid was opened, these attrs could be crafted to read arbitrary files from disk. If the creator of a .zip gives a file inside the zip file a path of `/etc/passwd` we may end up opening that file. Instead of reopening the file, we keep track of the file object and seek to the beginning of the file. This means (for most operations) the file object must be seekable. On Python 2 urlopen returns an unseekable object for 'file://' paths. One test had to be skipped in Python 2 for this reason. 
--- xmlschema/resources.py | 98 ++++++++++++++++++------------- xmlschema/tests/test_resources.py | 61 +++++++++++++------ 2 files changed, 100 insertions(+), 59 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index b2898f7..be9f1e8 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -245,7 +245,7 @@ class XMLResource(object): if base_url is not None and not isinstance(base_url, string_base_type): raise XMLSchemaValueError(u"'base_url' argument has to be a string: {!r}".format(base_url)) - self._root = self._document = self._url = self._text = None + self._root = self._document = self._url = self._text = self._fid = None self._base_url = base_url self.defuse = defuse self.timeout = timeout @@ -274,7 +274,7 @@ class XMLResource(object): def __setattr__(self, name, value): if name == 'source': - self._root, self._document, self._text, self._url = self._fromsource(value) + self._root, self._document, self._text, self._url, self._fid = self._fromsource(value) elif name == 'defuse' and value not in DEFUSE_MODES: raise XMLSchemaValueError(u"'defuse' attribute: {!r} is not a defuse mode.".format(value)) elif name == 'timeout' and (not isinstance(value, int) or value <= 0): @@ -287,16 +287,16 @@ class XMLResource(object): url, lazy = None, self._lazy if is_etree_element(source): self._lazy = False - return source, None, None, None # Source is already an Element --> nothing to load + return source, None, None, None, None # Source is already an Element --> nothing to load elif isinstance(source, string_base_type): _url, self._url = self._url, None try: if lazy: # check if source is a string containing a valid XML root for _, root in self.iterparse(StringIO(source), events=('start',)): - return root, None, source, None + return root, None, source, None, None else: - return self.fromstring(source), None, source, None + return self.fromstring(source), None, source, None, None except (ElementTree.ParseError, PyElementTree.ParseError, 
UnicodeEncodeError): if '\n' in source: raise @@ -309,33 +309,24 @@ class XMLResource(object): try: if lazy: for _, root in self.iterparse(source, events=('start',)): - return root, None, source.getvalue(), None + return root, None, source.getvalue(), None, None else: document = self.parse(source) - return document.getroot(), document, source.getvalue(), None + return document.getroot(), document, source.getvalue(), None, None finally: self._url = _url elif hasattr(source, 'read'): - # source should be a file-like object + _url, self._url = self._url, url try: - if hasattr(source, 'url'): - url = source.url + if lazy: + for _, root in self.iterparse(source, events=('start',)): + return root, None, None, url, source else: - url = normalize_url(source.name) - except AttributeError: - pass - else: - _url, self._url = self._url, url - try: - if lazy: - for _, root in self.iterparse(source, events=('start',)): - return root, None, None, url - else: - document = self.parse(source) - return document.getroot(), document, None, url - finally: - self._url = _url + document = self.parse(source) + return document.getroot(), document, None, url, source + finally: + self._url = _url else: # Try ElementTree object at last @@ -346,7 +337,7 @@ class XMLResource(object): else: if is_etree_element(root): self._lazy = False - return root, source, None, None + return root, source, None, None, None if url is None: raise XMLSchemaTypeError( @@ -359,11 +350,11 @@ class XMLResource(object): try: if lazy: for _, root in self.iterparse(resource, events=('start',)): - return root, None, None, url + return root, None, None, url, None else: document = self.parse(resource) root = document.getroot() - return root, document, None, url + return root, document, None, url, None finally: self._url = _url resource.close() @@ -482,6 +473,10 @@ class XMLResource(object): def open(self): """Returns a opened resource reader object for the instance URL.""" + if self._fid is not None: + self._fid.seek(0) + 
return self._fid + if self._url is None: raise XMLSchemaValueError("can't open, the resource has no URL associated.") try: @@ -494,7 +489,7 @@ class XMLResource(object): Loads the XML text from the data source. If the data source is an Element the source XML text can't be retrieved. """ - if self._url is None: + if self._url is None and self._fid is None: return # Created from Element or text source --> already loaded resource = self.open() @@ -503,16 +498,25 @@ class XMLResource(object): except (OSError, IOError) as err: raise XMLSchemaOSError("cannot load data from %r: %s" % (self._url, err)) finally: - resource.close() + # We don't want to close the file obj if it wasn't originally + # opened by `XMLResource`. That is the concern of the code + # where the file obj came from. + if self._fid is None: + resource.close() - try: - self._text = data.decode('utf-8') if PY3 else data.encode('utf-8') - except UnicodeDecodeError: - if PY3: - self._text = data.decode('iso-8859-1') - else: - with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f: - self._text = f.read().encode('iso-8859-1') + if isinstance(data, bytes): + try: + text = data.decode('utf-8') if PY3 else data.encode('utf-8') + except UnicodeDecodeError: + if PY3: + text = data.decode('iso-8859-1') + else: + with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f: + text = f.read().encode('iso-8859-1') + else: + text = data + + self._text = text def is_lazy(self): """Returns `True` if the XML resource is lazy.""" @@ -528,6 +532,9 @@ class XMLResource(object): for elem in self._root.iter(tag): yield elem return + elif self._fid is not None: + self._fid.seek(0) + resource = self._fid elif self._url is not None: resource = urlopen(self._url, timeout=self.timeout) else: @@ -539,7 +546,8 @@ class XMLResource(object): yield elem elem.clear() finally: - resource.close() + if self._fid is None: + resource.close() def iterfind(self, path=None, namespaces=None): """XML 
resource tree iterfind selector.""" @@ -550,6 +558,9 @@ class XMLResource(object): for e in iter_select(self._root, path, namespaces, strict=False): yield e return + elif self._fid is not None: + self._fid.seek(0) + resource = self._fid elif self._url is not None: resource = urlopen(self._url, timeout=self.timeout) else: @@ -587,7 +598,8 @@ class XMLResource(object): elif level == 0: elem.clear() finally: - resource.close() + if self._fid is None: + resource.close() def iter_location_hints(self): """Yields schema location hints from the XML tree.""" @@ -639,7 +651,7 @@ class XMLResource(object): local_root = self.root.tag[0] != '{' nsmap = {} - if self._url is not None: + if self._url is not None or self._fid is not None: resource = self.open() try: for event, node in self.iterparse(resource, events=('start-ns', 'end')): @@ -650,7 +662,11 @@ class XMLResource(object): except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError): pass finally: - resource.close() + # We don't want to close the file obj if it wasn't + # originally opened by `XMLResource`. That is the concern + # of the code where the file obj came from. + if self._fid is None: + resource.close() elif isinstance(self._text, string_base_type): try: for event, node in self.iterparse(StringIO(self._text), events=('start-ns', 'end')): diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 4022c08..b259c5c 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -15,7 +15,6 @@ This module runs tests concerning resources. 
import unittest import os import platform -import zipfile try: from pathlib import PureWindowsPath, PurePath @@ -27,7 +26,7 @@ from xmlschema import ( load_xml_resource, XMLResource, XMLSchemaURLError ) from xmlschema.tests import XMLSchemaTestCase, SKIP_REMOTE_TESTS -from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO, BytesIO +from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO from xmlschema.etree import ElementTree, PyElementTree, lxml_etree, is_etree_element, etree_element, py_etree_element @@ -196,17 +195,22 @@ class TestResources(XMLSchemaTestCase): resource.load() self.assertIsNone(resource.text) + @unittest.skipIf( + platform.python_version_tuple()[0] < '3', + "Skip: urlopen on Python 2 can't seek 'file://' paths." + ) def test_xml_resource_from_resource(self): xml_file = urlopen('file://{}'.format(add_leading_slash(self.vh_xml_file))) try: resource = XMLResource(xml_file) self.assertEqual(resource.source, xml_file) self.assertEqual(resource.root.tag, '{http://example.com/vehicles}vehicles') - self.check_url(resource.url, self.vh_xml_file) + self.assertIsNone(resource.url) self.assertIsNone(resource.document) self.assertIsNone(resource.text) resource.load() self.assertTrue(resource.text.startswith(' Date: Tue, 15 Oct 2019 21:40:40 +0200 Subject: [PATCH 03/34] Add LOCATION_HINTS dict to namespaces module - Use LOCATION_HINTS for schema's fallback locations --- doc/conf.py | 2 +- publiccode.yml | 4 +- setup.py | 2 +- xmlschema/__init__.py | 2 +- xmlschema/namespaces.py | 24 ++++++++++++ .../tests/validators/test_schema_class.py | 5 ++- xmlschema/validators/schema.py | 39 +++++++------------ 7 files changed, 45 insertions(+), 33 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index c0eed6c..90a672d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -62,7 +62,7 @@ author = 'Davide Brunato' # The short X.Y version. version = '1.0' # The full version, including alpha/beta/rc tags. 
-release = '1.0.15' +release = '1.0.16' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/publiccode.yml b/publiccode.yml index 6ecb1e0..4770e55 100644 --- a/publiccode.yml +++ b/publiccode.yml @@ -6,8 +6,8 @@ publiccodeYmlVersion: '0.2' name: xmlschema url: 'https://github.com/sissaschool/xmlschema' landingURL: 'https://github.com/sissaschool/xmlschema' -releaseDate: '2019-10-13' -softwareVersion: v1.0.15 +releaseDate: '2019-10-XX' +softwareVersion: v1.0.16 developmentStatus: stable platforms: - linux diff --git a/setup.py b/setup.py index 603a73e..2e5472c 100755 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ class InstallCommand(install): setup( name='xmlschema', - version='1.0.15', + version='1.0.16', install_requires=['elementpath~=1.3.0'], packages=['xmlschema'], include_package_data=True, diff --git a/xmlschema/__init__.py b/xmlschema/__init__.py index e9adc67..d800a17 100644 --- a/xmlschema/__init__.py +++ b/xmlschema/__init__.py @@ -30,7 +30,7 @@ from .validators import ( XsdGlobals, XMLSchemaBase, XMLSchema, XMLSchema10, XMLSchema11 ) -__version__ = '1.0.15' +__version__ = '1.0.16' __author__ = "Davide Brunato" __contact__ = "brunato@sissa.it" __copyright__ = "Copyright 2016-2019, SISSA" diff --git a/xmlschema/namespaces.py b/xmlschema/namespaces.py index beff6c6..44cd453 100644 --- a/xmlschema/namespaces.py +++ b/xmlschema/namespaces.py @@ -12,10 +12,13 @@ This module contains namespace definitions for W3C core standards and namespace related classes. 
""" from __future__ import unicode_literals +import os import re from .compat import MutableMapping, Mapping +### +# Namespace URIs XSD_NAMESPACE = 'http://www.w3.org/2001/XMLSchema' "URI of the XML Schema Definition namespace (xs|xsd)" @@ -42,6 +45,27 @@ VC_NAMESPACE = 'http://www.w3.org/2007/XMLSchema-versioning' "URI of the XML Schema Versioning namespace (vc)" +### +# Schema location hints + +SCHEMAS_DIR = os.path.join(os.path.dirname(__file__), 'validators/schemas/') + +LOCATION_HINTS = { + # Locally saved schemas + HFP_NAMESPACE: os.path.join(SCHEMAS_DIR, 'XMLSchema-hasFacetAndProperty_minimal.xsd'), + VC_NAMESPACE: os.path.join(SCHEMAS_DIR, 'XMLSchema-versioning_minimal.xsd'), + XLINK_NAMESPACE: os.path.join(SCHEMAS_DIR, 'xlink.xsd'), + XHTML_NAMESPACE: os.path.join(SCHEMAS_DIR, 'xhtml1-strict.xsd'), + + # Remote locations: contributors can propose additional official locations + # for other namespaces for extending this list. + XSLT_NAMESPACE: os.path.join(SCHEMAS_DIR, 'http://www.w3.org/2007/schema-for-xslt20.xsd'), +} + + +### +# Helper functions and classes + NAMESPACE_PATTERN = re.compile(r'{([^}]*)}') diff --git a/xmlschema/tests/validators/test_schema_class.py b/xmlschema/tests/validators/test_schema_class.py index fb94fe1..45be457 100644 --- a/xmlschema/tests/validators/test_schema_class.py +++ b/xmlschema/tests/validators/test_schema_class.py @@ -13,9 +13,11 @@ from __future__ import print_function, unicode_literals import unittest import platform import warnings +import os from xmlschema import XMLSchemaParseError, XMLSchemaIncludeWarning, XMLSchemaImportWarning from xmlschema.etree import etree_element +from xmlschema.namespaces import SCHEMAS_DIR from xmlschema.qnames import XSD_ELEMENT, XSI_TYPE from xmlschema.tests import SKIP_REMOTE_TESTS, XsdValidatorTestCase from xmlschema.validators import XMLSchema11 @@ -113,8 +115,7 @@ class TestXMLSchema10(XsdValidatorTestCase): """, XMLSchemaParseError) def test_base_schemas(self): - from 
xmlschema.validators.schema import XML_SCHEMA_FILE - self.schema_class(XML_SCHEMA_FILE) + self.schema_class(os.path.join(SCHEMAS_DIR, 'xml_minimal.xsd')) def test_root_elements(self): # Test issue #107 fix diff --git a/xmlschema/validators/schema.py b/xmlschema/validators/schema.py index 321809f..1277d26 100644 --- a/xmlschema/validators/schema.py +++ b/xmlschema/validators/schema.py @@ -32,8 +32,8 @@ from ..qnames import VC_MIN_VERSION, VC_MAX_VERSION, VC_TYPE_AVAILABLE, \ XSD_ALL, XSD_ANY, XSD_ANY_ATTRIBUTE, XSD_INCLUDE, XSD_IMPORT, XSD_REDEFINE, \ XSD_OVERRIDE, XSD_DEFAULT_OPEN_CONTENT from ..helpers import get_xsd_derivation_attribute, get_xsd_form_attribute -from ..namespaces import XSD_NAMESPACE, XML_NAMESPACE, XSI_NAMESPACE, XHTML_NAMESPACE, \ - XLINK_NAMESPACE, VC_NAMESPACE, NamespaceResourcesMap, NamespaceView +from ..namespaces import XSD_NAMESPACE, XML_NAMESPACE, XSI_NAMESPACE, VC_NAMESPACE, \ + SCHEMAS_DIR, LOCATION_HINTS, NamespaceResourcesMap, NamespaceView from ..etree import etree_element, etree_tostring, prune_etree, ParseError from ..resources import is_remote_url, url_path_is_file, fetch_resource, XMLResource from ..converters import XMLSchemaConverter @@ -75,14 +75,6 @@ ANY_ELEMENT = etree_element( 'maxOccurs': 'unbounded' }) -# XSD schemas of W3C standards -SCHEMAS_DIR = os.path.join(os.path.dirname(__file__), 'schemas/') -XML_SCHEMA_FILE = os.path.join(SCHEMAS_DIR, 'xml_minimal.xsd') -XSI_SCHEMA_FILE = os.path.join(SCHEMAS_DIR, 'XMLSchema-instance_minimal.xsd') -XLINK_SCHEMA_FILE = os.path.join(SCHEMAS_DIR, 'xlink.xsd') -XHTML_SCHEMA_FILE = os.path.join(SCHEMAS_DIR, 'xhtml1-strict.xsd') -VC_SCHEMA_FILE = os.path.join(SCHEMAS_DIR, 'XMLSchema-versioning_minimal.xsd') - class XMLSchemaMeta(ABCMeta): @@ -1024,14 +1016,15 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): warnings.warn(self.warnings[-1], XMLSchemaImportWarning, stacklevel=3) self.imports[namespace] = None - def import_schema(self, namespace, location, 
base_url=None, force=False): + def import_schema(self, namespace, location, base_url=None, force=False, build=False): """ Imports a schema for an external namespace, from a specific URL. :param namespace: is the URI of the external namespace. :param location: is the URL of the schema. :param base_url: is an optional base URL for fetching the schema resource. - :param force: is set to `True` imports the schema also if the namespace is already imported. + :param force: if set to `True` imports the schema also if the namespace is already imported. + :param build: defines when to build the imported schema, the default is to not build. :return: the imported :class:`XMLSchema` instance. """ if not force: @@ -1058,7 +1051,7 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): base_url=self.base_url, defuse=self.defuse, timeout=self.timeout, - build=False, + build=build, ) if schema.target_namespace != namespace: raise XMLSchemaValueError('imported schema %r has an unmatched namespace %r' % (location, namespace)) @@ -1487,13 +1480,10 @@ class XMLSchema10(XMLSchemaBase): } meta_schema = os.path.join(SCHEMAS_DIR, 'XSD_1.0/XMLSchema.xsd') BASE_SCHEMAS = { - XML_NAMESPACE: XML_SCHEMA_FILE, - XSI_NAMESPACE: XSI_SCHEMA_FILE, - } - FALLBACK_LOCATIONS = { - XLINK_NAMESPACE: XLINK_SCHEMA_FILE, - XHTML_NAMESPACE: XHTML_SCHEMA_FILE, + XML_NAMESPACE: os.path.join(SCHEMAS_DIR, 'xml_minimal.xsd'), + XSI_NAMESPACE: os.path.join(SCHEMAS_DIR, 'XMLSchema-instance_minimal.xsd'), } + FALLBACK_LOCATIONS = LOCATION_HINTS # ++++ UNDER DEVELOPMENT, DO NOT USE!!! 
++++ @@ -1550,15 +1540,12 @@ class XMLSchema11(XMLSchemaBase): } meta_schema = os.path.join(SCHEMAS_DIR, 'XSD_1.1/XMLSchema.xsd') BASE_SCHEMAS = { + XML_NAMESPACE: os.path.join(SCHEMAS_DIR, 'xml_minimal.xsd'), + XSI_NAMESPACE: os.path.join(SCHEMAS_DIR, 'XMLSchema-instance_minimal.xsd'), XSD_NAMESPACE: os.path.join(SCHEMAS_DIR, 'XSD_1.1/xsd11-extra.xsd'), - XML_NAMESPACE: XML_SCHEMA_FILE, - XSI_NAMESPACE: XSI_SCHEMA_FILE, - VC_NAMESPACE: VC_SCHEMA_FILE, - } - FALLBACK_LOCATIONS = { - XLINK_NAMESPACE: XLINK_SCHEMA_FILE, - XHTML_NAMESPACE: XHTML_SCHEMA_FILE, + VC_NAMESPACE: os.path.join(SCHEMAS_DIR, 'XMLSchema-versioning_minimal.xsd'), } + FALLBACK_LOCATIONS = LOCATION_HINTS def _parse_inclusions(self): super(XMLSchema11, self)._parse_inclusions() From 248b9f9b6875733a73349cf61412683ec29e3fd8 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 16 Oct 2019 15:16:19 +0200 Subject: [PATCH 04/34] Update XSD validation with wildcards - Add load_namespace() to XsdGlobals . Modify iter_decode/iter_encode for xs:any and xs:anyAttribute --- xmlschema/validators/globals_.py | 59 ++++++++++++- xmlschema/validators/wildcards.py | 132 +++++++++++++++++------------- 2 files changed, 132 insertions(+), 59 deletions(-) diff --git a/xmlschema/validators/globals_.py b/xmlschema/validators/globals_.py index d610324..c9716d7 100644 --- a/xmlschema/validators/globals_.py +++ b/xmlschema/validators/globals_.py @@ -17,7 +17,7 @@ from collections import Counter from ..compat import string_base_type from ..exceptions import XMLSchemaKeyError, XMLSchemaTypeError, XMLSchemaValueError, XMLSchemaWarning -from ..namespaces import XSD_NAMESPACE, NamespaceResourcesMap +from ..namespaces import XSD_NAMESPACE, LOCATION_HINTS, NamespaceResourcesMap from ..qnames import XSD_REDEFINE, XSD_OVERRIDE, XSD_NOTATION, XSD_ANY_TYPE, \ XSD_SIMPLE_TYPE, XSD_COMPLEX_TYPE, XSD_GROUP, XSD_ATTRIBUTE, XSD_ATTRIBUTE_GROUP, \ XSD_ELEMENT, XSI_TYPE, get_qname, local_name, qname_to_extended @@ -203,6 +203,7 @@ class 
XsdGlobals(XsdValidator): self.validator = validator self.namespaces = NamespaceResourcesMap() # Registered schemas by namespace URI + self.missing_locations = [] # Missing or failing resource locations self.types = {} # Global types (both complex and simple) self.attributes = {} # Global attributes @@ -384,6 +385,61 @@ class XsdGlobals(XsdValidator): elif not any(schema.url == obj.url and schema.__class__ == obj.__class__ for obj in ns_schemas): ns_schemas.append(schema) + def load_namespace(self, namespace, build=True): + """ + Load namespace from available location hints. Returns `True` if the namespace + is already loaded or if the namespace can be loaded from one of the locations, + returns `False` otherwise. Failing locations are inserted into the missing + locations list. + + :param namespace: the namespace to load. + :param build: if left with `True` value builds the maps after load. If the \ + build fails the resource URL is added to missing locations. + """ + namespace = namespace.strip() + if namespace in self.namespaces: + return True + elif self.validator.meta_schema is None: + return False # Do not load additional namespaces for meta-schema (XHTML) + + # Try from schemas location hints: usually the namespaces related to these + # hints are already loaded during schema construction, but it's better to + # retry once if the initial load has failed. + for schema in self.iter_schemas(): + for url in schema.get_locations(namespace): + if url in self.missing_locations: + continue + + try: + if schema.import_schema(namespace, url, schema.base_url) is not None: + if build: + self.build() + except (OSError, IOError): + pass + except XMLSchemaNotBuiltError: + self.clear(remove_schemas=True, only_unbuilt=True) + self.missing_locations.append(url) + else: + return True + + # Try from library location hint, if there is any. 
+ if namespace in LOCATION_HINTS: + url = LOCATION_HINTS[namespace] + if url not in self.missing_locations: + try: + if self.validator.import_schema(namespace, url) is not None: + if build: + self.build() + except (OSError, IOError): + return False + except XMLSchemaNotBuiltError: + self.clear(remove_schemas=True, only_unbuilt=True) + self.missing_locations.append(url) + else: + return True + + return False + def clear(self, remove_schemas=False, only_unbuilt=False): """ Clears the instance maps and schemas. @@ -415,6 +471,7 @@ class XsdGlobals(XsdValidator): self.namespaces = namespaces else: + self.missing_locations.clear() for global_map in self.global_maps: global_map.clear() self.substitution_groups.clear() diff --git a/xmlschema/validators/wildcards.py b/xmlschema/validators/wildcards.py index ade601b..849c22c 100644 --- a/xmlschema/validators/wildcards.py +++ b/xmlschema/validators/wildcards.py @@ -13,13 +13,12 @@ This module contains classes for XML Schema wildcards. """ from __future__ import unicode_literals +from ..compat import unicode_type from ..exceptions import XMLSchemaValueError from ..namespaces import XSI_NAMESPACE from ..qnames import XSD_ANY, XSD_ANY_ATTRIBUTE, XSD_OPEN_CONTENT, \ XSD_DEFAULT_OPEN_CONTENT, get_namespace from ..xpath import XMLSchemaProxy, ElementPathMixin - -from .exceptions import XMLSchemaNotBuiltError from .xsdbase import ValidationMixin, XsdComponent, ParticleMixin @@ -129,25 +128,6 @@ class XsdWildcard(XsdComponent, ValidationMixin): self.not_qname = names - def _load_namespace(self, namespace): - if namespace in self.schema.maps.namespaces: - return - - for url in self.schema.get_locations(namespace): - try: - schema = self.schema.import_schema(namespace, url, base_url=self.schema.base_url) - if schema is not None: - try: - schema.maps.build() - except XMLSchemaNotBuiltError: - # Namespace build fails: remove unbuilt schemas and the url hint - schema.maps.clear(remove_schemas=True, only_unbuilt=True) - 
self.schema.locations[namespace].remove(url) - else: - break - except (OSError, IOError): - pass - @property def built(self): return True @@ -444,48 +424,65 @@ class XsdAnyElement(XsdWildcard, ParticleMixin, ElementPathMixin): return iter(()) def iter_decode(self, elem, validation='lax', **kwargs): - if self.is_matching(elem.tag): - if self.process_contents == 'skip': - return + if not self.is_matching(elem.tag): + if validation != 'skip': + reason = "element %r not allowed here." % elem.tag + yield self.validation_error(validation, reason, elem, **kwargs) - self._load_namespace(get_namespace(elem.tag)) + elif self.process_contents == 'skip': + return + + elif self.maps.load_namespace(get_namespace(elem.tag)): try: xsd_element = self.maps.lookup_element(elem.tag) except LookupError: - if kwargs.get('drop_results'): - # Validation-only mode: use anyType for decode a complex element. + if validation == 'skip': yield self.any_type.decode(elem) if len(elem) > 0 else elem.text - elif self.process_contents == 'strict' and validation != 'skip': + elif self.process_contents == 'strict': reason = "element %r not found." % elem.tag yield self.validation_error(validation, reason, elem, **kwargs) else: for result in xsd_element.iter_decode(elem, validation, **kwargs): yield result - elif validation != 'skip': - reason = "element %r not allowed here." % elem.tag + + elif validation == 'skip': + yield self.any_type.decode(elem) if len(elem) > 0 else elem.text + + elif self.process_contents == 'strict': + reason = "unavailable namespace {!r}".format(get_namespace(elem.tag)) yield self.validation_error(validation, reason, elem, **kwargs) def iter_encode(self, obj, validation='lax', **kwargs): - if self.process_contents == 'skip': - return - name, value = obj namespace = get_namespace(name) - if self.is_namespace_allowed(namespace): - self._load_namespace(namespace) + if not self.is_namespace_allowed(namespace): + if validation != 'skip': + reason = "element %r not allowed here." 
% name + yield self.validation_error(validation, reason, value, **kwargs) + + elif self.process_contents == 'skip': + return + + elif self.maps.load_namespace(namespace): try: xsd_element = self.maps.lookup_element(name) except LookupError: - if self.process_contents == 'strict' and validation != 'skip': + if validation == 'skip': + yield self.any_type.encode(value) + elif self.process_contents == 'strict': reason = "element %r not found." % name yield self.validation_error(validation, reason, **kwargs) else: for result in xsd_element.iter_encode(value, validation, **kwargs): yield result - elif validation != 'skip': - reason = "element %r not allowed here." % name - yield self.validation_error(validation, reason, value, **kwargs) + + elif validation == 'skip': + yield self.any_type.encode(value) + + elif self.process_contents == 'strict': + reason = "unavailable namespace {!r}".format(namespace) + yield self.validation_error(validation, reason, **kwargs) def is_overlap(self, other): if not isinstance(other, XsdAnyElement): @@ -562,47 +559,66 @@ class XsdAnyAttribute(XsdWildcard): def iter_decode(self, attribute, validation='lax', **kwargs): name, value = attribute - if self.is_matching(name): - if self.process_contents == 'skip': - return - self._load_namespace(get_namespace(name)) + if not self.is_matching(name): + if validation != 'skip': + reason = "attribute %r not allowed." % name + yield self.validation_error(validation, reason, attribute, **kwargs) + + elif self.process_contents == 'skip': + return + + elif self.maps.load_namespace(get_namespace(name)): try: xsd_attribute = self.maps.lookup_attribute(name) except LookupError: - if kwargs.get('drop_results'): - # Validation-only mode: returns the value if a decoder is not found. + if validation == 'skip': yield value - elif self.process_contents == 'strict' and validation != 'skip': + elif self.process_contents == 'strict': reason = "attribute %r not found." 
% name yield self.validation_error(validation, reason, attribute, **kwargs) else: for result in xsd_attribute.iter_decode(value, validation, **kwargs): yield result - elif validation != 'skip': - reason = "attribute %r not allowed." % name - yield self.validation_error(validation, reason, attribute, **kwargs) + + elif validation == 'skip': + yield value + + elif self.process_contents == 'strict': + reason = "unavailable namespace {!r}".format(get_namespace(name)) + yield self.validation_error(validation, reason, **kwargs) def iter_encode(self, attribute, validation='lax', **kwargs): - if self.process_contents == 'skip': - return - name, value = attribute namespace = get_namespace(name) - if self.is_namespace_allowed(namespace): - self._load_namespace(namespace) + + if not self.is_namespace_allowed(namespace): + if validation != 'skip': + reason = "attribute %r not allowed." % name + yield self.validation_error(validation, reason, attribute, **kwargs) + + elif self.process_contents == 'skip': + return + + elif self.maps.load_namespace(namespace): try: xsd_attribute = self.maps.lookup_attribute(name) except LookupError: - if self.process_contents == 'strict' and validation != 'skip': + if validation == 'skip': + yield unicode_type(value) + elif self.process_contents == 'strict': reason = "attribute %r not found." % name yield self.validation_error(validation, reason, attribute, **kwargs) else: for result in xsd_attribute.iter_encode(value, validation, **kwargs): yield result - elif validation != 'skip': - reason = "attribute %r not allowed." 
% name - yield self.validation_error(validation, reason, attribute, **kwargs) + + elif validation == 'skip': + yield unicode_type(value) + + elif self.process_contents == 'strict': + reason = "unavailable namespace {!r}".format(get_namespace(name)) + yield self.validation_error(validation, reason, **kwargs) class Xsd11AnyElement(XsdAnyElement): From 54060ba0df95eb7209dbb94dcf885bb7a85c5007 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 16 Oct 2019 21:14:15 +0200 Subject: [PATCH 05/34] Modify resources.fetch_schema_locations() - Now can returns location for another namespace if hints for resource namespace are missing --- xmlschema/resources.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index 1cf37c1..35478d4 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -169,12 +169,17 @@ def fetch_schema_locations(source, locations=None, **resource_options): base_url = resource.base_url namespace = resource.namespace locations = resource.get_locations(locations) - for ns, url in filter(lambda x: x[0] == namespace, locations): + if not locations: + msg = "the XML data resource {!r} does not contain any schema location hint." + raise XMLSchemaValueError(msg.format(source)) + + for ns, url in sorted(locations, key=lambda x: x[0] != namespace): try: return fetch_resource(url, base_url, timeout), locations except XMLSchemaURLError: pass - raise XMLSchemaValueError("not found a schema for XML data resource %r (namespace=%r)." 
% (source, namespace)) + + raise XMLSchemaValueError("not found a schema for XML data resource {!r}.".format(source)) def fetch_schema(source, locations=None, **resource_options): From 257ef230c4b4c42a3374d0a5bc620c68d579e0fd Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Thu, 17 Oct 2019 10:50:24 +0200 Subject: [PATCH 06/34] Update test_resources.py --- xmlschema/tests/test_resources.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index f5dbd5d..2519be2 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -14,6 +14,7 @@ This module runs tests concerning resources. """ import unittest import os +import platform try: from pathlib import PureWindowsPath, PurePath @@ -22,9 +23,9 @@ except ImportError: from xmlschema import ( fetch_namespaces, fetch_resource, normalize_url, fetch_schema, fetch_schema_locations, - load_xml_resource, XMLResource, XMLSchemaURLError + load_xml_resource, XMLResource, XMLSchemaURLError, XMLSchema ) -from xmlschema.tests import casepath +from xmlschema.tests import SKIP_REMOTE_TESTS, casepath from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO from xmlschema.etree import ElementTree, PyElementTree, lxml_etree, \ etree_element, py_etree_element @@ -44,6 +45,7 @@ class TestResources(unittest.TestCase): @classmethod def setUpClass(cls): + cls.schema_class = XMLSchema cls.vh_dir = casepath('examples/vehicles') cls.vh_xsd_file = casepath('examples/vehicles/vehicles.xsd') cls.vh_xml_file = casepath('examples/vehicles/vehicles.xml') From 2ed02120026fe788f173680c269f904bbc88fc52 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Sat, 19 Oct 2019 00:04:28 +0200 Subject: [PATCH 07/34] Add option -n/--narrow to tests for skipping extra checks --- CHANGELOG.rst | 6 +++ xmlschema/tests/test_factory/arguments.py | 4 +- xmlschema/tests/test_factory/factory.py | 5 +- 
xmlschema/tests/test_factory/schema_tests.py | 5 +- .../tests/test_factory/validation_tests.py | 47 ++++++++++--------- xmlschema/tests/test_resources.py | 6 ++- xmlschema/validators/globals_.py | 5 +- 7 files changed, 49 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 213513e..0af63c5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,12 @@ CHANGELOG ********* +`v1.0.16`_ (2019-10-XX) +======================= +* Improved XMLResource with zip files interface and lazy +* Fix for validation with XSD wildcards and 'lax' process content +* Fix for issue #1... + `v1.0.15`_ (2019-10-13) ======================= * Improved XPath 2.0 bindings diff --git a/xmlschema/tests/test_factory/arguments.py b/xmlschema/tests/test_factory/arguments.py index 95ff4c2..49326cd 100644 --- a/xmlschema/tests/test_factory/arguments.py +++ b/xmlschema/tests/test_factory/arguments.py @@ -21,6 +21,7 @@ import re import argparse TEST_FACTORY_OPTIONS = { + 'narrow': '-n' in sys.argv or '--narrow' in sys.argv, # Skip extra checks (eg. 
other converters) 'extra_cases': '-x' in sys.argv or '--extra' in sys.argv, # Include extra test cases 'check_with_lxml': '-l' in sys.argv or '--lxml' in sys.argv, # Check with lxml.etree.XMLSchema (for XSD 1.0) } @@ -28,7 +29,8 @@ TEST_FACTORY_OPTIONS = { RUN_W3C_TEST_SUITE = '-w' in sys.argv or '--w3c' in sys.argv -sys.argv = [a for a in sys.argv if a not in {'-x', '--extra', '-l', '--lxml'}] # Clean sys.argv for unittest +sys.argv = [a for a in sys.argv if a not in + {'-x', '--extra', '-l', '--lxml', '-n', '--narrow'}] # Clean sys.argv for unittest def get_test_args(args_line): diff --git a/xmlschema/tests/test_factory/factory.py b/xmlschema/tests/test_factory/factory.py index 53e3700..08d62d4 100644 --- a/xmlschema/tests/test_factory/factory.py +++ b/xmlschema/tests/test_factory/factory.py @@ -38,6 +38,7 @@ def tests_factory(test_class_builder, suffix='xml'): test_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) testfiles = [os.path.join(test_dir, 'test_cases/testfiles')] + narrow = TEST_FACTORY_OPTIONS['narrow'] if TEST_FACTORY_OPTIONS['extra_cases']: package_dir = os.path.dirname(os.path.dirname(test_dir)) testfiles.extend(glob.glob(os.path.join(package_dir, 'test_cases/testfiles'))) @@ -94,7 +95,9 @@ def tests_factory(test_class_builder, suffix='xml'): schema_class = ObservedXMLSchema11 if test_args.inspect else XMLSchema11 check_with_lxml = False - test_class = test_class_builder(test_file, test_args, test_num, schema_class, check_with_lxml) + test_class = test_class_builder( + test_file, test_args, test_num, schema_class, narrow, check_with_lxml + ) test_classes[test_class.__name__] = test_class logger.debug("Add XSD %s test class %r.", test_args.version, test_class.__name__) diff --git a/xmlschema/tests/test_factory/schema_tests.py b/xmlschema/tests/test_factory/schema_tests.py index 6796ef3..5e3511a 100644 --- a/xmlschema/tests/test_factory/schema_tests.py +++ b/xmlschema/tests/test_factory/schema_tests.py @@ -27,7 +27,7 @@ from 
xmlschema.tests import XsdValidatorTestCase from .observers import SchemaObserver -def make_schema_test_class(test_file, test_args, test_num, schema_class, check_with_lxml): +def make_schema_test_class(test_file, test_args, test_num, schema_class, narrow, check_with_lxml): """ Creates a schema test class. @@ -35,6 +35,7 @@ def make_schema_test_class(test_file, test_args, test_num, schema_class, check_w :param test_args: line arguments for test case. :param test_num: a positive integer number associated with the test case. :param schema_class: the schema class to use. + :param narrow: skip extra checks (observed inspections). :param check_with_lxml: if `True` compare with lxml XMLSchema class, reporting anomalies. \ Works only for XSD 1.0 tests. """ @@ -69,7 +70,7 @@ def make_schema_test_class(test_file, test_args, test_num, schema_class, check_w xs = schema_class(xsd_file, locations=locations, defuse=defuse, loglevel=loglevel) self.errors.extend(xs.maps.all_errors) - if inspect: + if narrow and inspect: components_ids = set([id(c) for c in xs.maps.iter_components()]) missing = [c for c in SchemaObserver.components if id(c) not in components_ids] if any(c for c in missing): diff --git a/xmlschema/tests/test_factory/validation_tests.py b/xmlschema/tests/test_factory/validation_tests.py index dfd2d50..651281e 100644 --- a/xmlschema/tests/test_factory/validation_tests.py +++ b/xmlschema/tests/test_factory/validation_tests.py @@ -47,7 +47,7 @@ def iter_nested_items(items, dict_class=dict, list_class=list): yield items -def make_validator_test_class(test_file, test_args, test_num, schema_class, check_with_lxml): +def make_validator_test_class(test_file, test_args, test_num, schema_class, narrow, check_with_lxml): """ Creates a validator test class. @@ -55,6 +55,7 @@ def make_validator_test_class(test_file, test_args, test_num, schema_class, chec :param test_args: line arguments for test case. :param test_num: a positive integer number associated with the test case. 
:param schema_class: the schema class to use. + :param narrow: skip other converters checks. :param check_with_lxml: if `True` compare with lxml XMLSchema class, reporting anomalies. \ Works only for XSD 1.0 tests. """ @@ -239,19 +240,21 @@ def make_validator_test_class(test_file, test_args, test_num, schema_class, chec options = {'namespaces': namespaces, 'dict_class': ordered_dict_class} self.check_etree_encode(root, cdata_prefix='#', **options) # Default converter - self.check_etree_encode(root, ParkerConverter, validation='lax', **options) - self.check_etree_encode(root, ParkerConverter, validation='skip', **options) - self.check_etree_encode(root, BadgerFishConverter, **options) - self.check_etree_encode(root, AbderaConverter, **options) - self.check_etree_encode(root, JsonMLConverter, **options) + if narrow: + self.check_etree_encode(root, ParkerConverter, validation='lax', **options) + self.check_etree_encode(root, ParkerConverter, validation='skip', **options) + self.check_etree_encode(root, BadgerFishConverter, **options) + self.check_etree_encode(root, AbderaConverter, **options) + self.check_etree_encode(root, JsonMLConverter, **options) options.pop('dict_class') self.check_json_serialization(root, cdata_prefix='#', **options) - self.check_json_serialization(root, ParkerConverter, validation='lax', **options) - self.check_json_serialization(root, ParkerConverter, validation='skip', **options) - self.check_json_serialization(root, BadgerFishConverter, **options) - self.check_json_serialization(root, AbderaConverter, **options) - self.check_json_serialization(root, JsonMLConverter, **options) + if narrow: + self.check_json_serialization(root, ParkerConverter, validation='lax', **options) + self.check_json_serialization(root, ParkerConverter, validation='skip', **options) + self.check_json_serialization(root, BadgerFishConverter, **options) + self.check_json_serialization(root, AbderaConverter, **options) + self.check_json_serialization(root, 
JsonMLConverter, **options) def check_decoding_and_encoding_with_lxml(self): xml_tree = lxml_etree.parse(xml_file) @@ -280,19 +283,21 @@ def make_validator_test_class(test_file, test_args, test_num, schema_class, chec 'dict_class': ordered_dict_class, } self.check_etree_encode(root, cdata_prefix='#', **options) # Default converter - self.check_etree_encode(root, ParkerConverter, validation='lax', **options) - self.check_etree_encode(root, ParkerConverter, validation='skip', **options) - self.check_etree_encode(root, BadgerFishConverter, **options) - self.check_etree_encode(root, AbderaConverter, **options) - self.check_etree_encode(root, JsonMLConverter, **options) + if narrow: + self.check_etree_encode(root, ParkerConverter, validation='lax', **options) + self.check_etree_encode(root, ParkerConverter, validation='skip', **options) + self.check_etree_encode(root, BadgerFishConverter, **options) + self.check_etree_encode(root, AbderaConverter, **options) + self.check_etree_encode(root, JsonMLConverter, **options) options.pop('dict_class') self.check_json_serialization(root, cdata_prefix='#', **options) - self.check_json_serialization(root, ParkerConverter, validation='lax', **options) - self.check_json_serialization(root, ParkerConverter, validation='skip', **options) - self.check_json_serialization(root, BadgerFishConverter, **options) - self.check_json_serialization(root, AbderaConverter, **options) - self.check_json_serialization(root, JsonMLConverter, **options) + if narrow: + self.check_json_serialization(root, ParkerConverter, validation='lax', **options) + self.check_json_serialization(root, ParkerConverter, validation='skip', **options) + self.check_json_serialization(root, BadgerFishConverter, **options) + self.check_json_serialization(root, AbderaConverter, **options) + self.check_json_serialization(root, JsonMLConverter, **options) def check_validate_and_is_valid_api(self): if expected_errors: diff --git a/xmlschema/tests/test_resources.py 
b/xmlschema/tests/test_resources.py index f5dbd5d..2519be2 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -14,6 +14,7 @@ This module runs tests concerning resources. """ import unittest import os +import platform try: from pathlib import PureWindowsPath, PurePath @@ -22,9 +23,9 @@ except ImportError: from xmlschema import ( fetch_namespaces, fetch_resource, normalize_url, fetch_schema, fetch_schema_locations, - load_xml_resource, XMLResource, XMLSchemaURLError + load_xml_resource, XMLResource, XMLSchemaURLError, XMLSchema ) -from xmlschema.tests import casepath +from xmlschema.tests import SKIP_REMOTE_TESTS, casepath from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO from xmlschema.etree import ElementTree, PyElementTree, lxml_etree, \ etree_element, py_etree_element @@ -44,6 +45,7 @@ class TestResources(unittest.TestCase): @classmethod def setUpClass(cls): + cls.schema_class = XMLSchema cls.vh_dir = casepath('examples/vehicles') cls.vh_xsd_file = casepath('examples/vehicles/vehicles.xsd') cls.vh_xml_file = casepath('examples/vehicles/vehicles.xml') diff --git a/xmlschema/validators/globals_.py b/xmlschema/validators/globals_.py index c9716d7..c469b63 100644 --- a/xmlschema/validators/globals_.py +++ b/xmlschema/validators/globals_.py @@ -15,7 +15,7 @@ from __future__ import unicode_literals import warnings from collections import Counter -from ..compat import string_base_type +from ..compat import string_base_type, lru_cache from ..exceptions import XMLSchemaKeyError, XMLSchemaTypeError, XMLSchemaValueError, XMLSchemaWarning from ..namespaces import XSD_NAMESPACE, LOCATION_HINTS, NamespaceResourcesMap from ..qnames import XSD_REDEFINE, XSD_OVERRIDE, XSD_NOTATION, XSD_ANY_TYPE, \ @@ -385,6 +385,7 @@ class XsdGlobals(XsdValidator): elif not any(schema.url == obj.url and schema.__class__ == obj.__class__ for obj in ns_schemas): ns_schemas.append(schema) + @lru_cache(maxsize=1000) def load_namespace(self, 
namespace, build=True): """ Load namespace from available location hints. Returns `True` if the namespace @@ -471,7 +472,7 @@ class XsdGlobals(XsdValidator): self.namespaces = namespaces else: - self.missing_locations.clear() + del self.missing_locations[:] for global_map in self.global_maps: global_map.clear() self.substitution_groups.clear() From 43322b6bc0c76d425a13c1028f2c2bfea3cbda7a Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Sat, 19 Oct 2019 00:08:09 +0200 Subject: [PATCH 08/34] Refactor XmlResource after merge - Remove _document and _fid (use the attribute source instead) --- xmlschema/resources.py | 130 +++++++++++++++++++----------- xmlschema/tests/test_resources.py | 9 +-- 2 files changed, 85 insertions(+), 54 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index 4fff367..be80107 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -250,7 +250,7 @@ class XMLResource(object): if base_url is not None and not isinstance(base_url, string_base_type): raise XMLSchemaValueError(u"'base_url' argument has to be a string: {!r}".format(base_url)) - self._root = self._document = self._url = self._text = self._fid = None + self._root = self._text = self._url = None self._base_url = base_url self.defuse = defuse self.timeout = timeout @@ -279,7 +279,7 @@ class XMLResource(object): def __setattr__(self, name, value): if name == 'source': - self._root, self._document, self._text, self._url, self._fid = self._fromsource(value) + self._root, self._text, self._url = self._fromsource(value) elif name == 'defuse' and value not in DEFUSE_MODES: raise XMLSchemaValueError(u"'defuse' attribute: {!r} is not a defuse mode.".format(value)) elif name == 'timeout' and (not isinstance(value, int) or value <= 0): @@ -289,47 +289,54 @@ class XMLResource(object): super(XMLResource, self).__setattr__(name, value) def _fromsource(self, source): - url, lazy = None, self._lazy - if hasattr(source, 'tag'): + url = None + if hasattr(source, 
'tag') and hasattr(source, 'attrib'): self._lazy = False - return source, None, None, None, None # Source is already an Element --> nothing to load + return source, None, None # Source is already an Element --> nothing to load + elif isinstance(source, string_base_type): _url, self._url = self._url, None try: - if lazy: + if self._lazy: # check if source is a string containing a valid XML root for _, root in self.iterparse(StringIO(source), events=('start',)): - return root, None, source, None, None + return root, source, None else: - return self.fromstring(source), None, source, None, None + return self.fromstring(source), source, None except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError): if '\n' in source: raise finally: self._url = _url + url = normalize_url(source) if '\n' not in source else None elif isinstance(source, StringIO): _url, self._url = self._url, None try: - if lazy: + if self._lazy: for _, root in self.iterparse(source, events=('start',)): - return root, None, source.getvalue(), None, None + return root, source.getvalue(), None else: - document = self.parse(source) - return document.getroot(), document, source.getvalue(), None, None + return self.parse(source).getroot(), source.getvalue(), None finally: self._url = _url elif hasattr(source, 'read'): + try: + # Save remote urls for open new resources (non seekable) + if is_remote_url(source.url): + url = source.url + except AttributeError: + pass + _url, self._url = self._url, url try: - if lazy: + if self._lazy: for _, root in self.iterparse(source, events=('start',)): - return root, None, None, url, source + return root, None, url else: - document = self.parse(source) - return document.getroot(), document, None, url, source + return self.parse(source).getroot(), None, url finally: self._url = _url @@ -342,7 +349,7 @@ class XMLResource(object): else: if hasattr(root, 'tag'): self._lazy = False - return root, source, None, None, None + return root, None, None if url is 
None: raise XMLSchemaTypeError( @@ -353,13 +360,11 @@ class XMLResource(object): resource = urlopen(url, timeout=self.timeout) _url, self._url = self._url, url try: - if lazy: + if self._lazy: for _, root in self.iterparse(resource, events=('start',)): - return root, None, None, url, None + return root, None, url else: - document = self.parse(resource) - root = document.getroot() - return root, document, None, url, None + return self.parse(resource).getroot(), None, url finally: self._url = _url resource.close() @@ -369,14 +374,6 @@ class XMLResource(object): """The XML tree root Element.""" return self._root - @property - def document(self): - """ - The ElementTree document, `None` if the instance is lazy or is not created - from another document or from an URL. - """ - return self._document - @property def text(self): """The XML text source, `None` if it's not available.""" @@ -392,9 +389,22 @@ class XMLResource(object): """The base URL for completing relative locations.""" return os.path.dirname(self._url) if self._url else self._base_url + @property + def document(self): + """ + The resource as ElementTree XML document. It's `None` if the instance + is lazy or if it's an lxml Element. 
+ """ + if isinstance(self.source, ElementTree.ElementTree): + return self.source + elif hasattr(self.source, 'getroot') and hasattr(self.source, 'parse'): + return self.source # lxml's _ElementTree + elif not self._lazy and not hasattr(self.root, 'nsmap'): + return ElementTree.ElementTree(self.root) + @property def namespace(self): - """The namespace of the XML document.""" + """The namespace of the XML resource.""" return get_namespace(self._root.tag) if self._root is not None else None @staticmethod @@ -477,24 +487,48 @@ class XMLResource(object): return obj def open(self): - """Returns a opened resource reader object for the instance URL.""" - if self._fid is not None: - self._fid.seek(0) - return self._fid - - if self._url is None: + """ + Returns a opened resource reader object for the instance URL. If the + source attribute is a seekable file-like object rewind the source and + return it. + """ + if self.seek(0) == 0: + return self.source + elif self._url is None: raise XMLSchemaValueError("can't open, the resource has no URL associated.") try: return urlopen(self._url, timeout=self.timeout) except URLError as err: raise XMLSchemaURLError(reason="cannot access to resource %r: %s" % (self._url, err.reason)) + def seek(self, position): + if not hasattr(self.source, 'read'): + return + + try: + if not self.source.seekable(): + return + except AttributeError: + pass + else: + return self.source.seek(position) + + try: + return self.source.seek(position) + except AttributeError: + pass + + try: + return self.source.fp.seek(position) + except AttributeError: + pass + def load(self): """ Loads the XML text from the data source. If the data source is an Element the source XML text can't be retrieved. 
""" - if self._url is None and self._fid is None: + if self._url is None and not hasattr(self.source, 'read'): return # Created from Element or text source --> already loaded resource = self.open() @@ -506,7 +540,7 @@ class XMLResource(object): # We don't want to close the file obj if it wasn't originally # opened by `XMLResource`. That is the concern of the code # where the file obj came from. - if self._fid is None: + if resource is not self.source: resource.close() if isinstance(data, bytes): @@ -537,9 +571,8 @@ class XMLResource(object): for elem in self._root.iter(tag): yield elem return - elif self._fid is not None: - self._fid.seek(0) - resource = self._fid + elif self.seek(0) == 0: + resource = self.source elif self._url is not None: resource = urlopen(self._url, timeout=self.timeout) else: @@ -551,7 +584,7 @@ class XMLResource(object): yield elem elem.clear() finally: - if self._fid is None: + if resource is not self.source: resource.close() def iterfind(self, path=None, namespaces=None): @@ -563,9 +596,8 @@ class XMLResource(object): for e in iter_select(self._root, path, namespaces, strict=False): yield e return - elif self._fid is not None: - self._fid.seek(0) - resource = self._fid + elif self.seek(0) == 0: + resource = self.source elif self._url is not None: resource = urlopen(self._url, timeout=self.timeout) else: @@ -603,7 +635,7 @@ class XMLResource(object): elif level == 0: elem.clear() finally: - if self._fid is None: + if self.source is not resource: resource.close() def iter_location_hints(self): @@ -656,7 +688,7 @@ class XMLResource(object): local_root = self.root.tag[0] != '{' nsmap = {} - if self._url is not None or self._fid is not None: + if self._url is not None or hasattr(self.source, 'read'): resource = self.open() try: for event, node in self.iterparse(resource, events=('start-ns', 'end')): @@ -670,7 +702,7 @@ class XMLResource(object): # We don't want to close the file obj if it wasn't # originally opened by `XMLResource`. 
That is the concern # of the code where the file obj came from. - if self._fid is None: + if self.source is not resource: resource.close() elif isinstance(self._text, string_base_type): try: diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 2519be2..5acf4fa 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -179,7 +179,7 @@ class TestResources(unittest.TestCase): resource = XMLResource(vh_root) self.assertEqual(resource.source, vh_root) - self.assertIsNone(resource.document) + self.assertIsInstance(resource.document, ElementTree.ElementTree) self.assertEqual(resource.root.tag, '{http://example.com/vehicles}vehicles') self.assertIsNone(resource.url) self.assertIsNone(resource.text) @@ -436,12 +436,11 @@ class TestResources(unittest.TestCase): except (KeyError, AttributeError): return getattr(self.__dict__["_fid"], attr) - fake_name = "not__on____disk.xml" - with open(self.vh_xml_file) as schema_file: - resource = XMLResource(FileProxy(schema_file, fake_name)) + with open(self.vh_xml_file) as xml_file: + resource = XMLResource(FileProxy(xml_file, fake_name="not__on____disk.xml")) self.assertIsNone(resource.url) self.assertEqual(set(resource.get_namespaces().keys()), {'vh', 'xsi'}) - self.assertFalse(schema_file.closed) + self.assertFalse(xml_file.closed) if __name__ == '__main__': From 8dd5d193ba699b23bfc491bc70fe5cb3a0e0d85a Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Sat, 19 Oct 2019 19:31:43 +0200 Subject: [PATCH 09/34] Update XML resource iterfind() to fix issues #102 and #112 - Speed up admitting simple paths and checking only elements that match path level - Avoid selection for * paths (about 35% faster) - Add close() method to XmlResource --- xmlschema/resources.py | 71 ++++++++++++--- xmlschema/tests/test_resources.py | 147 ++++++++++++++++++++++++++++-- 2 files changed, 200 insertions(+), 18 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index 
be80107..6fef39c 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -11,7 +11,7 @@ import os.path import re import codecs -from elementpath import iter_select, Selector +from elementpath import iter_select, Selector, XPath1Parser from .compat import ( PY3, StringIO, BytesIO, string_base_type, urlopen, urlsplit, urljoin, urlunsplit, @@ -26,8 +26,23 @@ from .etree import ElementTree, PyElementTree, SafeXMLParser, etree_tostring DEFUSE_MODES = ('always', 'remote', 'never') +XML_RESOURCE_XPATH_SYMBOLS = { + 'position', 'last', 'not', 'and', 'or', '!=', '<=', '>=', '(', ')', 'text', + '[', ']', '.', ',', '/', '|', '*', '=', '<', '>', ':', '(end)', '(name)', + '(string)', '(float)', '(decimal)', '(integer)' +} + + +class XmlResourceXPathParser(XPath1Parser): + symbol_table = {k: v for k, v in XPath1Parser.symbol_table.items() if k in XML_RESOURCE_XPATH_SYMBOLS} + SYMBOLS = XML_RESOURCE_XPATH_SYMBOLS + + +XmlResourceXPathParser.build_tokenizer() + + def is_remote_url(url): - return url is not None and urlsplit(url).scheme not in ('', 'file') + return isinstance(url, string_base_type) and urlsplit(url).scheme not in ('', 'file') def url_path_is_directory(url): @@ -424,14 +439,23 @@ class XMLResource(object): def parse(self, source): """ - An equivalent of *ElementTree.parse()* that can protect from XML entities attacks. When - protection is applied XML data are loaded and defused before building the ElementTree instance. + An equivalent of *ElementTree.parse()* that can protect from XML entities attacks. + When protection is applied XML data are loaded and defused before building the + ElementTree instance. The protection applied is based on value of *defuse* + attribute and *base_url* property. :param source: a filename or file object containing XML data. :returns: an ElementTree instance. 
""" - if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url): - text = source.read() + if self.defuse == 'always' or self.defuse == 'remote' and \ + hasattr(source, 'read') and is_remote_url(self.base_url): + + if hasattr(source, 'read'): + text = source.read() + else: + with open(source) as f: + text = f.read() + if isinstance(text, bytes): self.defusing(BytesIO(text)) return ElementTree.parse(BytesIO(text)) @@ -445,11 +469,14 @@ class XMLResource(object): """ An equivalent of *ElementTree.iterparse()* that can protect from XML entities attacks. When protection is applied the iterator yields pure-Python Element instances. + The protection applied is based on resource *defuse* attribute and *base_url* property. :param source: a filename or file object containing XML data. :param events: a list of events to report back. If omitted, only “end” events are reported. """ - if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url): + if self.defuse == 'always' or self.defuse == 'remote' and \ + hasattr(source, 'read') and is_remote_url(self.base_url): + parser = SafeXMLParser(target=PyElementTree.TreeBuilder()) try: return PyElementTree.iterparse(source, events, parser) @@ -461,17 +488,20 @@ class XMLResource(object): def fromstring(self, text): """ An equivalent of *ElementTree.fromstring()* that can protect from XML entities attacks. + The protection applied is based on resource *defuse* attribute and *base_url* property. :param text: a string containing XML data. :returns: the root Element instance. 
""" - if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url): + if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self.base_url): self.defusing(StringIO(text)) return ElementTree.fromstring(text) def tostring(self, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False): """Generates a string representation of the XML resource.""" - return etree_tostring(self._root, self.get_namespaces(), indent, max_lines, spaces_for_tab, xml_declaration) + elem = self._root + namespaces = self.get_namespaces() + return etree_tostring(elem, namespaces, indent, max_lines, spaces_for_tab, xml_declaration) def copy(self, **kwargs): """Resource copy method. Change init parameters with keyword arguments.""" @@ -502,6 +532,10 @@ class XMLResource(object): raise XMLSchemaURLError(reason="cannot access to resource %r: %s" % (self._url, err.reason)) def seek(self, position): + """ + Change stream position if the XML resource was created with a seekable + file-like object. In the other cases this method has no effect. + """ if not hasattr(self.source, 'read'): return @@ -523,6 +557,16 @@ class XMLResource(object): except AttributeError: pass + def close(self): + """ + Close the XML resource if it's created with a file-like object. + In other cases this method has no effect. + """ + try: + self.source.close() + except (AttributeError, TypeError): + pass + def load(self): """ Loads the XML text from the data source. 
If the data source is an Element @@ -619,7 +663,11 @@ class XMLResource(object): yield elem elem.clear() else: - selector = Selector(path, namespaces, strict=False) + selector = Selector(path, namespaces, strict=False, parser=XmlResourceXPathParser) + path.replace(' ', '').replace('./', '') + path_level = path.count('/') + 1 + select_all = '*' in path and set(path).issubset({'*', '/'}) + level = 0 for event, elem in self.iterparse(resource, events=('start', 'end')): if event == "start": @@ -629,7 +677,8 @@ class XMLResource(object): level += 1 else: level -= 1 - if elem in selector.select(self._root): + if level == path_level and \ + (select_all or elem in selector.select(self._root)): yield elem elem.clear() elif level == 0: diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 5acf4fa..e668315 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -13,12 +13,14 @@ This module runs tests concerning resources. """ import unittest +import time import os import platform try: from pathlib import PureWindowsPath, PurePath except ImportError: + # noinspection PyPackageRequirements from pathlib2 import PureWindowsPath, PurePath from xmlschema import ( @@ -29,6 +31,7 @@ from xmlschema.tests import SKIP_REMOTE_TESTS, casepath from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO from xmlschema.etree import ElementTree, PyElementTree, lxml_etree, \ etree_element, py_etree_element +from xmlschema.namespaces import XSD_NAMESPACE from xmlschema.helpers import is_etree_element @@ -344,14 +347,36 @@ class TestResources(unittest.TestCase): resource.load() self.assertTrue(resource.is_loaded()) - def test_xml_resource_open(self): + def test_xml_resource_parse(self): resource = XMLResource(self.vh_xml_file) - xml_file = resource.open() - data = xml_file.read().decode('utf-8') - self.assertTrue(data.startswith('') - self.assertRaises(ValueError, resource.open) + + self.assertEqual(resource.defuse, 
'remote') + xml_document = resource.parse(self.col_xml_file) + self.assertTrue(is_etree_element(xml_document.getroot())) + + resource.defuse = 'always' + xml_document = resource.parse(self.col_xml_file) + self.assertTrue(is_etree_element(xml_document.getroot())) + + def test_xml_resource_iterparse(self): + resource = XMLResource(self.vh_xml_file) + + self.assertEqual(resource.defuse, 'remote') + for _, elem in resource.iterparse(self.col_xml_file, events=('end',)): + self.assertTrue(is_etree_element(elem)) + + resource.defuse = 'always' + for _, elem in resource.iterparse(self.col_xml_file, events=('end',)): + self.assertTrue(is_etree_element(elem)) + + def test_xml_resource_fromstring(self): + resource = XMLResource(self.vh_xml_file) + + self.assertEqual(resource.defuse, 'remote') + self.assertEqual(resource.fromstring('').tag, 'root') + + resource.defuse = 'always' + self.assertEqual(resource.fromstring('').tag, 'root') def test_xml_resource_tostring(self): resource = XMLResource(self.vh_xml_file) @@ -373,6 +398,114 @@ class TestResources(unittest.TestCase): resource2 = resource.copy() self.assertEqual(resource.text, resource2.text) + def test_xml_resource_open(self): + resource = XMLResource(self.vh_xml_file) + xml_file = resource.open() + self.assertIsNot(xml_file, resource.source) + data = xml_file.read().decode('utf-8') + self.assertTrue(data.startswith('') + self.assertRaises(ValueError, resource.open) + + resource = XMLResource(source=open(self.vh_xml_file)) + xml_file = resource.open() + self.assertIs(xml_file, resource.source) + xml_file.close() + + def test_xml_resource_seek(self): + resource = XMLResource(self.vh_xml_file) + self.assertIsNone(resource.seek(0)) + self.assertIsNone(resource.seek(1)) + xml_file = open(self.vh_xml_file) + resource = XMLResource(source=xml_file) + self.assertEqual(resource.seek(0), 0) + self.assertEqual(resource.seek(1), 1) + xml_file.close() + + def test_xml_resource_close(self): + resource = XMLResource(self.vh_xml_file) + 
resource.close() + xml_file = resource.open() + self.assertTrue(callable(xml_file.read)) + + xml_file = open(self.vh_xml_file) + resource = XMLResource(source=xml_file) + resource.close() + with self.assertRaises(ValueError): + resource.open() + + def test_xml_resource_iter(self): + for lazy in (False, True): + resource = XMLResource(self.schema_class.meta_schema.source.url, lazy=lazy) + k = 0 + for k, _ in enumerate(resource.iter()): + pass + self.assertEqual(k, 1389) + + k = 0 + for k, _ in enumerate(resource.iter('{%s}complexType' % XSD_NAMESPACE)): + pass + self.assertEqual(k, 55) + + def test_xml_resource_iterfind(self): + resource = XMLResource(self.schema_class.meta_schema.source.url, lazy=False) + self.assertFalse(resource.is_lazy()) + + start_time = time.time() + for _ in range(10): + for _ in resource.iterfind(): + pass + t1 = time.time() - start_time + + start_time = time.time() + for _ in range(10): + for _ in resource.iterfind(path='.'): + pass + t2 = time.time() - start_time + self.assertLessEqual(t1, t2 / 30.0) + self.assertGreaterEqual(t1, t2 / 100.0) + + start_time = time.time() + counter = 0 + for _ in resource.iterfind(path='*'): + counter += 1 + t3 = time.time() - start_time + self.assertGreaterEqual(t2, t3 / counter * 10) + + resource = XMLResource(self.schema_class.meta_schema.source.url) + self.assertTrue(resource.is_lazy()) + + start_time = time.time() + for _ in range(10): + for _ in resource.iterfind(): + pass + tl1 = time.time() - start_time + self.assertLessEqual(t1, tl1 / 1000.0) + self.assertGreaterEqual(t1, tl1 / 10000.0) + + start_time = time.time() + for _ in range(10): + for _ in resource.iterfind(path='.'): + pass + tl2 = time.time() - start_time + + self.assertLessEqual(t2, tl2 / 80.0) + self.assertGreaterEqual(t2, tl2 / 1000.0) + + start_time = time.time() + counter3 = 0 + for _ in resource.iterfind(path='*'): + counter3 += 1 + tl3 = time.time() - start_time + self.assertGreaterEqual(tl2, tl3 / counter3 * 10) + + start_time = 
time.time() + for _ in resource.iterfind(path='. /. / xs:complexType', namespaces={'xs': XSD_NAMESPACE}): + pass + tl4 = time.time() - start_time + self.assertTrue(0.7 < (tl3 / tl4) < 1) + def test_xml_resource_get_namespaces(self): with open(self.vh_xml_file) as schema_file: resource = XMLResource(schema_file) From 8db83477dfaa03c352abef859051e4976dd4b2d4 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Tue, 22 Oct 2019 14:28:46 +0200 Subject: [PATCH 10/34] Extend check_memory.py script - Add an argument to repeat test N times - Add matplotlib to dev requirements --- requirements-dev.txt | 1 + xmlschema/tests/check_memory.py | 73 ++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 83dfcbd..7ae0c91 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,6 +5,7 @@ coverage elementpath~=1.3.0 lxml memory_profiler +matplotlib pathlib2 # For Py27 tests on resources Sphinx sphinx_rtd_theme diff --git a/xmlschema/tests/check_memory.py b/xmlschema/tests/check_memory.py index 4a0c936..606c0dd 100755 --- a/xmlschema/tests/check_memory.py +++ b/xmlschema/tests/check_memory.py @@ -28,7 +28,7 @@ def test_choice_type(value): parser = argparse.ArgumentParser(add_help=True) -parser.usage = """%(prog)s TEST_NUM [XML_FILE] +parser.usage = """%(prog)s TEST_NUM [XML_FILE [REPEAT]] Run memory tests: 1) Package import or schema build @@ -44,6 +44,7 @@ Run memory tests: parser.add_argument('test_num', metavar="TEST_NUM", type=test_choice_type, help="Test number to run") parser.add_argument('xml_file', metavar='XML_FILE', nargs='?', help='Input XML file') +parser.add_argument('repeat', metavar='REPEAT', nargs='?', type=int, default=1, help='Repeat operation N times') args = parser.parse_args() @@ -68,54 +69,62 @@ def build_schema(source): @profile -def etree_parse(source): +def etree_parse(source, repeat=1): xt = ElementTree.parse(source) - for _ in xt.iter(): - pass - - 
-@profile -def etree_full_iterparse(source): - context = ElementTree.iterparse(source, events=('start', 'end')) - for event, elem in context: - if event == 'start': + for _ in range(repeat): + for _ in xt.iter(): pass @profile -def etree_emptied_iterparse(source): - context = ElementTree.iterparse(source, events=('start', 'end')) - for event, elem in context: - if event == 'end': - elem.clear() +def etree_full_iterparse(source, repeat=1): + for _ in range(repeat): + context = ElementTree.iterparse(source, events=('start', 'end')) + for event, elem in context: + if event == 'start': + pass @profile -def decode(source): +def etree_emptied_iterparse(source, repeat=1): + for _ in range(repeat): + context = ElementTree.iterparse(source, events=('start', 'end')) + for event, elem in context: + if event == 'end': + elem.clear() + + +@profile +def decode(source, repeat=1): decoder = xmlschema.XMLSchema.meta_schema if source.endswith('.xsd') else xmlschema - return decoder.to_dict(source) + for _ in range(repeat): + decoder.to_dict(source) @profile -def lazy_decode(source): +def lazy_decode(source, repeat=1): decoder = xmlschema.XMLSchema.meta_schema if source.endswith('.xsd') else xmlschema - for result in decoder.to_dict(xmlschema.XMLResource(source, lazy=True), path='*'): - del result + for _ in range(repeat): + for result in decoder.to_dict(xmlschema.XMLResource(source, lazy=True), path='*'): + del result @profile -def validate(source): +def validate(source, repeat=1): validator = xmlschema.XMLSchema.meta_schema if source.endswith('.xsd') else xmlschema - return validator.validate(source) + for _ in range(repeat): + validator.validate(source) @profile -def lazy_validate(source): +def lazy_validate(source, repeat=1): if source.endswith('.xsd'): validator, path = xmlschema.XMLSchema.meta_schema, '*' else: validator, path = xmlschema, None - return validator.validate(xmlschema.XMLResource(source, lazy=True), path=path) + + for _ in range(repeat): + 
validator.validate(xmlschema.XMLResource(source, lazy=True), path=path) if __name__ == '__main__': @@ -127,26 +136,26 @@ if __name__ == '__main__': build_schema(args.xml_file) elif args.test_num == 2: import xml.etree.ElementTree as ElementTree - etree_parse(args.xml_file) + etree_parse(args.xml_file, args.repeat) elif args.test_num == 3: import xml.etree.ElementTree as ElementTree - etree_full_iterparse(args.xml_file) + etree_full_iterparse(args.xml_file, args.repeat) elif args.test_num == 4: import xml.etree.ElementTree as ElementTree - etree_emptied_iterparse(args.xml_file) + etree_emptied_iterparse(args.xml_file, args.repeat) elif args.test_num == 5: import xmlschema xmlschema.XMLSchema.meta_schema.build() - decode(args.xml_file) + decode(args.xml_file, args.repeat) elif args.test_num == 6: import xmlschema xmlschema.XMLSchema.meta_schema.build() - lazy_decode(args.xml_file) + lazy_decode(args.xml_file, args.repeat) elif args.test_num == 7: import xmlschema xmlschema.XMLSchema.meta_schema.build() - validate(args.xml_file) + validate(args.xml_file, args.repeat) elif args.test_num == 8: import xmlschema xmlschema.XMLSchema.meta_schema.build() - lazy_validate(args.xml_file) + lazy_validate(args.xml_file, args.repeat) From c075ff22e546c02c9c80e7c0215814e2ec380edf Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Tue, 22 Oct 2019 18:37:26 +0200 Subject: [PATCH 11/34] Complete the revision of resource module - normalize_url() now processes file names containing '#' chars - Fix iterfind() of lazy resource - Add more tests for XML resources --- xmlschema/resources.py | 28 +++-- .../test_cases/resources/dummy file #2.txt | 1 + xmlschema/tests/test_resources.py | 113 +++++++++--------- xmlschema/validators/schemas/puppet.xsd | 32 ----- 4 files changed, 71 insertions(+), 103 deletions(-) create mode 100644 xmlschema/tests/test_cases/resources/dummy file #2.txt delete mode 100644 xmlschema/validators/schemas/puppet.xsd diff --git a/xmlschema/resources.py 
b/xmlschema/resources.py index 6fef39c..94832fb 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -66,14 +66,21 @@ def normalize_url(url, base_url=None, keep_relative=False): conformant to URL format specification. :return: A normalized URL. """ - def add_trailing_slash(r): - return urlunsplit((r[0], r[1], r[2] + '/' if r[2] and r[2][-1] != '/' else r[2], r[3], r[4])) + def add_trailing_slash(x): + return urlunsplit((x[0], x[1], x[2] + '/' if x[2] and x[2][-1] != '/' else x[2], x[3], x[4])) + + def filter_url(x): + x = x.strip().replace('\\', '/') + while x.startswith('//'): + x = x.replace('//', '/', 1) + if not urlsplit(x).scheme: + x = x.replace('#', '%23') + return x + + url = filter_url(url) if base_url is not None: - base_url = base_url.replace('\\', '/') - while base_url.startswith('//'): - base_url = base_url.replace('//', '/', 1) - + base_url = filter_url(base_url) base_url_parts = urlsplit(base_url) base_url = add_trailing_slash(base_url_parts) if base_url_parts.scheme not in uses_relative: @@ -102,10 +109,6 @@ def normalize_url(url, base_url=None, keep_relative=False): if base_url_parts.netloc and not url.startswith(base_url_parts.netloc) and url.startswith('//'): url = 'file:' + url - url = url.replace('\\', '/') - while url.startswith('//'): - url = url.replace('//', '/', 1) - url_parts = urlsplit(url, scheme='file') if url_parts.scheme not in uses_relative: return 'file:///{}'.format(url_parts.geturl()) # Eg. k:/Python/lib/.... 
@@ -622,6 +625,7 @@ class XMLResource(object): else: resource = StringIO(self._text) + # Note: lazy iteration change the order (top level element is the last) try: for event, elem in self.iterparse(resource, events=('end',)): if tag is None or elem.tag == tag: @@ -664,8 +668,8 @@ class XMLResource(object): elem.clear() else: selector = Selector(path, namespaces, strict=False, parser=XmlResourceXPathParser) - path.replace(' ', '').replace('./', '') - path_level = path.count('/') + 1 + path = path.replace(' ', '').replace('./', '') + path_level = path.count('/') + 1 if path != '.' else 0 select_all = '*' in path and set(path).issubset({'*', '/'}) level = 0 diff --git a/xmlschema/tests/test_cases/resources/dummy file #2.txt b/xmlschema/tests/test_cases/resources/dummy file #2.txt new file mode 100644 index 0000000..a9e6024 --- /dev/null +++ b/xmlschema/tests/test_cases/resources/dummy file #2.txt @@ -0,0 +1 @@ +DUMMY CONTENT \ No newline at end of file diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index e668315..38f94a4 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -13,7 +13,6 @@ This module runs tests concerning resources. 
""" import unittest -import time import os import platform @@ -123,12 +122,25 @@ class TestResources(unittest.TestCase): self.assertEqual(normalize_url('dir2/schema.xsd', '//root/dir1'), 'file:///root/dir1/dir2/schema.xsd') self.assertEqual(normalize_url('dir2/schema.xsd', '////root/dir1'), 'file:///root/dir1/dir2/schema.xsd') + self.check_url(normalize_url('issue #000.xml', 'file://host/home/'), + 'file://host/home/issue %23000.xml') + self.check_url(normalize_url('data.xml', 'file://host/home/issue 000'), + 'file://host/home/issue 000/data.xml') + self.check_url(normalize_url('data.xml', '/host/home/issue #000'), + '/host/home/issue %23000/data.xml') + def test_fetch_resource(self): wrong_path = casepath('resources/dummy_file.txt') self.assertRaises(XMLSchemaURLError, fetch_resource, wrong_path) right_path = casepath('resources/dummy file.txt') self.assertTrue(fetch_resource(right_path).endswith('dummy file.txt')) + ambiguous_path = casepath('resources/dummy file #2.txt') + self.assertTrue(fetch_resource(ambiguous_path).endswith('dummy file %232.txt')) + + with urlopen(fetch_resource(ambiguous_path)) as res: + self.assertEqual(res.read(), b'DUMMY CONTENT') + def test_fetch_namespaces(self): self.assertFalse(fetch_namespaces(casepath('resources/malformed.xml'))) @@ -436,75 +448,58 @@ class TestResources(unittest.TestCase): resource.open() def test_xml_resource_iter(self): - for lazy in (False, True): - resource = XMLResource(self.schema_class.meta_schema.source.url, lazy=lazy) - k = 0 - for k, _ in enumerate(resource.iter()): - pass - self.assertEqual(k, 1389) - - k = 0 - for k, _ in enumerate(resource.iter('{%s}complexType' % XSD_NAMESPACE)): - pass - self.assertEqual(k, 55) - - def test_xml_resource_iterfind(self): resource = XMLResource(self.schema_class.meta_schema.source.url, lazy=False) self.assertFalse(resource.is_lazy()) + lazy_resource = XMLResource(self.schema_class.meta_schema.source.url) + self.assertTrue(lazy_resource.is_lazy()) - start_time = 
time.time() - for _ in range(10): - for _ in resource.iterfind(): - pass - t1 = time.time() - start_time + tags = [x.tag for x in resource.iter()] + self.assertEqual(len(tags), 1390) + self.assertEqual(tags[0], '{%s}schema' % XSD_NAMESPACE) - start_time = time.time() - for _ in range(10): - for _ in resource.iterfind(path='.'): - pass - t2 = time.time() - start_time - self.assertLessEqual(t1, t2 / 30.0) - self.assertGreaterEqual(t1, t2 / 100.0) + lazy_tags = [x.tag for x in lazy_resource.iter()] + self.assertEqual(len(lazy_tags), 1390) + self.assertEqual(lazy_tags[-1], '{%s}schema' % XSD_NAMESPACE) + self.assertNotEqual(tags, lazy_tags) - start_time = time.time() - counter = 0 - for _ in resource.iterfind(path='*'): - counter += 1 - t3 = time.time() - start_time - self.assertGreaterEqual(t2, t3 / counter * 10) + tags = [x.tag for x in resource.iter('{%s}complexType' % XSD_NAMESPACE)] + self.assertEqual(len(tags), 56) + self.assertEqual(tags[0], '{%s}complexType' % XSD_NAMESPACE) + self.assertListEqual(tags, [x.tag for x in lazy_resource.iter('{%s}complexType' % XSD_NAMESPACE)]) - resource = XMLResource(self.schema_class.meta_schema.source.url) - self.assertTrue(resource.is_lazy()) + def test_xml_resource_iterfind(self): + namespaces = {'xs': XSD_NAMESPACE} + resource = XMLResource(self.schema_class.meta_schema.source.url, lazy=False) + self.assertFalse(resource.is_lazy()) + lazy_resource = XMLResource(self.schema_class.meta_schema.source.url) + self.assertTrue(lazy_resource.is_lazy()) - start_time = time.time() - for _ in range(10): - for _ in resource.iterfind(): - pass - tl1 = time.time() - start_time - self.assertLessEqual(t1, tl1 / 1000.0) - self.assertGreaterEqual(t1, tl1 / 10000.0) + # Note: Element change with lazy resource so compare only tags - start_time = time.time() - for _ in range(10): - for _ in resource.iterfind(path='.'): - pass - tl2 = time.time() - start_time + tags = [x.tag for x in resource.iterfind()] + self.assertEqual(len(tags), 1) + 
self.assertEqual(tags[0], '{%s}schema' % XSD_NAMESPACE) + self.assertListEqual(tags, [x.tag for x in lazy_resource.iterfind()]) - self.assertLessEqual(t2, tl2 / 80.0) - self.assertGreaterEqual(t2, tl2 / 1000.0) + tags = [x.tag for x in resource.iterfind(path='.')] + self.assertEqual(len(tags), 1) + self.assertEqual(tags[0], '{%s}schema' % XSD_NAMESPACE) + self.assertListEqual(tags, [x.tag for x in lazy_resource.iterfind(path='.')]) - start_time = time.time() - counter3 = 0 - for _ in resource.iterfind(path='*'): - counter3 += 1 - tl3 = time.time() - start_time - self.assertGreaterEqual(tl2, tl3 / counter3 * 10) + tags = [x.tag for x in resource.iterfind(path='*')] + self.assertEqual(len(tags), 156) + self.assertEqual(tags[0], '{%s}annotation' % XSD_NAMESPACE) + self.assertListEqual(tags, [x.tag for x in lazy_resource.iterfind(path='*')]) - start_time = time.time() - for _ in resource.iterfind(path='. /. / xs:complexType', namespaces={'xs': XSD_NAMESPACE}): - pass - tl4 = time.time() - start_time - self.assertTrue(0.7 < (tl3 / tl4) < 1) + tags = [x.tag for x in resource.iterfind('xs:complexType', namespaces)] + self.assertEqual(len(tags), 35) + self.assertTrue(all(t == '{%s}complexType' % XSD_NAMESPACE for t in tags)) + self.assertListEqual(tags, [x.tag for x in lazy_resource.iterfind('xs:complexType', namespaces)]) + + tags = [x.tag for x in resource.iterfind('. /. / xs:complexType', namespaces)] + self.assertEqual(len(tags), 35) + self.assertTrue(all(t == '{%s}complexType' % XSD_NAMESPACE for t in tags)) + self.assertListEqual(tags, [x.tag for x in lazy_resource.iterfind('. /. 
/ xs:complexType', namespaces)]) def test_xml_resource_get_namespaces(self): with open(self.vh_xml_file) as schema_file: diff --git a/xmlschema/validators/schemas/puppet.xsd b/xmlschema/validators/schemas/puppet.xsd deleted file mode 100644 index 4434ff4..0000000 --- a/xmlschema/validators/schemas/puppet.xsd +++ /dev/null @@ -1,32 +0,0 @@ - - - - - A schema with puppet types for creating substitute elements. - - - - - - - - - - - - - - - - - - - - - - - - - < - - \ No newline at end of file From 6942be8ac90cce426151372c7d36bcae369e2a7e Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 23 Oct 2019 09:47:49 +0200 Subject: [PATCH 12/34] Optimize qname_to_prefixed() and get_namespace() helpers - use_empty optional argument added to qname_to_prefixed() --- xmlschema/namespaces.py | 3 +++ xmlschema/qnames.py | 32 +++++++++++++++++------------- xmlschema/tests/test_helpers.py | 18 ++++++++++++++++- xmlschema/validators/exceptions.py | 5 +++-- 4 files changed, 41 insertions(+), 17 deletions(-) diff --git a/xmlschema/namespaces.py b/xmlschema/namespaces.py index 44cd453..67f8e4b 100644 --- a/xmlschema/namespaces.py +++ b/xmlschema/namespaces.py @@ -70,6 +70,9 @@ NAMESPACE_PATTERN = re.compile(r'{([^}]*)}') def get_namespace(name): + if not name or name[0] != '{': + return '' + try: return NAMESPACE_PATTERN.match(name).group(1) except (AttributeError, TypeError): diff --git a/xmlschema/qnames.py b/xmlschema/qnames.py index eb4f27d..0f80411 100644 --- a/xmlschema/qnames.py +++ b/xmlschema/qnames.py @@ -224,34 +224,38 @@ def local_name(qname): return qname -def qname_to_prefixed(qname, namespaces): +def qname_to_prefixed(qname, namespaces, use_empty=True): """ - Transforms a fully qualified name into a prefixed name using a namespace map. - Returns the *qname* argument if it's not a fully qualified name or if it has - boolean value `False`. + Maps a QName in extended format to a QName in prefixed format. + Do not change local names and QNames in prefixed format. 
- :param qname: an extended QName or a local name. + :param qname: a QName or a local name. :param namespaces: a map from prefixes to namespace URIs. + :param use_empty: if `True` use the empty prefix for mapping. :return: a QName in prefixed format or a local name. """ - if not qname: + if not qname or qname[0] != '{': return qname namespace = get_namespace(qname) - for prefix, uri in sorted(filter(lambda x: x[1] == namespace, namespaces.items()), reverse=True): - if not uri: - return '%s:%s' % (prefix, qname) if prefix else qname - elif prefix: - return qname.replace('{%s}' % uri, '%s:' % prefix) - else: - return qname.replace('{%s}' % uri, '') + prefixes = [x for x in namespaces if namespaces[x] == namespace] + + if not prefixes: + return qname + elif prefixes[0]: + return '%s:%s' % (prefixes[0], qname.split('}', 1)[1]) + elif len(prefixes) > 1: + return '%s:%s' % (prefixes[1], qname.split('}', 1)[1]) + elif use_empty: + return qname.split('}', 1)[1] else: return qname def qname_to_extended(qname, namespaces): """ - Converts a QName in prefixed format or a local name to the extended QName format. + Maps a QName in prefixed format or a local name to the extended QName format. + Local names are mapped if *namespaces* has a not empty default namespace. :param qname: a QName in prefixed format or a local name. :param namespaces: a map from prefixes to namespace URIs. 
diff --git a/xmlschema/tests/test_helpers.py b/xmlschema/tests/test_helpers.py index be195ef..5a9c894 100644 --- a/xmlschema/tests/test_helpers.py +++ b/xmlschema/tests/test_helpers.py @@ -40,6 +40,9 @@ class TestHelpers(unittest.TestCase): self.assertEqual(get_namespace(XSD_SIMPLE_TYPE), XSD_NAMESPACE) self.assertEqual(get_namespace(''), '') self.assertEqual(get_namespace(None), '') + self.assertEqual(get_namespace('{}name'), '') + self.assertEqual(get_namespace('{ }name'), ' ') + self.assertEqual(get_namespace('{ ns }name'), ' ns ') def test_get_qname_functions(self): self.assertEqual(get_qname(XSD_NAMESPACE, 'element'), XSD_ELEMENT) @@ -81,8 +84,21 @@ class TestHelpers(unittest.TestCase): self.assertEqual(qname_to_prefixed('', {}), '') self.assertEqual(qname_to_prefixed('type', {'': XSI_NAMESPACE}), 'type') - self.assertEqual(qname_to_prefixed('type', {'ns': ''}), 'ns:type') self.assertEqual(qname_to_prefixed('type', {'': ''}), 'type') + self.assertEqual(qname_to_prefixed('{}type', {'': ''}), 'type') + self.assertEqual(qname_to_prefixed('{}type', {'': ''}, use_empty=False), '{}type') + + # Attention! in XML the empty namespace (that means no namespace) can be + # associated only with empty prefix, so these cases should never happen. 
+ self.assertEqual(qname_to_prefixed('{}type', {'p': ''}), 'p:type') + self.assertEqual(qname_to_prefixed('type', {'p': ''}), 'type') + + self.assertEqual(qname_to_prefixed('{ns}type', {'': 'ns'}, use_empty=True), 'type') + self.assertEqual(qname_to_prefixed('{ns}type', {'': 'ns'}, use_empty=False), '{ns}type') + self.assertEqual(qname_to_prefixed('{ns}type', {'': 'ns', 'p': 'ns'}, use_empty=True), 'p:type') + self.assertEqual(qname_to_prefixed('{ns}type', {'': 'ns', 'p': 'ns'}, use_empty=False), 'p:type') + self.assertEqual(qname_to_prefixed('{ns}type', {'': 'ns', 'p': 'ns0'}, use_empty=True), 'type') + self.assertEqual(qname_to_prefixed('{ns}type', {'': 'ns', 'p': 'ns0'}, use_empty=False), '{ns}type') def test_get_xsd_annotation(self): elem = etree_element(XSD_SCHEMA) diff --git a/xmlschema/validators/exceptions.py b/xmlschema/validators/exceptions.py index 3ed988f..4ff969a 100644 --- a/xmlschema/validators/exceptions.py +++ b/xmlschema/validators/exceptions.py @@ -15,6 +15,7 @@ from __future__ import unicode_literals from ..compat import PY3, string_base_type from ..exceptions import XMLSchemaException, XMLSchemaWarning, XMLSchemaValueError +from ..namespaces import get_namespace from ..qnames import qname_to_prefixed from ..etree import etree_tostring, etree_getpath from ..helpers import is_etree_element @@ -317,11 +318,11 @@ class XMLSchemaChildrenValidationError(XMLSchemaValidationError): self.occurs = occurs self.expected = expected - tag = qname_to_prefixed(elem.tag, validator.namespaces) + tag = qname_to_prefixed(elem.tag, validator.namespaces, use_empty=False) if index >= len(elem): reason = "The content of element %r is not complete." % tag else: - child_tag = qname_to_prefixed(elem[index].tag, validator.namespaces) + child_tag = qname_to_prefixed(elem[index].tag, validator.namespaces, use_empty=False) reason = "Unexpected child with tag %r at position %d." 
% (child_tag, index + 1) if occurs and particle.is_missing(occurs): From a374d1580573b9b220334c079a5ed796276c11a1 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Thu, 24 Oct 2019 06:37:31 +0200 Subject: [PATCH 13/34] Fix resource tests for Python 2 --- doc/usage.rst | 15 ++++++++++++--- xmlschema/resources.py | 8 ++++++-- xmlschema/tests/test_resources.py | 7 +++++-- xmlschema/tests/validation/test_validation.py | 9 ++++++++- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/doc/usage.rst b/doc/usage.rst index fda3cde..bb22bff 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -40,7 +40,7 @@ Otherwise the argument can be also an opened file-like object: .. doctest:: >>> import xmlschema - >>> schema_file = open('xmlschema/tests/test_cases/examples/vehicles/vehicles.xsd') + >>> schema_file = open('xmlschema/tests/test_cases/examples/collection/collection.xsd') >>> schema = xmlschema.XMLSchema(schema_file) Alternatively you can pass a string containing the schema definition: @@ -54,8 +54,8 @@ Alternatively you can pass a string containing the schema definition: ... ... """) -this option might not works when the schema includes other local subschemas, because the package -cannot knows anything about the schema's source location: +Strings and file-like objects might not work when the schema includes other local subschemas, +because the package cannot knows anything about the schema's source location: .. doctest:: @@ -73,6 +73,15 @@ cannot knows anything about the schema's source location: Path: /xs:schema/xs:element/xs:complexType/xs:sequence/xs:element +In these cases you can provide an appropriate *base_url* optional argument to define the +reference directory path for other includes and imports: + +.. 
doctest:: + + >>> import xmlschema + >>> schema_file = open('xmlschema/tests/test_cases/examples/vehicles/vehicles.xsd') + >>> schema = xmlschema.XMLSchema(schema_file, base_url='xmlschema/tests/test_cases/examples/vehicles/') + XSD declarations ---------------- diff --git a/xmlschema/resources.py b/xmlschema/resources.py index 94832fb..55ad1ab 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -551,14 +551,18 @@ class XMLResource(object): return self.source.seek(position) try: - return self.source.seek(position) + value = self.source.seek(position) except AttributeError: pass + else: + return value if PY3 else position try: - return self.source.fp.seek(position) + value = self.source.fp.seek(position) except AttributeError: pass + else: + return value if PY3 else position def close(self): """ diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 38f94a4..447ddad 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -138,8 +138,11 @@ class TestResources(unittest.TestCase): ambiguous_path = casepath('resources/dummy file #2.txt') self.assertTrue(fetch_resource(ambiguous_path).endswith('dummy file %232.txt')) - with urlopen(fetch_resource(ambiguous_path)) as res: + res = urlopen(fetch_resource(ambiguous_path)) + try: self.assertEqual(res.read(), b'DUMMY CONTENT') + finally: + res.close() def test_fetch_namespaces(self): self.assertFalse(fetch_namespaces(casepath('resources/malformed.xml'))) @@ -570,7 +573,7 @@ class TestResources(unittest.TestCase): self.assertEqual(set(resource.get_namespaces().keys()), {'vh', 'xsi'}) self.assertFalse(xml_file.closed) - + if __name__ == '__main__': from xmlschema.tests import print_test_header diff --git a/xmlschema/tests/validation/test_validation.py b/xmlschema/tests/validation/test_validation.py index 3ba4ba4..1e4a10b 100644 --- a/xmlschema/tests/validation/test_validation.py +++ b/xmlschema/tests/validation/test_validation.py @@ -10,6 +10,7 
@@ # @author Davide Brunato # import unittest +import sys import xmlschema from xmlschema import XMLSchemaValidationError @@ -55,7 +56,13 @@ class TestValidation(XsdValidatorTestCase): path_line = str(err).splitlines()[-1] else: path_line = '' - self.assertEqual('Path: /vhx:vehicles/vhx:cars', path_line) + + if sys.version_info >= (3, 6): + self.assertEqual('Path: /vhx:vehicles/vhx:cars', path_line) + else: + self.assertTrue( + 'Path: /vh:vehicles/vh:cars' == path_line or 'Path: /vhx:vehicles/vhx:cars', path_line + ) # Due to unordered dicts # Issue #80 vh_2_xt = ElementTree.parse(vh_2_file) From df6eb235167a70ac58943f51ea51967a52191867 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Thu, 24 Oct 2019 22:13:06 +0200 Subject: [PATCH 14/34] Add XML data depth limits - Add module xmlschema.limits for store processing limits - Add max_depth optional argument to decode methods - Code cleaning for iter_decode() kwargs (elements and groups) --- doc/usage.rst | 79 ++++++++++++------- xmlschema/__init__.py | 1 + xmlschema/limits.py | 21 +++++ xmlschema/tests/validation/test_validation.py | 24 +++++- .../tests/validators/test_schema_class.py | 6 +- xmlschema/validators/elements.py | 62 +++++++++------ xmlschema/validators/exceptions.py | 1 - xmlschema/validators/groups.py | 56 +++++++------ xmlschema/validators/models.py | 23 +++--- xmlschema/validators/schema.py | 38 +++------ xmlschema/validators/xsdbase.py | 22 ++++++ 11 files changed, 209 insertions(+), 124 deletions(-) create mode 100644 xmlschema/limits.py diff --git a/doc/usage.rst b/doc/usage.rst index bb22bff..6087211 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -526,35 +526,6 @@ For example you can build a schema using a *strict* mode and then decode XML dat using the *validation* argument setted to 'lax'. 
-XML entity-based attacks protection ------------------------------------ - -The XML data resource loading is protected using the `SafeXMLParser` class, a subclass of -the pure Python version of XMLParser that forbids the use of entities. -The protection is applied both to XSD schemas and to XML data. The usage of this feature is -regulated by the XMLSchema's argument *defuse*. -For default this argument has value *'remote'* that means the protection on XML data is -applied only to data loaded from remote. Other values for this argument can be *'always'* -and *'never'*. - - -Limit on model groups checking ------------------------------- - -From release v1.0.11 the model groups of the schemas are checked against restriction violations -and *Unique Particle Attribution* violations. - -To avoids XSD model recursion attacks a limit of ``MAX_MODEL_DEPTH = 15`` is set. If this limit -is exceeded an ``XMLSchemaModelDepthError`` is raised, the error is caught and a warning is generated. -If you need to set an higher limit for checking all your groups you can import the library and change -the value in the specific module that processes the model checks: - -.. doctest:: - - >>> import xmlschema - >>> xmlschema.validators.models.MAX_MODEL_DEPTH = 20 - - Lazy validation --------------- @@ -570,3 +541,53 @@ From release v1.0.14 XSD 1.1 support has been added to the library through the c :class:`XMLSchema11`. You have to use this class for XSD 1.1 schemas instead the default class :class:`XMLSchema` that is still linked to XSD 1.0 validator :class:`XMLSchema10`. From next minor release (v1.1) the default class will become :class:`XMLSchema11`. + + +XML entity-based attacks protection +................................... + +The XML data resource loading is protected using the `SafeXMLParser` class, a subclass of +the pure Python version of XMLParser that forbids the use of entities. +The protection is applied both to XSD schemas and to XML data. 
The usage of this feature is +regulated by the XMLSchema's argument *defuse*. +For default this argument has value *'remote'* that means the protection on XML data is +applied only to data loaded from remote. Other values for this argument can be *'always'* +and *'never'*. + +Processing limits +----------------- + +From release v1.0.16 a module has been added in order to group constants that define +processing limits, generally to protect against attacks prepared to exhaust system +resources. These limits usually don't need to be changed, but this possibility has +been left at the module level for situations where a different setting is needed. + +Limit on XSD model groups checking +.................................. + +Model groups of the schemas are checked against restriction violations and *Unique Particle +Attribution* violations. To avoids XSD model recursion attacks a depth limit of 15 levels +is set. If this limit is exceeded an ``XMLSchemaModelDepthError`` is raised, the error is +caught and a warning is generated. If you need to set an higher limit for checking all your +groups you can import the library and change the value of ``MAX_MODEL_DEPTH`` in the limits +module: + +.. doctest:: + + >>> import xmlschema + >>> xmlschema.limits.MAX_MODEL_DEPTH = 20 + + +Limit on XML data depth +....................... + +A limit of 9999 on maximum depth is set for XML validation/decoding/encoding to avoid +attacks based on extremely deep XML data. To increase or decrease this limit change the +value of ``MAX_XML_DEPTH`` in the module *limits* after the import of the package: + +.. doctest:: + + >>> import xmlschema + >>> xmlschema.limits.MAX_XML_DEPTH = 1000 + + diff --git a/xmlschema/__init__.py b/xmlschema/__init__.py index d800a17..cfcf02e 100644 --- a/xmlschema/__init__.py +++ b/xmlschema/__init__.py @@ -8,6 +8,7 @@ # # @author Davide Brunato # +from . 
import limits from .exceptions import XMLSchemaException, XMLSchemaRegexError, XMLSchemaURLError, \ XMLSchemaNamespaceError from .etree import etree_tostring diff --git a/xmlschema/limits.py b/xmlschema/limits.py new file mode 100644 index 0000000..9ef9489 --- /dev/null +++ b/xmlschema/limits.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c), 2016-2019, SISSA (International School for Advanced Studies). +# All rights reserved. +# This file is distributed under the terms of the MIT License. +# See the file 'LICENSE' in the root directory of the present +# distribution, or http://opensource.org/licenses/MIT. +# +# @author Davide Brunato +# +"""Package protection limits. Values can be changed after import to set different limits.""" + +MAX_XML_DEPTH = 9999 +""" +Maximum depth of XML data. An `XMLSchemaValidationError` is raised if this limit is exceeded. +""" + +MAX_MODEL_DEPTH = 15 +""" +Maximum XSD model group depth. An `XMLSchemaModelDepthError` is raised if this limit is exceeded. 
+""" diff --git a/xmlschema/tests/validation/test_validation.py b/xmlschema/tests/validation/test_validation.py index 1e4a10b..083bbd1 100644 --- a/xmlschema/tests/validation/test_validation.py +++ b/xmlschema/tests/validation/test_validation.py @@ -77,13 +77,33 @@ class TestValidation(XsdValidatorTestCase): self.assertRaises(XMLSchemaValidationError, xsd_element.decode, source.root, namespaces=namespaces) - # Testing adding 'no_depth' argument for result in xsd_element.iter_decode(source.root, 'strict', namespaces=namespaces, - source=source, no_depth=True): + source=source, max_depth=1): del result self.assertIsNone(xmlschema.validate(self.col_xml_file, lazy=True)) + def test_max_depth_argument(self): + schema = self.schema_class(self.col_xsd_file) + self.assertEqual( + schema.decode(self.col_xml_file, max_depth=1), + {'@xmlns:col': 'http://example.com/ns/collection', + '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', + '@xsi:schemaLocation': 'http://example.com/ns/collection collection.xsd'}) + + xmlschema.limits.MAX_XML_DEPTH = 1 + with self.assertRaises(XMLSchemaValidationError): + self.assertEqual(schema.decode(self.col_xml_file)) + xmlschema.limits.MAX_XML_DEPTH = 9999 + + self.assertEqual( + schema.decode(self.col_xml_file, max_depth=2), + {'@xmlns:col': 'http://example.com/ns/collection', + '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', + '@xsi:schemaLocation': 'http://example.com/ns/collection collection.xsd', + 'object': [{'@id': 'b0836217462', '@available': True}, + {'@id': 'b0836217463', '@available': True}]}) + class TestValidation11(TestValidation): schema_class = XMLSchema11 diff --git a/xmlschema/tests/validators/test_schema_class.py b/xmlschema/tests/validators/test_schema_class.py index 45be457..1253a47 100644 --- a/xmlschema/tests/validators/test_schema_class.py +++ b/xmlschema/tests/validators/test_schema_class.py @@ -142,10 +142,12 @@ class TestXMLSchema10(XsdValidatorTestCase): "Remote networks are not accessible or 
avoid SSL verification error on Windows.") def test_remote_schemas_loading(self): col_schema = self.schema_class("https://raw.githubusercontent.com/brunato/xmlschema/master/" - "xmlschema/tests/test_cases/examples/collection/collection.xsd") + "xmlschema/tests/test_cases/examples/collection/collection.xsd", + timeout=300) self.assertTrue(isinstance(col_schema, self.schema_class)) vh_schema = self.schema_class("https://raw.githubusercontent.com/brunato/xmlschema/master/" - "xmlschema/tests/test_cases/examples/vehicles/vehicles.xsd") + "xmlschema/tests/test_cases/examples/vehicles/vehicles.xsd", + timeout=300) self.assertTrue(isinstance(vh_schema, self.schema_class)) def test_schema_defuse(self): diff --git a/xmlschema/validators/elements.py b/xmlschema/validators/elements.py index a5fdc3f..2b7fe2e 100644 --- a/xmlschema/validators/elements.py +++ b/xmlschema/validators/elements.py @@ -458,14 +458,12 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) text = self.fixed if self.fixed is not None else self.default return self.type.text_decode(text) - def iter_decode(self, elem, validation='lax', converter=None, level=0, **kwargs): + def iter_decode(self, elem, validation='lax', **kwargs): """ Creates an iterator for decoding an Element instance. :param elem: the Element that has to be decoded. :param validation: the validation mode, can be 'lax', 'strict' or 'skip. - :param converter: an :class:`XMLSchemaConverter` subclass or instance to use for the decoding. - :param level: the depth of the element in the tree structure. :param kwargs: keyword arguments for the decoding process. :return: yields a decoded object, eventually preceded by a sequence of \ validation or decoding errors. 
@@ -473,8 +471,19 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) if self.abstract: yield self.validation_error(validation, "cannot use an abstract element for validation", elem, **kwargs) - if not isinstance(converter, XMLSchemaConverter): - converter = self.schema.get_converter(converter, level=level, **kwargs) + try: + level = kwargs['level'] + except KeyError: + level = 0 + + try: + converter = kwargs['converter'] + except KeyError: + converter = kwargs['converter'] = self.get_converter(**kwargs) + else: + if not isinstance(converter, XMLSchemaConverter): + converter = kwargs['converter'] = self.get_converter(**kwargs) + inherited = kwargs.get('inherited') value = content = attributes = None @@ -492,7 +501,7 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) # Decode attributes attribute_group = self.get_attributes(xsd_type) - for result in attribute_group.iter_decode(elem.attrib, validation, level=level, **kwargs): + for result in attribute_group.iter_decode(elem.attrib, validation, **kwargs): if isinstance(result, XMLSchemaValidationError): yield self.validation_error(validation, result, elem, **kwargs) else: @@ -529,8 +538,7 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) for error in assertion(elem, **kwargs): yield self.validation_error(validation, error, **kwargs) - for result in xsd_type.content_type.iter_decode( - elem, validation, converter, level + 1, **kwargs): + for result in xsd_type.content_type.iter_decode(elem, validation, **kwargs): if isinstance(result, XMLSchemaValidationError): yield self.validation_error(validation, result, elem, **kwargs) else: @@ -601,29 +609,40 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) del content if validation != 'skip': - for constraint in self.identities.values(): - if isinstance(constraint, XsdKeyref) and '_no_deep' in kwargs: # TODO: Complete lazy validation - continue - 
for error in constraint(elem, converter): - yield self.validation_error(validation, error, elem, **kwargs) + if 'max_depth' in kwargs: + # Don't check key references with lazy or shallow validation + for constraint in filter(lambda x: not isinstance(x, XsdKeyref), self.identities.values()): + for error in constraint(elem, converter): + yield self.validation_error(validation, error, elem, **kwargs) + else: + for constraint in self.identities.values(): + for error in constraint(elem, converter): + yield self.validation_error(validation, error, elem, **kwargs) - def iter_encode(self, obj, validation='lax', converter=None, level=0, **kwargs): + def iter_encode(self, obj, validation='lax', **kwargs): """ Creates an iterator for encoding data to an Element. :param obj: the data that has to be encoded. :param validation: the validation mode: can be 'lax', 'strict' or 'skip'. - :param converter: an :class:`XMLSchemaConverter` subclass or instance to use \ - for the encoding. - :param level: the depth of the element data in the tree structure. :param kwargs: keyword arguments for the encoding process. :return: yields an Element, eventually preceded by a sequence of \ validation or encoding errors. 
""" - if not isinstance(converter, XMLSchemaConverter): - converter = self.schema.get_converter(converter, level=level, **kwargs) - element_data = converter.element_encode(obj, self, level) + try: + converter = kwargs['converter'] + except KeyError: + converter = kwargs['converter'] = self.get_converter(**kwargs) + else: + if not isinstance(converter, XMLSchemaConverter): + converter = kwargs['converter'] = self.get_converter(**kwargs) + try: + level = kwargs['level'] + except KeyError: + level = 0 + + element_data = converter.element_encode(obj, self, level) errors = [] tag = element_data.tag text = None @@ -683,8 +702,7 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) else: text = result else: - for result in xsd_type.content_type.iter_encode( - element_data, validation, converter, level + 1, **kwargs): + for result in xsd_type.content_type.iter_encode(element_data, validation, **kwargs): if isinstance(result, XMLSchemaValidationError): errors.append(result) elif result: diff --git a/xmlschema/validators/exceptions.py b/xmlschema/validators/exceptions.py index 4ff969a..d47d60a 100644 --- a/xmlschema/validators/exceptions.py +++ b/xmlschema/validators/exceptions.py @@ -15,7 +15,6 @@ from __future__ import unicode_literals from ..compat import PY3, string_base_type from ..exceptions import XMLSchemaException, XMLSchemaWarning, XMLSchemaValueError -from ..namespaces import get_namespace from ..qnames import qname_to_prefixed from ..etree import etree_tostring, etree_getpath from ..helpers import is_etree_element diff --git a/xmlschema/validators/groups.py b/xmlschema/validators/groups.py index e5345b1..e248c0c 100644 --- a/xmlschema/validators/groups.py +++ b/xmlschema/validators/groups.py @@ -14,6 +14,7 @@ This module contains classes for XML Schema model groups. from __future__ import unicode_literals import warnings +from .. 
import limits from ..compat import unicode_type from ..exceptions import XMLSchemaValueError from ..etree import etree_element @@ -555,15 +556,12 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): msg = "Maybe a not equivalent type table between elements %r and %r." % (self, xsd_element) warnings.warn(msg, XMLSchemaTypeTableWarning, stacklevel=3) - def iter_decode(self, elem, validation='lax', converter=None, level=0, **kwargs): + def iter_decode(self, elem, validation='lax', **kwargs): """ Creates an iterator for decoding an Element content. :param elem: the Element that has to be decoded. :param validation: the validation mode, can be 'lax', 'strict' or 'skip. - :param converter: an :class:`XMLSchemaConverter` subclass or instance \ - to use for the decoding. - :param level: the depth of the element in the tree structure. :param kwargs: keyword arguments for the decoding process. :return: yields a list of 3-tuples (key, decoded data, decoder), \ eventually preceded by a sequence of validation or decoding errors. 
@@ -590,16 +588,21 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): result_list.append((cdata_index, text, None)) cdata_index += 1 - model = ModelVisitor(self) - errors = [] + level = kwargs['level'] = kwargs.pop('level', 0) + 1 + if level > limits.MAX_XML_DEPTH: + reason = "XML data depth exceeded (MAX_XML_DEPTH=%r)" % limits.MAX_XML_DEPTH + self.validation_error('strict', reason, elem, **kwargs) try: - default_namespace = converter.get('') - except (AttributeError, TypeError): - converter = self.schema.get_converter(converter, level=level, **kwargs) - default_namespace = converter.get('') + converter = kwargs['converter'] + except KeyError: + converter = kwargs['converter'] = self.get_converter(**kwargs) + default_namespace = converter.get('') + model = ModelVisitor(self) + errors = [] model_broken = False + for index, child in enumerate(elem): if callable(child.tag): continue # child is a @@ -646,12 +649,13 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): xsd_element = None model_broken = True - if xsd_element is None or kwargs.get('no_depth'): - # TODO: use a default decoder str-->str?? + if 'max_depth' in kwargs and kwargs['max_depth'] <= level: + continue + elif xsd_element is None: + # TODO: apply a default decoder str-->str?? continue - for result in xsd_element.iter_decode( - child, validation, converter=converter, level=level, **kwargs): + for result in xsd_element.iter_decode(child, validation, **kwargs): if isinstance(result, XMLSchemaValidationError): yield result else: @@ -678,16 +682,12 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): yield result_list - def iter_encode(self, element_data, validation='lax', converter=None, level=0, indent=4, **kwargs): + def iter_encode(self, element_data, validation='lax', **kwargs): """ Creates an iterator for encoding data to a list containing Element data. :param element_data: an ElementData instance with unencoded data. 
:param validation: the validation mode: can be 'lax', 'strict' or 'skip'. - :param converter: an :class:`XMLSchemaConverter` subclass or instance to use \ - for the encoding. - :param level: the depth of the element data in the tree structure. - :param indent: number of spaces for XML indentation (default is 4). :param kwargs: keyword arguments for the encoding process. :return: yields a couple with the text of the Element and a list of 3-tuples \ (key, decoded data, decoder), eventually preceded by a sequence of validation \ @@ -697,19 +697,26 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): yield element_data.content return + level = kwargs['level'] = kwargs.pop('level', 0) + 1 errors = [] text = None children = [] + try: + indent = kwargs['indent'] + except KeyError: + indent = 4 + padding = '\n' + ' ' * indent * level try: - default_namespace = converter.get('') - except (AttributeError, TypeError): - converter = self.schema.get_converter(converter, level=level, **kwargs) - default_namespace = converter.get('') + converter = kwargs['converter'] + except KeyError: + converter = kwargs['converter'] = self.get_converter(**kwargs) + default_namespace = converter.get('') model = ModelVisitor(self) cdata_index = 0 + if isinstance(element_data.content, dict) or kwargs.get('unordered'): content = model.iter_unordered_content(element_data.content) elif not isinstance(element_data.content, list): @@ -766,8 +773,7 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): yield self.validation_error(validation, reason, value, **kwargs) continue - for result in xsd_element.iter_encode( - value, validation, converter=converter, level=level, indent=indent, **kwargs): + for result in xsd_element.iter_encode(value, validation, **kwargs): if isinstance(result, XMLSchemaValidationError): yield result else: diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index 7a904f4..77c237f 100644 --- a/xmlschema/validators/models.py +++ 
b/xmlschema/validators/models.py @@ -14,17 +14,13 @@ This module contains classes and functions for processing XSD content models. from __future__ import unicode_literals from collections import defaultdict, deque, Counter +from .. import limits from ..compat import PY3, MutableSequence from ..exceptions import XMLSchemaValueError from .exceptions import XMLSchemaModelError, XMLSchemaModelDepthError from .xsdbase import ParticleMixin from .wildcards import XsdAnyElement, Xsd11AnyElement -MAX_MODEL_DEPTH = 15 -"""Limit depth for safe visiting of models""" - -XSD_GROUP_MODELS = {'sequence', 'choice', 'all'} - class ModelGroup(MutableSequence, ParticleMixin): """ @@ -34,7 +30,6 @@ class ModelGroup(MutableSequence, ParticleMixin): parent = None def __init__(self, model): - assert model in XSD_GROUP_MODELS, "Not a valid value for 'model'" self._group = [] self.model = model @@ -61,7 +56,7 @@ class ModelGroup(MutableSequence, ParticleMixin): def __setattr__(self, name, value): if name == 'model' and value is not None: - if value not in XSD_GROUP_MODELS: + if value not in {'sequence', 'choice', 'all'}: raise XMLSchemaValueError("invalid model group %r." % value) if self.model is not None and value != self.model and self.model != 'all': raise XMLSchemaValueError("cannot change group model from %r to %r" % (self.model, value)) @@ -165,11 +160,11 @@ class ModelGroup(MutableSequence, ParticleMixin): """ A generator function iterating elements and groups of a model group. Skips pointless groups, iterating deeper through them. Raises `XMLSchemaModelDepthError` if the argument *depth* is - over `MAX_MODEL_DEPTH` value. + over `limits.MAX_MODEL_DEPTH` value. :param depth: guard for protect model nesting bombs, incremented at each deepest recursion. 
""" - if depth > MAX_MODEL_DEPTH: + if depth > limits.MAX_MODEL_DEPTH: raise XMLSchemaModelDepthError(self) for item in self: if not isinstance(item, ModelGroup): @@ -183,11 +178,11 @@ class ModelGroup(MutableSequence, ParticleMixin): def iter_elements(self, depth=0): """ A generator function iterating model's elements. Raises `XMLSchemaModelDepthError` if the - argument *depth* is over `MAX_MODEL_DEPTH` value. + argument *depth* is over `limits.MAX_MODEL_DEPTH` value. :param depth: guard for protect model nesting bombs, incremented at each deepest recursion. """ - if depth > MAX_MODEL_DEPTH: + if depth > limits.MAX_MODEL_DEPTH: raise XMLSchemaModelDepthError(self) for item in self: if isinstance(item, ModelGroup): @@ -203,12 +198,12 @@ class ModelGroup(MutableSequence, ParticleMixin): :raises: an `XMLSchemaModelError` at first violated constraint. """ def safe_iter_path(group, depth): - if depth > MAX_MODEL_DEPTH: + if not depth: raise XMLSchemaModelDepthError(group) for item in group: if isinstance(item, ModelGroup): current_path.append(item) - for _item in safe_iter_path(item, depth + 1): + for _item in safe_iter_path(item, depth - 1): yield _item current_path.pop() else: @@ -221,7 +216,7 @@ class ModelGroup(MutableSequence, ParticleMixin): except AttributeError: any_element = None - for e in safe_iter_path(self, 0): + for e in safe_iter_path(self, limits.MAX_MODEL_DEPTH): for pe, previous_path in paths.values(): # EDC check if not e.is_consistent(pe) or any_element and not any_element.is_consistent(pe): diff --git a/xmlschema/validators/schema.py b/xmlschema/validators/schema.py index 1277d26..685f5dd 100644 --- a/xmlschema/validators/schema.py +++ b/xmlschema/validators/schema.py @@ -822,27 +822,6 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): except KeyError: return [] - def get_converter(self, converter=None, namespaces=None, **kwargs): - """ - Returns a new converter instance. 
- - :param converter: can be a converter class or instance. If it's an instance \ - the new instance is copied from it and configured with the provided arguments. - :param namespaces: is an optional mapping from namespace prefix to URI. - :param kwargs: optional arguments for initialize the converter instance. - :return: a converter instance. - """ - if converter is None: - converter = getattr(self, 'converter', XMLSchemaConverter) - - if isinstance(converter, XMLSchemaConverter): - return converter.copy(namespaces=namespaces, **kwargs) - elif issubclass(converter, XMLSchemaConverter): - return converter(namespaces, **kwargs) - else: - msg = "'converter' argument must be a %r subclass or instance: %r" - raise XMLSchemaTypeError(msg % (XMLSchemaConverter, converter)) - def get_element(self, tag, path=None, namespaces=None): if not path: return self.find(tag, namespaces) @@ -1223,16 +1202,14 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): inherited = {} if source.is_lazy() and path is None: - # TODO: Document validation in lazy mode. - # Validation is done pushing a _no_deep argument for root node and with - # a path='*' for validating children. This is a feature under test. 
xsd_element = self.get_element(source.root.tag, schema_path) if xsd_element is None: - yield self.validation_error('lax', "%r is not an element of the schema" % source.root, source.root) + msg = "%r is not an element of the schema" + yield self.validation_error('lax', msg % source.root, source.root) for result in xsd_element.iter_decode(source.root, source=source, namespaces=namespaces, - use_defaults=use_defaults, id_map=id_map, no_depth=True, - inherited=inherited, drop_results=True): + use_defaults=use_defaults, id_map=id_map, + inherited=inherited, max_depth=1): if isinstance(result, XMLSchemaValidationError): yield result else: @@ -1249,7 +1226,7 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): for result in xsd_element.iter_decode(elem, source=source, namespaces=namespaces, use_defaults=use_defaults, id_map=id_map, - inherited=inherited, drop_results=True): + inherited=inherited): if isinstance(result, XMLSchemaValidationError): yield result else: @@ -1264,7 +1241,7 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): def iter_decode(self, source, path=None, schema_path=None, validation='lax', process_namespaces=True, namespaces=None, use_defaults=True, decimal_type=None, datetime_types=False, - converter=None, filler=None, fill_missing=False, **kwargs): + converter=None, filler=None, fill_missing=False, max_depth=None, **kwargs): """ Creates an iterator for decoding an XML source to a data structure. @@ -1292,6 +1269,7 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): an attribute declaration. If not provided undecodable data is replaced by `None`. :param fill_missing: if set to `True` the decoder fills also missing attributes. \ The filling value is `None` or a typed value if the *filler* callback is provided. + :param max_depth: maximum level of decoding. For default has no limit. :param kwargs: keyword arguments with other options for converter and decoder. 
:return: yields a decoded data object, eventually preceded by a sequence of validation \ or decoding errors. @@ -1323,6 +1301,8 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): kwargs['decimal_type'] = decimal_type if filler is not None: kwargs['filler'] = filler + if max_depth is not None: + kwargs['max_depth'] = max_depth for elem in source.iterfind(path, namespaces): xsd_element = self.get_element(elem.tag, schema_path, namespaces) diff --git a/xmlschema/validators/xsdbase.py b/xmlschema/validators/xsdbase.py index 13393ee..fe04ca0 100644 --- a/xmlschema/validators/xsdbase.py +++ b/xmlschema/validators/xsdbase.py @@ -21,6 +21,7 @@ from ..qnames import XSD_ANNOTATION, XSD_APPINFO, XSD_DOCUMENTATION, XML_LANG, \ get_qname, local_name, qname_to_prefixed from ..etree import etree_tostring from ..helpers import is_etree_element +from ..converters import XMLSchemaConverter from .exceptions import XMLSchemaParseError, XMLSchemaValidationError, \ XMLSchemaDecodeError, XMLSchemaEncodeError @@ -195,6 +196,27 @@ class XsdValidator(object): self.parse_error(msg % (value, ' | '.join(admitted_values)), elem) return '' + def get_converter(self, converter=None, namespaces=None, **kwargs): + """ + Returns a new converter instance. + + :param converter: can be a converter class or instance. If it's an instance \ + the new instance is copied from it and configured with the provided arguments. + :param namespaces: is an optional mapping from namespace prefix to URI. + :param kwargs: optional arguments for initialize the converter instance. + :return: a converter instance. 
+ """ + if converter is None: + converter = getattr(self, 'converter', XMLSchemaConverter) + + if isinstance(converter, XMLSchemaConverter): + return converter.copy(namespaces=namespaces, **kwargs) + elif issubclass(converter, XMLSchemaConverter): + return converter(namespaces, **kwargs) + else: + msg = "'converter' argument must be a %r subclass or instance: %r" + raise XMLSchemaTypeError(msg % (XMLSchemaConverter, converter)) + class XsdComponent(XsdValidator): """ From ded91458a142d3b5197789350fb010bcc0d0e91c Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Fri, 25 Oct 2019 10:18:34 +0200 Subject: [PATCH 15/34] Extend schema validation to match every defined global element - Should be a fix for issue #140 --- .../features/namespaces/import-case4-1.xml | 5 ++ .../features/namespaces/import-case4-2.xml | 7 ++ .../features/namespaces/import-case4a.xsd | 24 ++++++ .../features/namespaces/import-case4b.xsd | 24 ++++++ xmlschema/tests/test_cases/testfiles | 4 + xmlschema/validators/schema.py | 83 +++++++++++++------ 6 files changed, 123 insertions(+), 24 deletions(-) create mode 100644 xmlschema/tests/test_cases/features/namespaces/import-case4-1.xml create mode 100644 xmlschema/tests/test_cases/features/namespaces/import-case4-2.xml create mode 100644 xmlschema/tests/test_cases/features/namespaces/import-case4a.xsd create mode 100644 xmlschema/tests/test_cases/features/namespaces/import-case4b.xsd diff --git a/xmlschema/tests/test_cases/features/namespaces/import-case4-1.xml b/xmlschema/tests/test_cases/features/namespaces/import-case4-1.xml new file mode 100644 index 0000000..07e306c --- /dev/null +++ b/xmlschema/tests/test_cases/features/namespaces/import-case4-1.xml @@ -0,0 +1,5 @@ + + + diff --git a/xmlschema/tests/test_cases/features/namespaces/import-case4-2.xml b/xmlschema/tests/test_cases/features/namespaces/import-case4-2.xml new file mode 100644 index 0000000..a15a214 --- /dev/null +++ b/xmlschema/tests/test_cases/features/namespaces/import-case4-2.xml 
@@ -0,0 +1,7 @@ + + + + \ No newline at end of file diff --git a/xmlschema/tests/test_cases/features/namespaces/import-case4a.xsd b/xmlschema/tests/test_cases/features/namespaces/import-case4a.xsd new file mode 100644 index 0000000..7d87bd5 --- /dev/null +++ b/xmlschema/tests/test_cases/features/namespaces/import-case4a.xsd @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + diff --git a/xmlschema/tests/test_cases/features/namespaces/import-case4b.xsd b/xmlschema/tests/test_cases/features/namespaces/import-case4b.xsd new file mode 100644 index 0000000..4666bf5 --- /dev/null +++ b/xmlschema/tests/test_cases/features/namespaces/import-case4b.xsd @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + diff --git a/xmlschema/tests/test_cases/testfiles b/xmlschema/tests/test_cases/testfiles index 7c7d62e..5f088cb 100644 --- a/xmlschema/tests/test_cases/testfiles +++ b/xmlschema/tests/test_cases/testfiles @@ -60,6 +60,10 @@ features/namespaces/default_ns_valid2.xsd features/namespaces/import-case1.xsd --errors=1 # Unknown type features/namespaces/import-case2.xsd --errors=1 # Missing namespace import in imported chameleon schema features/namespaces/import-case3.xsd +features/namespaces/import-case4a.xsd +features/namespaces/import-case4b.xsd +features/namespaces/import-case4-1.xml # This and the next are also regression tests for issue #140 +features/namespaces/import-case4-2.xml features/namespaces/include-case1.xsd features/namespaces/include-case1bis.xsd features/namespaces/include-case2.xsd diff --git a/xmlschema/validators/schema.py b/xmlschema/validators/schema.py index 685f5dd..198f230 100644 --- a/xmlschema/validators/schema.py +++ b/xmlschema/validators/schema.py @@ -33,7 +33,7 @@ from ..qnames import VC_MIN_VERSION, VC_MAX_VERSION, VC_TYPE_AVAILABLE, \ XSD_OVERRIDE, XSD_DEFAULT_OPEN_CONTENT from ..helpers import get_xsd_derivation_attribute, get_xsd_form_attribute from ..namespaces import XSD_NAMESPACE, XML_NAMESPACE, XSI_NAMESPACE, VC_NAMESPACE, \ - SCHEMAS_DIR, 
LOCATION_HINTS, NamespaceResourcesMap, NamespaceView + SCHEMAS_DIR, LOCATION_HINTS, NamespaceResourcesMap, NamespaceView, get_namespace from ..etree import etree_element, etree_tostring, prune_etree, ParseError from ..resources import is_remote_url, url_path_is_file, fetch_resource, XMLResource from ..converters import XMLSchemaConverter @@ -1201,15 +1201,30 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): id_map = Counter() inherited = {} - if source.is_lazy() and path is None: - xsd_element = self.get_element(source.root.tag, schema_path) - if xsd_element is None: - msg = "%r is not an element of the schema" - yield self.validation_error('lax', msg % source.root, source.root) + namespace = source.namespace or namespaces.get('', '') + try: + schema = self.maps.namespaces[namespace][0] + except (KeyError, IndexError): + reason = 'the namespace {!r} is not loaded'.format(namespace) + yield self.validation_error('lax', reason, source.root, source, namespaces) + return - for result in xsd_element.iter_decode(source.root, source=source, namespaces=namespaces, - use_defaults=use_defaults, id_map=id_map, - inherited=inherited, max_depth=1): + kwargs = { + 'source': source, + 'namespaces': namespaces, + 'use_defaults': use_defaults, + 'id_map': id_map, + 'inherited': inherited + } + + if source.is_lazy() and path is None: + xsd_element = schema.get_element(source.root.tag, schema_path, namespaces) + if xsd_element is None: + reason = "{!r} is not an element of the schema".format(source.root) + yield schema.validation_error('lax', reason, source.root, source, namespaces) + return + + for result in xsd_element.iter_decode(source.root, max_depth=1, **kwargs): if isinstance(result, XMLSchemaValidationError): yield result else: @@ -1220,13 +1235,13 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): schema_path = '/%s/*' % source.root.tag for elem in source.iterfind(path, namespaces): - xsd_element = self.get_element(elem.tag, 
schema_path, self.namespaces) + xsd_element = schema.get_element(elem.tag, schema_path, namespaces) if xsd_element is None: - yield self.validation_error('lax', "%r is not an element of the schema" % elem, elem) + reason = "{!r} is not an element of the schema".format(elem) + yield schema.validation_error('lax', reason, elem, source, namespaces) + return - for result in xsd_element.iter_decode(elem, source=source, namespaces=namespaces, - use_defaults=use_defaults, id_map=id_map, - inherited=inherited): + for result in xsd_element.iter_decode(elem, **kwargs): if isinstance(result, XMLSchemaValidationError): yield result else: @@ -1269,7 +1284,7 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): an attribute declaration. If not provided undecodable data is replaced by `None`. :param fill_missing: if set to `True` the decoder fills also missing attributes. \ The filling value is `None` or a typed value if the *filler* callback is provided. - :param max_depth: maximum level of decoding. For default has no limit. + :param max_depth: maximum level of decoding, for default there is no limit. :param kwargs: keyword arguments with other options for converter and decoder. :return: yields a decoded data object, eventually preceded by a sequence of validation \ or decoding errors. 
@@ -1304,15 +1319,26 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): if max_depth is not None: kwargs['max_depth'] = max_depth + namespace = source.namespace or namespaces.get('', '') + try: + schema = self.maps.namespaces[namespace][0] + except (KeyError, IndexError): + reason = 'the namespace {!r} is not loaded'.format(namespace) + yield self.validation_error('lax', reason, source.root, source, namespaces) + return + for elem in source.iterfind(path, namespaces): - xsd_element = self.get_element(elem.tag, schema_path, namespaces) + xsd_element = schema.get_element(elem.tag, schema_path, namespaces) if xsd_element is None: - yield self.validation_error(validation, "%r is not an element of the schema" % elem, elem) + reason = "{!r} is not an element of the schema".format(elem) + yield schema.validation_error('lax', reason, elem, source, namespaces) + return for obj in xsd_element.iter_decode( - elem, validation, converter=converter, source=source, namespaces=namespaces, - use_defaults=use_defaults, datetime_types=datetime_types, - fill_missing=fill_missing, id_map=id_map, inherited=inherited, **kwargs): + elem, validation, converter=converter, source=source, + namespaces=namespaces, use_defaults=use_defaults, + datetime_types=datetime_types, fill_missing=fill_missing, + id_map=id_map, inherited=inherited, **kwargs): yield obj for k, v in id_map.items(): @@ -1374,7 +1400,16 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): namespaces = {} if namespaces is None else namespaces.copy() converter = self.get_converter(converter, namespaces, **kwargs) - if path is not None: + namespace = get_namespace(path) or namespaces.get('', '') + if namespace: + try: + schema = self.maps.namespaces[namespace][0] + except (KeyError, IndexError): + reason = 'the namespace {!r} is not loaded'.format(namespace) + raise XMLSchemaEncodeError(self, obj, self, reason, namespaces=namespaces) + else: + xsd_element = schema.find(path, 
namespaces=namespaces) + elif path is not None: xsd_element = self.find(path, namespaces=namespaces) elif isinstance(obj, dict) and len(obj) == 1: xsd_element = self.elements.get(list(obj.keys())[0]) @@ -1386,10 +1421,10 @@ class XMLSchemaBase(XsdValidator, ValidationMixin, ElementPathMixin): if not isinstance(xsd_element, XsdElement): if path is not None: - msg = "the path %r doesn't match any element of the schema!" % path + reason = "the path %r doesn't match any element of the schema!" % path else: - msg = "unable to select an element for decoding data, provide a valid 'path' argument." - yield XMLSchemaEncodeError(self, obj, self.elements, reason=msg) + reason = "unable to select an element for decoding data, provide a valid 'path' argument." + raise XMLSchemaEncodeError(self, obj, self.elements, reason, namespaces=namespaces) else: for result in xsd_element.iter_encode(obj, validation, converter=converter, unordered=unordered, **kwargs): From c963970549a2e58f885107aae377f869b7c69793 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Fri, 25 Oct 2019 23:02:13 +0200 Subject: [PATCH 16/34] Fix openContent's appliesToEmpty attribute use --- xmlschema/validators/complex_types.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xmlschema/validators/complex_types.py b/xmlschema/validators/complex_types.py index e45ff30..bcff57c 100644 --- a/xmlschema/validators/complex_types.py +++ b/xmlschema/validators/complex_types.py @@ -344,11 +344,9 @@ class XsdComplexType(XsdType, ValidationMixin): "derived an empty content from base type that has not empty content.", elem ) - if not self.open_content: - if self.schema.default_open_content: + if not self.open_content and self.schema.default_open_content: + if content_type or self.schema.default_open_content.applies_to_empty: self.open_content = self.schema.default_open_content - elif getattr(base_type, 'open_content', None): - self.open_content = base_type.open_content if self.open_content and 
content_type and \ not self.open_content.is_restriction(base_type.open_content): @@ -679,7 +677,13 @@ class Xsd11ComplexType(XsdComplexType): # Add open content to complex content type if isinstance(self.content_type, XsdGroup): - open_content = self.open_content or self.schema.default_open_content + open_content = self.open_content + if open_content is not None: + pass + elif self.schema.default_open_content is not None: + if self.content_type or self.schema.default_open_content.applies_to_empty: + open_content = self.schema.default_open_content + if open_content is None: pass elif open_content.mode == 'interleave': From 732864edc7fd35fd4bbf4d6cbcf1c3f9728e68b6 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Sat, 26 Oct 2019 23:50:38 +0200 Subject: [PATCH 17/34] Fix xs:ID counting for nodes without parent - Consider attributes with level+1 as child elements - Clean XsdAtomicBuiltin.iter_decode() method --- xmlschema/tests/test_w3c_suite.py | 3 +++ xmlschema/validators/attributes.py | 2 ++ xmlschema/validators/complex_types.py | 6 ++++-- xmlschema/validators/elements.py | 7 ++----- xmlschema/validators/groups.py | 2 +- xmlschema/validators/simple_types.py | 29 ++++++++++++--------------- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/xmlschema/tests/test_w3c_suite.py b/xmlschema/tests/test_w3c_suite.py index dbeb25c..2170e46 100644 --- a/xmlschema/tests/test_w3c_suite.py +++ b/xmlschema/tests/test_w3c_suite.py @@ -99,6 +99,9 @@ SKIPPED_TESTS = { '../msData/additional/test93490_4.xml', # 4795: https://www.w3.org/Bugs/Public/show_bug.cgi?id=4078 '../msData/additional/test93490_8.xml', # 4799: Idem + # Valid XML tests + '../ibmData/instance_invalid/S3_4_2_4/s3_4_2_4ii03.xml', # defaultAttributeApply is true (false in comment) + # Skip for missing XML version 1.1 implementation '../saxonData/XmlVersions/xv001.v01.xml', # 14850 '../saxonData/XmlVersions/xv003.v01.xml', # 14852 diff --git a/xmlschema/validators/attributes.py 
b/xmlschema/validators/attributes.py index 78df62d..04f2dc2 100644 --- a/xmlschema/validators/attributes.py +++ b/xmlschema/validators/attributes.py @@ -594,7 +594,9 @@ class XsdAttributeGroup(MutableMapping, XsdComponent, ValidationMixin): reason = "missing required attribute: %r" % k yield self.validation_error(validation, reason, attrs, **kwargs) + kwargs['level'] = kwargs.get('level', 0) + 1 use_defaults = kwargs.get('use_defaults', True) + additional_attrs = [(k, v) for k, v in self.iter_predefined(use_defaults) if k not in attrs] if additional_attrs: attrs = {k: v for k, v in attrs.items()} diff --git a/xmlschema/validators/complex_types.py b/xmlschema/validators/complex_types.py index bcff57c..86ef0cc 100644 --- a/xmlschema/validators/complex_types.py +++ b/xmlschema/validators/complex_types.py @@ -701,8 +701,10 @@ class Xsd11ComplexType(XsdComplexType): self.parse_error("attribute %r must be inheritable") if 'defaultAttributesApply' in self.elem.attrib: - if self.elem.attrib['defaultAttributesApply'].strip() in {'false', '0'}: - self.default_attributes_apply = False + attr = self.elem.attrib['defaultAttributesApply'].strip() + self.default_attributes_apply = False if attr in {'false', '0'} else True + else: + self.default_attributes_apply = True # Add default attributes if self.redefine is None: diff --git a/xmlschema/validators/elements.py b/xmlschema/validators/elements.py index 2b7fe2e..7420670 100644 --- a/xmlschema/validators/elements.py +++ b/xmlschema/validators/elements.py @@ -474,7 +474,7 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) try: level = kwargs['level'] except KeyError: - level = 0 + level = kwargs['level'] = 0 try: converter = kwargs['converter'] @@ -574,15 +574,12 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) xsd_type = xsd_type.content_type if text is None: - for result in xsd_type.iter_decode('', validation, _skip_id=True, **kwargs): + for result in 
xsd_type.iter_decode('', validation, **kwargs): if isinstance(result, XMLSchemaValidationError): yield self.validation_error(validation, result, elem, **kwargs) if 'filler' in kwargs: value = kwargs['filler'](self) else: - if level == 0 or self.xsd_version != '1.0': - kwargs['_skip_id'] = True - for result in xsd_type.iter_decode(text, validation, **kwargs): if isinstance(result, XMLSchemaValidationError): yield self.validation_error(validation, result, elem, **kwargs) diff --git a/xmlschema/validators/groups.py b/xmlschema/validators/groups.py index e248c0c..2dfe9c9 100644 --- a/xmlschema/validators/groups.py +++ b/xmlschema/validators/groups.py @@ -697,7 +697,7 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): yield element_data.content return - level = kwargs['level'] = kwargs.pop('level', 0) + 1 + level = kwargs['level'] = kwargs.get('level', 0) + 1 errors = [] text = None children = [] diff --git a/xmlschema/validators/simple_types.py b/xmlschema/validators/simple_types.py index 62bed94..182015a 100644 --- a/xmlschema/validators/simple_types.py +++ b/xmlschema/validators/simple_types.py @@ -513,28 +513,25 @@ class XsdAtomicBuiltin(XsdAtomic): yield self.decode_error(validation, obj, self.to_python, reason="value is not an instance of {!r}".format(self.instance_types)) - if self.name == XSD_ID: - try: - id_map = kwargs['id_map'] - except KeyError: - pass - else: - try: - id_map[obj] += 1 - except TypeError: - id_map[obj] = 1 - - if id_map[obj] > 1 and '_skip_id' not in kwargs: - yield self.validation_error(validation, "Duplicated xsd:ID value {!r}".format(obj)) - - elif self.name == XSD_IDREF: + if self.name == XSD_IDREF: try: id_map = kwargs['id_map'] except KeyError: pass else: if obj not in id_map: - id_map[obj] = kwargs.get('node', 0) + id_map[obj] = 0 + + elif self.name == XSD_ID and kwargs.get('level') != 0: + try: + id_map = kwargs['id_map'] + except KeyError: + pass + else: + if not id_map[obj]: + id_map[obj] = 1 + else: + yield 
self.validation_error(validation, "Duplicated xsd:ID value {!r}".format(obj)) if validation == 'skip': try: From 2b1497860b1e339cef454eaed783fe212c38427d Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 30 Oct 2019 07:13:47 +0100 Subject: [PATCH 18/34] Fix 'all' model groups visiting - Also at each match the group element is changed (TODO: check if it's better to restart like choice groups) - In XSD 1.1 __iter__ now yields wildcards at the end for 'all' and 'choice' model groups --- xmlschema/tests/test_models.py | 56 +++++++++++++++++++++++++++++++++- xmlschema/validators/groups.py | 5 +++ xmlschema/validators/models.py | 11 +++++-- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/xmlschema/tests/test_models.py b/xmlschema/tests/test_models.py index 3748ead..e8e41c4 100644 --- a/xmlschema/tests/test_models.py +++ b/xmlschema/tests/test_models.py @@ -15,13 +15,15 @@ This module runs tests concerning model groups validation. import unittest from xmlschema import XMLSchema10, XMLSchema11 -from xmlschema.validators import ModelVisitor +from xmlschema.validators import XsdElement, ModelVisitor from xmlschema.compat import ordered_dict_class from xmlschema.tests import casepath, XsdValidatorTestCase class TestModelValidation(XsdValidatorTestCase): + schema_class = XMLSchema10 + # --- Test helper functions --- def check_advance_true(self, model, expected=None): @@ -514,6 +516,32 @@ class TestModelValidation(XsdValidatorTestCase): self.check_advance_true(model) # match choice with self.assertIsNone(model.element) + def test_empty_choice_groups(self): + schema = self.schema_class(""" + + + + + + + + + + + + + + + + + """) + + xml_data = "" + model = ModelVisitor(schema.elements['root'].type.content_type) + self.assertIsInstance(model.element, XsdElement) + self.assertEqual(model.element.name, 'elem1') + self.assertIsNone(schema.validate(xml_data)) + # # Tests on issues def test_issue_086(self): @@ -576,6 +604,32 @@ class 
TestModelValidation(XsdValidatorTestCase): class TestModelValidation11(TestModelValidation): schema_class = XMLSchema11 + def test_all_model_with_wildcard(self): + schema = self.schema_class( + """ + + + + + + + + + + + """) + + xml_data = """ + + + 1 + + + + """ + + self.assertIsNone(schema.validate(xml_data)) + class TestModelBasedSorting(XsdValidatorTestCase): diff --git a/xmlschema/validators/groups.py b/xmlschema/validators/groups.py index 2dfe9c9..738df4f 100644 --- a/xmlschema/validators/groups.py +++ b/xmlschema/validators/groups.py @@ -821,6 +821,11 @@ class Xsd11Group(XsdGroup): Content: (annotation?, (element | any | group)*) """ + def __iter__(self): + if self.model == 'sequence': + return iter(self._group) + return iter(sorted(self._group, key=lambda x: isinstance(x, XsdAnyElement))) + def _parse_content_model(self, content_model): self.model = local_name(content_model.tag) if self.model == 'all': diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index 77c237f..46263a2 100644 --- a/xmlschema/validators/models.py +++ b/xmlschema/validators/models.py @@ -379,7 +379,12 @@ class ModelVisitor(MutableSequence): def _start(self): while True: item = next(self.items, None) - if item is None or not isinstance(item, ModelGroup): + if item is None: + if not self: + break + else: + self.group, self.items, self.match = self.pop() + elif not isinstance(item, ModelGroup): self.element = item break elif item: @@ -464,7 +469,9 @@ class ModelVisitor(MutableSequence): if match: occurs[element] += 1 self.match = True - if not element.is_over(occurs[element]): + if self.group.model == 'all': + pass + elif not element.is_over(occurs[element]): return obj = None From 4c624af6c91219a52865ab969436b2b1fed935cb Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 30 Oct 2019 18:05:41 +0100 Subject: [PATCH 19/34] Fix name matching and targetNamespace for XSD 1.1 declarations - Use the targetNamespace of reference - In case of a default namespace try 
also the match with the local name --- xmlschema/validators/attributes.py | 7 +++-- xmlschema/validators/elements.py | 45 ++++++++++++++++++++---------- xmlschema/validators/wildcards.py | 17 +++++++---- xmlschema/validators/xsdbase.py | 2 +- 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/xmlschema/validators/attributes.py b/xmlschema/validators/attributes.py index 04f2dc2..ecb203b 100644 --- a/xmlschema/validators/attributes.py +++ b/xmlschema/validators/attributes.py @@ -286,9 +286,12 @@ class Xsd11Attribute(XsdAttribute): @property def target_namespace(self): - if self._target_namespace is None: + if self._target_namespace is not None: + return self._target_namespace + elif self.ref is not None: + return self.ref.target_namespace + else: return self.schema.target_namespace - return self._target_namespace def _parse(self): super(Xsd11Attribute, self)._parse() diff --git a/xmlschema/validators/elements.py b/xmlschema/validators/elements.py index 7420670..ba5a8b0 100644 --- a/xmlschema/validators/elements.py +++ b/xmlschema/validators/elements.py @@ -715,26 +715,38 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) def is_matching(self, name, default_namespace=None, group=None): if default_namespace and name[0] != '{': - name = '{%s}%s' % (default_namespace, name) - - if name in self.names: - return True - - for xsd_element in self.iter_substitutes(): - if name in xsd_element.names: + qname = '{%s}%s' % (default_namespace, name) + if name in self.names or qname in self.names: return True + + for xsd_element in self.iter_substitutes(): + if name in xsd_element.names or qname in xsd_element.names: + return True + + elif name in self.names: + return True + else: + for xsd_element in self.iter_substitutes(): + if name in xsd_element.names: + return True return False def match(self, name, default_namespace=None, **kwargs): if default_namespace and name[0] != '{': - name = '{%s}%s' % (default_namespace, name) + qname = 
'{%s}%s' % (default_namespace, name) + if name in self.names or qname in self.names: + return self - if name in self.names: + for xsd_element in self.iter_substitutes(): + if name in xsd_element.names or qname in xsd_element.names: + return xsd_element + + elif name in self.names: return self - - for xsd_element in self.iter_substitutes(): - if name in xsd_element.names: - return xsd_element + else: + for xsd_element in self.iter_substitutes(): + if name in xsd_element.names: + return xsd_element def is_restriction(self, other, check_occurs=True): if isinstance(other, XsdAnyElement): @@ -905,9 +917,12 @@ class Xsd11Element(XsdElement): @property def target_namespace(self): - if self._target_namespace is None: + if self._target_namespace is not None: + return self._target_namespace + elif self.ref is not None: + return self.ref.target_namespace + else: return self.schema.target_namespace - return self._target_namespace def iter_components(self, xsd_classes=None): if xsd_classes is None: diff --git a/xmlschema/validators/wildcards.py b/xmlschema/validators/wildcards.py index 849c22c..beb14b0 100644 --- a/xmlschema/validators/wildcards.py +++ b/xmlschema/validators/wildcards.py @@ -140,7 +140,8 @@ class XsdWildcard(XsdComponent, ValidationMixin): elif default_namespace is None: return self.is_namespace_allowed('') else: - return self.is_namespace_allowed(default_namespace) + return self.is_namespace_allowed('') or \ + self.is_namespace_allowed(default_namespace) def is_namespace_allowed(self, namespace): if self.not_namespace: @@ -656,12 +657,15 @@ class Xsd11AnyElement(XsdAnyElement): if name is None: return False elif not name or name[0] == '{': - namespace = get_namespace(name) - elif default_namespace is None: - namespace = '' + if not self.is_namespace_allowed(get_namespace(name)): + return False + elif default_namespace is not None: + if not self.is_namespace_allowed(''): + return False else: name = '{%s}%s' % (default_namespace, name) - namespace = 
default_namespace + if not self.is_namespace_allowed('') and not self.is_namespace_allowed(default_namespace): + return False if group in self.precedences: if occurs is None: @@ -676,7 +680,8 @@ class Xsd11AnyElement(XsdAnyElement): if any(e.is_matching(name) for e in group.iter_elements() if not isinstance(e, XsdAnyElement)): return False - return name not in self.not_qname and self.is_namespace_allowed(namespace) + + return name not in self.not_qname def is_consistent(self, other): if isinstance(other, XsdAnyElement) or self.process_contents == 'skip': diff --git a/xmlschema/validators/xsdbase.py b/xmlschema/validators/xsdbase.py index fe04ca0..aab0b89 100644 --- a/xmlschema/validators/xsdbase.py +++ b/xmlschema/validators/xsdbase.py @@ -299,7 +299,7 @@ class XsdComponent(XsdValidator): @property def target_namespace(self): """Property that references to schema's targetNamespace.""" - return self.schema.target_namespace + return self.schema.target_namespace if self.ref is None else self.ref.target_namespace @property def default_namespace(self): From b95d890f51a2f84646e713e78d23ab37a3cc6288 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Thu, 31 Oct 2019 07:29:09 +0100 Subject: [PATCH 20/34] Refine 'all' models visiting - Restart at every match with not exhausted items - Do not check occurs on stop_item() - Do not reset element counter when repeat --- xmlschema/tests/test_models.py | 21 +++++++++++++++++++++ xmlschema/validators/models.py | 19 +++++++++---------- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/xmlschema/tests/test_models.py b/xmlschema/tests/test_models.py index e8e41c4..d47c691 100644 --- a/xmlschema/tests/test_models.py +++ b/xmlschema/tests/test_models.py @@ -630,6 +630,27 @@ class TestModelValidation11(TestModelValidation): self.assertIsNone(schema.validate(xml_data)) + def test_all_model_with_extended_occurs(self): + schema = self.schema_class( + """ + + + + + + + + + + + + + """) + + xml_data = '' + + 
self.assertIsNone(schema.validate(xml_data)) + class TestModelBasedSorting(XsdValidatorTestCase): diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index 46263a2..df921b0 100644 --- a/xmlschema/validators/models.py +++ b/xmlschema/validators/models.py @@ -440,7 +440,10 @@ class ModelVisitor(MutableSequence): item_occurs = occurs[item] model = self.group.model - if item_occurs: + if model == 'all': + return False + + elif item_occurs: self.match = True if model == 'choice': occurs[item] = 0 @@ -470,7 +473,7 @@ class ModelVisitor(MutableSequence): occurs[element] += 1 self.match = True if self.group.model == 'all': - pass + self.items = (e for e in self.group if not e.is_over(occurs[e])) elif not element.is_over(occurs[element]): return @@ -487,8 +490,6 @@ class ModelVisitor(MutableSequence): if obj is None: if not self.match: if self.group.model == 'all': - for e in self.group: - occurs[e] = occurs[(e,)] if all(e.min_occurs <= occurs[e] for e in self.group): occurs[self.group] = 1 group, expected = self.group, self.expected @@ -497,16 +498,14 @@ class ModelVisitor(MutableSequence): elif self.group.model != 'all': self.items, self.match = iter(self.group), False elif any(not e.is_over(occurs[e]) for e in self.group): - for e in self.group: - occurs[(e,)] += occurs[e] self.items, self.match = (e for e in self.group if not e.is_over(occurs[e])), False else: - for e in self.group: - occurs[(e,)] += occurs[e] occurs[self.group] = 1 elif not isinstance(obj, ModelGroup): # XsdElement or XsdAnyElement - self.element, occurs[obj] = obj, 0 + self.element = obj + if self.group.model != 'all': + occurs[obj] = 0 return else: @@ -515,7 +514,7 @@ class ModelVisitor(MutableSequence): occurs[obj] = 0 if obj.model == 'all': for e in obj: - occurs[(e,)] = 0 + occurs[e] = 0 except IndexError: # Model visit ended From dd2ab7265467eadecf3c13c1b5850d8f9b35d74c Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Tue, 5 Nov 2019 11:09:34 +0100 Subject: [PATCH 
21/34] Add iter_group() to ModelVisitor --- xmlschema/tests/test_models.py | 45 ++++++++++++++++++++++++++++++ xmlschema/validators/attributes.py | 3 +- xmlschema/validators/groups.py | 7 +---- xmlschema/validators/models.py | 31 +++++++++++++++----- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/xmlschema/tests/test_models.py b/xmlschema/tests/test_models.py index d47c691..17bb15f 100644 --- a/xmlschema/tests/test_models.py +++ b/xmlschema/tests/test_models.py @@ -651,6 +651,51 @@ class TestModelValidation11(TestModelValidation): self.assertIsNone(schema.validate(xml_data)) + def test_all_model_with_relaxed_occurs(self): + schema = self.schema_class( + """ + + + + + + + + + + + + + """) + + xml_data = '' + + self.assertIsNone(schema.validate(xml_data)) + + schema = self.schema_class( + """ + + + + + + + + + + + + + + + + + + + """) + + self.assertIsNone(schema.validate(xml_data)) + class TestModelBasedSorting(XsdValidatorTestCase): diff --git a/xmlschema/validators/attributes.py b/xmlschema/validators/attributes.py index ecb203b..051d69d 100644 --- a/xmlschema/validators/attributes.py +++ b/xmlschema/validators/attributes.py @@ -235,7 +235,8 @@ class XsdAttribute(XsdComponent, ValidationMixin): elif text == self.fixed or validation == 'skip': pass elif self.type.text_decode(text) != self.type.text_decode(self.fixed): - yield self.validation_error(validation, "value differs from fixed value", text, **kwargs) + msg = "attribute {!r} has a fixed value {!r}".format(self.name, self.fixed) + yield self.validation_error(validation, msg, text, **kwargs) for result in self.type.iter_decode(text, validation, **kwargs): if isinstance(result, XMLSchemaValidationError): diff --git a/xmlschema/validators/groups.py b/xmlschema/validators/groups.py index 738df4f..c9ecc2e 100644 --- a/xmlschema/validators/groups.py +++ b/xmlschema/validators/groups.py @@ -821,11 +821,6 @@ class Xsd11Group(XsdGroup): Content: (annotation?, (element | any | group)*) """ - def 
__iter__(self): - if self.model == 'sequence': - return iter(self._group) - return iter(sorted(self._group, key=lambda x: isinstance(x, XsdAnyElement))) - def _parse_content_model(self, content_model): self.model = local_name(content_model.tag) if self.model == 'all': @@ -855,7 +850,7 @@ class Xsd11Group(XsdGroup): if ref != self.name: self.append(Xsd11Group(child, self.schema, self)) if (self.model != 'all') ^ (self[-1].model != 'all'): - msg = "an xs:%s group cannot reference to an x:%s group" + msg = "an xs:%s group cannot include a reference to an x:%s group" self.parse_error(msg % (self.model, self[-1].model)) self.pop() diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index df921b0..fac02dc 100644 --- a/xmlschema/validators/models.py +++ b/xmlschema/validators/models.py @@ -338,7 +338,9 @@ class ModelVisitor(MutableSequence): self.occurs = Counter() self._subgroups = [] self.element = None - self.group, self.items, self.match = root, iter(root), False + self.group = root + self.items = self.iter_group() + self.match = False self._start() def __str__(self): @@ -374,7 +376,9 @@ class ModelVisitor(MutableSequence): del self._subgroups[:] self.occurs.clear() self.element = None - self.group, self.items, self.match = self.root, iter(self.root), False + self.group = self.root + self.items = self.iter_group() + self.match = False def _start(self): while True: @@ -421,6 +425,18 @@ class ModelVisitor(MutableSequence): for e in self.advance(): yield e + def iter_group(self): + if self.group.model != 'all': + for item in self.group: + yield item + elif not self.occurs: + for e in self.group.iter_elements(): + yield e + else: + for e in self.group.iter_elements(): + if not e.is_over(self.occurs[e]): + yield e + def advance(self, match=False): """ Generator function for advance to the next element. 
Yields tuples with @@ -448,7 +464,7 @@ class ModelVisitor(MutableSequence): if model == 'choice': occurs[item] = 0 occurs[self.group] += 1 - self.items, self.match = iter(self.group), False + self.items, self.match = self.iter_group(), False elif model == 'sequence' and item is self.group[-1]: self.occurs[self.group] += 1 return item.is_missing(item_occurs) @@ -473,7 +489,7 @@ class ModelVisitor(MutableSequence): occurs[element] += 1 self.match = True if self.group.model == 'all': - self.items = (e for e in self.group if not e.is_over(occurs[e])) + self.items = (e for e in self.group.iter_elements() if not e.is_over(occurs[e])) elif not element.is_over(occurs[element]): return @@ -490,15 +506,16 @@ class ModelVisitor(MutableSequence): if obj is None: if not self.match: if self.group.model == 'all': - if all(e.min_occurs <= occurs[e] for e in self.group): + if all(e.min_occurs <= occurs[e] for e in self.group.iter_elements()): occurs[self.group] = 1 group, expected = self.group, self.expected if stop_item(group) and expected: yield group, occurs[group], expected elif self.group.model != 'all': - self.items, self.match = iter(self.group), False + self.items, self.match = self.iter_group(), False elif any(not e.is_over(occurs[e]) for e in self.group): - self.items, self.match = (e for e in self.group if not e.is_over(occurs[e])), False + self.items = self.iter_group() + self.match = False else: occurs[self.group] = 1 From 896982222f0a9e26a8644db54e2c975cc31f5b8a Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 6 Nov 2019 09:49:00 +0100 Subject: [PATCH 22/34] Fix Windows paths normalization --- xmlschema/resources.py | 2 ++ xmlschema/tests/test_resources.py | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index 55ad1ab..b65af5d 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -73,6 +73,8 @@ def normalize_url(url, base_url=None, keep_relative=False): x = 
x.strip().replace('\\', '/') while x.startswith('//'): x = x.replace('//', '/', 1) + while x.startswith('file:////'): + x = x.replace('file:////', 'file:///', 1) if not urlsplit(x).scheme: x = x.replace('#', '%23') return x diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 447ddad..3aaebee 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -43,6 +43,15 @@ def add_leading_slash(path): return '/' + path if path and path[0] not in ('/', '\\') else path +def filter_windows_path(path): + if path.startswith('/\\'): + return path[1:] + elif path and path[0] not in ('/', '\\'): + return '/' + path + else: + return path + + class TestResources(unittest.TestCase): @classmethod @@ -68,14 +77,14 @@ class TestResources(unittest.TestCase): self.assertEqual(url_parts.fragment, expected_parts.fragment, "%r: Fragment parts differ." % url) if is_windows_path(url_parts.path) or is_windows_path(expected_parts.path): - path = PureWindowsPath(url_parts.path) - expected_path = PureWindowsPath(add_leading_slash(expected_parts.path)) + path = PureWindowsPath(filter_windows_path(url_parts.path)) + expected_path = PureWindowsPath(filter_windows_path(expected_parts.path)) else: path = PurePath(url_parts.path) expected_path = PurePath(expected_parts.path) self.assertEqual(path, expected_path, "%r: Paths differ." 
% url) - def test_normalize_url(self): + def test_normalize_url_posix(self): url1 = "https://example.com/xsd/other_schema.xsd" self.check_url(normalize_url(url1, base_url="/path_my_schema/schema.xsd"), url1) @@ -98,6 +107,7 @@ class TestResources(unittest.TestCase): self.check_url(normalize_url('dummy path.xsd', 'http://site/base'), 'http://site/base/dummy%20path.xsd') self.check_url(normalize_url('dummy path.xsd', 'file://host/home/'), 'file://host/home/dummy path.xsd') + def test_normalize_url_windows(self): win_abs_path1 = 'z:\\Dir_1_0\\Dir2-0\\schemas/XSD_1.0/XMLSchema.xsd' win_abs_path2 = 'z:\\Dir-1.0\\Dir-2_0\\' self.check_url(normalize_url(win_abs_path1), win_abs_path1) @@ -108,7 +118,9 @@ class TestResources(unittest.TestCase): self.check_url( normalize_url('xsd1.0/schema.xsd', win_abs_path2), 'file:///z:\\Dir-1.0\\Dir-2_0/xsd1.0/schema.xsd' ) + self.check_url(normalize_url('file:///\\k:\\Dir A\\schema.xsd'), 'file:///k:\\Dir A\\schema.xsd') + def test_normalize_url_issue_116(self): # Issue #116 self.assertEqual( normalize_url('//anaconda/envs/testenv/lib/python3.6/site-packages/xmlschema/validators/schemas/'), From 24a08c4442798dbefc0bb8e89a0ffd4a72e05b2b Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 6 Nov 2019 10:22:09 +0100 Subject: [PATCH 23/34] Add replacing of backslashes from normalize_path result --- xmlschema/resources.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index b65af5d..2f9bea1 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -113,9 +113,9 @@ def normalize_url(url, base_url=None, keep_relative=False): url_parts = urlsplit(url, scheme='file') if url_parts.scheme not in uses_relative: - return 'file:///{}'.format(url_parts.geturl()) # Eg. k:/Python/lib/.... + normalized_url = 'file:///{}'.format(url_parts.geturl()) # Eg. k:/Python/lib/.... 
elif url_parts.scheme != 'file': - return urlunsplit(( + normalized_url = urlunsplit(( url_parts.scheme, url_parts.netloc, pathname2url(url_parts.path), @@ -123,18 +123,20 @@ def normalize_url(url, base_url=None, keep_relative=False): url_parts.fragment, )) elif os.path.isabs(url_parts.path): - return url_parts.geturl() + normalized_url = url_parts.geturl() elif keep_relative: # Can't use urlunsplit with a scheme because it converts relative paths to absolute ones. - return 'file:{}'.format(urlunsplit(('',) + url_parts[1:])) + normalized_url = 'file:{}'.format(urlunsplit(('',) + url_parts[1:])) else: - return urlunsplit(( + normalized_url = urlunsplit(( url_parts.scheme, url_parts.netloc, os.path.abspath(url_parts.path), url_parts.query, url_parts.fragment, )) + + return normalized_url.replace('\\', '/') def fetch_resource(location, base_url=None, timeout=30): From dc82f0487428116d2ef9989401315e719ff09c6c Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 6 Nov 2019 10:39:51 +0100 Subject: [PATCH 24/34] Filter normalize_url result --- xmlschema/resources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index 2f9bea1..b072ab2 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -75,7 +75,7 @@ def normalize_url(url, base_url=None, keep_relative=False): x = x.replace('//', '/', 1) while x.startswith('file:////'): x = x.replace('file:////', 'file:///', 1) - if not urlsplit(x).scheme: + if urlsplit(x).scheme in {'', 'file'}: x = x.replace('#', '%23') return x @@ -136,7 +136,7 @@ def normalize_url(url, base_url=None, keep_relative=False): url_parts.fragment, )) - return normalized_url.replace('\\', '/') + return filter_url(normalized_url) def fetch_resource(location, base_url=None, timeout=30): From b8ccfac6f104e9389684133000a1d91e459d3fc6 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 6 Nov 2019 11:40:51 +0100 Subject: [PATCH 25/34] Update test_resources avoiding usage 
of inapplicable 'file' scheme --- xmlschema/tests/test_resources.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 3aaebee..1534393 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -120,7 +120,7 @@ class TestResources(unittest.TestCase): ) self.check_url(normalize_url('file:///\\k:\\Dir A\\schema.xsd'), 'file:///k:\\Dir A\\schema.xsd') - def test_normalize_url_issue_116(self): + def test_normalize_url_slashes(self): # Issue #116 self.assertEqual( normalize_url('//anaconda/envs/testenv/lib/python3.6/site-packages/xmlschema/validators/schemas/'), @@ -134,12 +134,13 @@ class TestResources(unittest.TestCase): self.assertEqual(normalize_url('dir2/schema.xsd', '//root/dir1'), 'file:///root/dir1/dir2/schema.xsd') self.assertEqual(normalize_url('dir2/schema.xsd', '////root/dir1'), 'file:///root/dir1/dir2/schema.xsd') - self.check_url(normalize_url('issue #000.xml', 'file://host/home/'), - 'file://host/home/issue %23000.xml') - self.check_url(normalize_url('data.xml', 'file://host/home/issue 000'), - 'file://host/home/issue 000/data.xml') - self.check_url(normalize_url('data.xml', '/host/home/issue #000'), - '/host/home/issue %23000/data.xml') + def test_normalize_url_hash_character(self): + self.check_url(normalize_url('issue #000.xml', 'file:///dir1/dir2/'), + 'file:///dir1/dir2/issue %23000.xml') + self.check_url(normalize_url('data.xml', 'file:///dir1/dir2/issue 000'), + 'file:///dir1/dir2/issue 000/data.xml') + self.check_url(normalize_url('data.xml', '/dir1/dir2/issue #000'), + '/dir1/dir2/issue %23000/data.xml') def test_fetch_resource(self): wrong_path = casepath('resources/dummy_file.txt') @@ -456,11 +457,11 @@ class TestResources(unittest.TestCase): xml_file = resource.open() self.assertTrue(callable(xml_file.read)) - xml_file = open(self.vh_xml_file) - resource = XMLResource(source=xml_file) - 
resource.close() - with self.assertRaises(ValueError): - resource.open() + with open(self.vh_xml_file) as xml_file: + resource = XMLResource(source=xml_file) + resource.close() + with self.assertRaises(ValueError): + resource.open() def test_xml_resource_iter(self): resource = XMLResource(self.schema_class.meta_schema.source.url, lazy=False) From d0f3a0f6c8b10d9bcb132596a2343d055e853ae6 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Wed, 6 Nov 2019 11:49:09 +0100 Subject: [PATCH 26/34] Skip ElementTree import test with external process on Windows platform --- xmlschema/tests/test_etree.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xmlschema/tests/test_etree.py b/xmlschema/tests/test_etree.py index e039181..22e42a9 100644 --- a/xmlschema/tests/test_etree.py +++ b/xmlschema/tests/test_etree.py @@ -15,6 +15,7 @@ import os import importlib import sys import subprocess +import platform @unittest.skipIf(sys.version_info < (3,), "In Python 2 ElementTree is not overwritten by cElementTree") @@ -51,6 +52,7 @@ class TestElementTree(unittest.TestCase): self.assertIs(importlib.import_module('xml.etree.ElementTree'), ElementTree) self.assertIs(xmlschema_etree.ElementTree, ElementTree) + @unittest.skipIf(platform.system() == 'Windows', "Run only for UNIX based systems.") def test_element_tree_import_script(self): test_dir = os.path.dirname(__file__) or '.' 
From 2bcf78549ccea9a7a7e8aefbd1cc72d02d6ca58e Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Fri, 8 Nov 2019 07:17:29 +0100 Subject: [PATCH 27/34] Add count_occurs() to ModelGroup --- xmlschema/resources.py | 1 - xmlschema/tests/test_models.py | 18 +++++++++++++++++ xmlschema/validators/groups.py | 10 ++++++---- xmlschema/validators/models.py | 33 ++++++++++++++++++++++++++++--- xmlschema/validators/wildcards.py | 2 +- xmlschema/validators/xsdbase.py | 7 +++++++ 6 files changed, 62 insertions(+), 9 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index b072ab2..adb9c02 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -135,7 +135,6 @@ def normalize_url(url, base_url=None, keep_relative=False): url_parts.query, url_parts.fragment, )) - return filter_url(normalized_url) diff --git a/xmlschema/tests/test_models.py b/xmlschema/tests/test_models.py index 17bb15f..4f101c9 100644 --- a/xmlschema/tests/test_models.py +++ b/xmlschema/tests/test_models.py @@ -542,6 +542,24 @@ class TestModelValidation(XsdValidatorTestCase): self.assertEqual(model.element.name, 'elem1') self.assertIsNone(schema.validate(xml_data)) + def test_sequence_model_with_extended_occurs(self): + schema = self.schema_class( + """ + + + + + + + + + + """) + + xml_data = '' + + self.assertIsNone(schema.validate(xml_data)) + # # Tests on issues def test_issue_086(self): diff --git a/xmlschema/validators/groups.py b/xmlschema/validators/groups.py index c9ecc2e..2684135 100644 --- a/xmlschema/validators/groups.py +++ b/xmlschema/validators/groups.py @@ -988,16 +988,18 @@ class Xsd11Group(XsdGroup): for item in restriction_items: if other_item is item or item.is_restriction(other_item, check_occurs): if max_occurs is not None: - if item.effective_max_occurs is None: + effective_max_occurs = item.effective_max_occurs + if effective_max_occurs is None: max_occurs = None else: - max_occurs = counter_func(max_occurs, item.effective_max_occurs) + max_occurs = 
counter_func(max_occurs, effective_max_occurs) if other_max_occurs is not None: - if other_item.effective_max_occurs is None: + effective_max_occurs = other_item.effective_max_occurs + if effective_max_occurs is None: other_max_occurs = None else: - other_max_occurs = max(other_max_occurs, other_item.effective_max_occurs) + other_max_occurs = max(other_max_occurs, effective_max_occurs) break else: continue diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index fac02dc..fc4b9af 100644 --- a/xmlschema/validators/models.py +++ b/xmlschema/validators/models.py @@ -156,6 +156,33 @@ class ModelGroup(MutableSequence, ParticleMixin): else: return self.max_occurs * sum(e.max_occurs for e in self) <= other.max_occurs + def count_occurs(self, occurs): + """ + Calculates the current model group occurrences from the occurs of its items. + """ + group_occurs = None + if self.model == 'sequence': + for item in filter(lambda x: occurs[x], self): + if group_occurs is not None: + return 1 + group_occurs = item.min_occurs_reps(occurs) + + elif self.model == 'choice': + for item in filter(lambda x: occurs[x], self): + group_occurs = item.min_occurs_reps(occurs) + break + + else: + for item in filter(lambda x: occurs[x], self): + group_occurs = min(1, item.min_occurs_reps(occurs)) + + if group_occurs is None: + return 0 + elif self.is_over(group_occurs): + return self.max_occurs + else: + return group_occurs + def iter_model(self, depth=0): """ A generator function iterating elements and groups of a model group. 
Skips pointless groups, @@ -462,17 +489,17 @@ class ModelVisitor(MutableSequence): elif item_occurs: self.match = True if model == 'choice': + occurs[self.group] += max(1, self.group.count_occurs(self.occurs)) occurs[item] = 0 - occurs[self.group] += 1 self.items, self.match = self.iter_group(), False elif model == 'sequence' and item is self.group[-1]: - self.occurs[self.group] += 1 + self.occurs[self.group] += max(1, self.group.count_occurs(self.occurs)) return item.is_missing(item_occurs) elif model == 'sequence': if self.match: if item is self.group[-1]: - occurs[self.group] += 1 + occurs[self.group] += max(1, self.group.count_occurs(self.occurs)) return not item.is_emptiable() elif item.is_emptiable(): return False diff --git a/xmlschema/validators/wildcards.py b/xmlschema/validators/wildcards.py index beb14b0..fe2e448 100644 --- a/xmlschema/validators/wildcards.py +++ b/xmlschema/validators/wildcards.py @@ -141,7 +141,7 @@ class XsdWildcard(XsdComponent, ValidationMixin): return self.is_namespace_allowed('') else: return self.is_namespace_allowed('') or \ - self.is_namespace_allowed(default_namespace) + self.is_namespace_allowed(default_namespace) def is_namespace_allowed(self, namespace): if self.not_namespace: diff --git a/xmlschema/validators/xsdbase.py b/xmlschema/validators/xsdbase.py index aab0b89..65ae512 100644 --- a/xmlschema/validators/xsdbase.py +++ b/xmlschema/validators/xsdbase.py @@ -958,6 +958,13 @@ class ParticleMixin(object): def is_over(self, occurs): return self.max_occurs is not None and self.max_occurs <= occurs + def min_occurs_reps(self, occurs): + """Returns the repetitions of minimum occurrences.""" + if not self.min_occurs: + return occurs[self] + else: + return occurs[self] // self.min_occurs + def has_occurs_restriction(self, other): if self.min_occurs == self.max_occurs == 0: return True From 7c4cd8b4d3bed9d924ec3c81c5710edf056631fa Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Fri, 8 Nov 2019 16:40:02 +0100 Subject: [PATCH 
28/34] Change stop_item() in ModelVisitor.advance() - Removed ParticleMixin.min_occurs_reps() - Removed ModelGroup.count_occurs() --- xmlschema/tests/test_models.py | 26 ++++++++-- xmlschema/validators/exceptions.py | 2 +- xmlschema/validators/models.py | 79 ++++++++++++++---------------- xmlschema/validators/xsdbase.py | 7 --- 4 files changed, 60 insertions(+), 54 deletions(-) diff --git a/xmlschema/tests/test_models.py b/xmlschema/tests/test_models.py index 4f101c9..df19ae1 100644 --- a/xmlschema/tests/test_models.py +++ b/xmlschema/tests/test_models.py @@ -516,6 +516,8 @@ class TestModelValidation(XsdValidatorTestCase): self.check_advance_true(model) # match choice with self.assertIsNone(model.element) + # + # Test pathological cases def test_empty_choice_groups(self): schema = self.schema_class(""" @@ -549,17 +551,35 @@ class TestModelValidation(XsdValidatorTestCase): - + """) - xml_data = '' - + xml_data = '' self.assertIsNone(schema.validate(xml_data)) + def test_choice_model_with_extended_occurs(self): + schema = self.schema_class( + """ + + + + + + + + + + + """) + + self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + # # Tests on issues def test_issue_086(self): diff --git a/xmlschema/validators/exceptions.py b/xmlschema/validators/exceptions.py index d47d60a..b766ac4 100644 --- a/xmlschema/validators/exceptions.py +++ b/xmlschema/validators/exceptions.py @@ -346,7 +346,7 @@ class XMLSchemaChildrenValidationError(XMLSchemaValidationError): if not expected_tags: pass # reason += " No child element is expected at this point." <-- this can be misleading elif len(expected_tags) == 1: - reason += " Tag %s expected." % expected_tags[0] + reason += " Tag %r expected." % expected_tags[0] else: reason += " Tag (%s) expected." 
% ' | '.join(expected_tags) diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index fc4b9af..c63ca14 100644 --- a/xmlschema/validators/models.py +++ b/xmlschema/validators/models.py @@ -156,33 +156,6 @@ class ModelGroup(MutableSequence, ParticleMixin): else: return self.max_occurs * sum(e.max_occurs for e in self) <= other.max_occurs - def count_occurs(self, occurs): - """ - Calculates the current model group occurrences from the occurs of its items. - """ - group_occurs = None - if self.model == 'sequence': - for item in filter(lambda x: occurs[x], self): - if group_occurs is not None: - return 1 - group_occurs = item.min_occurs_reps(occurs) - - elif self.model == 'choice': - for item in filter(lambda x: occurs[x], self): - group_occurs = item.min_occurs_reps(occurs) - break - - else: - for item in filter(lambda x: occurs[x], self): - group_occurs = min(1, item.min_occurs_reps(occurs)) - - if group_occurs is None: - return 0 - elif self.is_over(group_occurs): - return self.max_occurs - else: - return group_occurs - def iter_model(self, depth=0): """ A generator function iterating elements and groups of a model group. 
Skips pointless groups, @@ -486,27 +459,47 @@ class ModelVisitor(MutableSequence): if model == 'all': return False - elif item_occurs: + elif model == 'choice': + if not item_occurs: + return False + self.match = True - if model == 'choice': - occurs[self.group] += max(1, self.group.count_occurs(self.occurs)) + + group_occurs = min(1, occurs[item] // (item.min_occurs or 1)) + if self.group.is_over(group_occurs): + group_occurs = self.group.max_occurs + occurs[self.group] += group_occurs + + if group_occurs == 1: occurs[item] = 0 - self.items, self.match = self.iter_group(), False - elif model == 'sequence' and item is self.group[-1]: - self.occurs[self.group] += max(1, self.group.count_occurs(self.occurs)) + else: + item_occurs %= item.min_occurs + occurs[item] = item_occurs + + self.items, self.match = self.iter_group(), False return item.is_missing(item_occurs) - elif model == 'sequence': - if self.match: - if item is self.group[-1]: - occurs[self.group] += max(1, self.group.count_occurs(self.occurs)) - return not item.is_emptiable() - elif item.is_emptiable(): - return False - elif self.group.min_occurs <= occurs[self.group] or self: - return stop_item(self.group) + elif item_occurs: + self.match = True + elif self.match: + pass + elif item.is_emptiable(): + return False + elif self.group.min_occurs <= occurs[self.group] or self: + return stop_item(self.group) + else: + return True + + if item is self.group[-1]: + if any(occurs[x] for x in self if x is not item): + group_occurs = 1 else: - return True + group_occurs = max(1, occurs[item] // (item.min_occurs or 1)) + if self.group.is_over(group_occurs): + group_occurs = self.group.max_occurs + self.occurs[self.group] += max(1, group_occurs) + + return item.is_missing(item_occurs) element, occurs = self.element, self.occurs if element is None: diff --git a/xmlschema/validators/xsdbase.py b/xmlschema/validators/xsdbase.py index 65ae512..aab0b89 100644 --- a/xmlschema/validators/xsdbase.py +++ 
b/xmlschema/validators/xsdbase.py @@ -958,13 +958,6 @@ class ParticleMixin(object): def is_over(self, occurs): return self.max_occurs is not None and self.max_occurs <= occurs - def min_occurs_reps(self, occurs): - """Returns the repetitions of minimum occurrences.""" - if not self.min_occurs: - return occurs[self] - else: - return occurs[self] // self.min_occurs - def has_occurs_restriction(self, other): if self.min_occurs == self.max_occurs == 0: return True From 79cf89af86efb95ef1c9db747f9194a45c735bc3 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Thu, 14 Nov 2019 11:14:51 +0100 Subject: [PATCH 29/34] Fix ambiguous choice models visiting - Fixed registering max occurs for tuple (group,) - TODO: maybe the same solution for 1-length sequence groups --- xmlschema/tests/test_models.py | 71 ++++++++++++++++++-- xmlschema/validators/models.py | 114 ++++++++++++++++++--------------- 2 files changed, 125 insertions(+), 60 deletions(-) diff --git a/xmlschema/tests/test_models.py b/xmlschema/tests/test_models.py index df19ae1..b671cd6 100644 --- a/xmlschema/tests/test_models.py +++ b/xmlschema/tests/test_models.py @@ -558,8 +558,48 @@ class TestModelValidation(XsdValidatorTestCase): """) - xml_data = '' - self.assertIsNone(schema.validate(xml_data)) + self.assertIsNone(schema.validate('')) + + schema = self.schema_class( + """ + + + + + + + + + + """) + + self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + + schema = self.schema_class( + """ + + + + + + + + + + + + + + + + + """) + + self.assertIsNone(schema.validate('')) + # self.assertIsNone(schema.validate('')) + # self.assertIsNone(schema.validate('')) def test_choice_model_with_extended_occurs(self): schema = self.schema_class( @@ -568,17 +608,34 @@ class TestModelValidation(XsdValidatorTestCase): - - + + """) - self.assertIsNone(schema.validate('')) - self.assertIsNone(schema.validate('')) - self.assertIsNone(schema.validate('')) + 
self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + + schema = self.schema_class( + """ + + + + + + + + + + + + """) + + self.assertIsNone(schema.validate('')) # # Tests on issues diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index c63ca14..eb79aec 100644 --- a/xmlschema/validators/models.py +++ b/xmlschema/validators/models.py @@ -393,7 +393,9 @@ class ModelVisitor(MutableSequence): break elif item: self.append((self.group, self.items, self.match)) - self.group, self.items, self.match = item, iter(item), False + self.group = item + self.items = self.iter_group() + self.match = False @property def expected(self): @@ -426,16 +428,13 @@ class ModelVisitor(MutableSequence): yield e def iter_group(self): + """Returns an iterator for the current model group.""" if self.group.model != 'all': - for item in self.group: - yield item + return iter(self.group) elif not self.occurs: - for e in self.group.iter_elements(): - yield e + return self.group.iter_elements() else: - for e in self.group.iter_elements(): - if not e.is_over(self.occurs[e]): - yield e + return (e for e in self.group.iter_elements() if not e.is_over(self.occurs[e])) def advance(self, match=False): """ @@ -444,6 +443,17 @@ class ModelVisitor(MutableSequence): :param match: provides current element match. """ + def get_choices(self, occurs): + max_group_occurs = max(1, occurs // (self.min_occurs or 1)) + if self.max_occurs is None: + return [x for x in range(1, max_group_occurs + 1)] + else: + delta_occurs = self.max_occurs - self.min_occurs + 1 + if occurs % max_group_occurs > delta_occurs: + return [] + else: + return [x for x in range(1, max_group_occurs + 1)] + def stop_item(item): """ Stops element or group matching, incrementing current group counter. 
@@ -455,30 +465,24 @@ class ModelVisitor(MutableSequence): self.group, self.items, self.match = self.pop() item_occurs = occurs[item] - model = self.group.model - if model == 'all': - return False - - elif model == 'choice': + if self.group.model == 'choice': if not item_occurs: return False - self.match = True + item_max_occurs = occurs[(item,)] or item_occurs + min_group_occurs = max(1, item_occurs // (item.max_occurs or item_occurs)) + max_group_occurs = max(1, item_max_occurs // (item.min_occurs or 1)) - group_occurs = min(1, occurs[item] // (item.min_occurs or 1)) - if self.group.is_over(group_occurs): - group_occurs = self.group.max_occurs - occurs[self.group] += group_occurs + occurs[self.group] += min_group_occurs + occurs[(self.group,)] += max_group_occurs + occurs[item] = 0 - if group_occurs == 1: - occurs[item] = 0 - else: - item_occurs %= item.min_occurs - occurs[item] = item_occurs - - self.items, self.match = self.iter_group(), False - return item.is_missing(item_occurs) + self.items = self.iter_group() + self.match = False + return item.is_missing(max(item_occurs, occurs[(item,)])) + elif self.group.model == 'all': + return False elif item_occurs: self.match = True elif self.match: @@ -494,12 +498,11 @@ class ModelVisitor(MutableSequence): if any(occurs[x] for x in self if x is not item): group_occurs = 1 else: - group_occurs = max(1, occurs[item] // (item.min_occurs or 1)) + group_occurs = max(1, item_occurs // (item.min_occurs or 1)) if self.group.is_over(group_occurs): group_occurs = self.group.max_occurs self.occurs[self.group] += max(1, group_occurs) - - return item.is_missing(item_occurs) + return item.is_missing(max(item_occurs, occurs[(item,)])) element, occurs = self.element, self.occurs if element is None: @@ -510,6 +513,9 @@ class ModelVisitor(MutableSequence): self.match = True if self.group.model == 'all': self.items = (e for e in self.group.iter_elements() if not e.is_over(occurs[e])) + elif self.group.model == 'choice': # or 
len(self.group) == 1: + if not element.is_over(occurs[element]) or element.is_ambiguous(): + return elif not element.is_over(occurs[element]): return @@ -523,40 +529,42 @@ class ModelVisitor(MutableSequence): stop_item(self.group) obj = next(self.items, None) - if obj is None: - if not self.match: - if self.group.model == 'all': - if all(e.min_occurs <= occurs[e] for e in self.group.iter_elements()): - occurs[self.group] = 1 - group, expected = self.group, self.expected - if stop_item(group) and expected: - yield group, occurs[group], expected - elif self.group.model != 'all': - self.items, self.match = self.iter_group(), False - elif any(not e.is_over(occurs[e]) for e in self.group): - self.items = self.iter_group() - self.match = False - else: - occurs[self.group] = 1 + if isinstance(obj, ModelGroup): + # inner 'sequence' or 'choice' XsdGroup + self.append((self.group, self.items, self.match)) + self.group = obj + self.items = self.iter_group() + self.match = False + occurs[obj] = 0 - elif not isinstance(obj, ModelGroup): # XsdElement or XsdAnyElement + elif obj is not None: + # XsdElement or XsdAnyElement self.element = obj - if self.group.model != 'all': + if self.group.model == 'sequence': occurs[obj] = 0 return + elif not self.match: + if self.group.model == 'all': + if all(e.min_occurs <= occurs[e] for e in self.group.iter_elements()): + occurs[self.group] = 1 + + group, expected = self.group, self.expected + if stop_item(group) and expected: + yield group, occurs[group], expected + + elif self.group.model != 'all': + self.items, self.match = self.iter_group(), False + elif any(not e.is_over(occurs[e]) for e in self.group): + self.items = self.iter_group() + self.match = False else: - self.append((self.group, self.items, self.match)) - self.group, self.items, self.match = obj, iter(obj), False - occurs[obj] = 0 - if obj.model == 'all': - for e in obj: - occurs[e] = 0 + occurs[self.group] = 1 except IndexError: # Model visit ended self.element = None - if 
self.group.is_missing(occurs[self.group]): + if self.group.is_missing(max(occurs[self.group], occurs[(self.group,)])): if self.group.model == 'choice': yield self.group, occurs[self.group], self.expected elif self.group.model == 'sequence': From 4b7b16a750052e1bc23fc13744e784521229dfdf Mon Sep 17 00:00:00 2001 From: John Vandenberg Date: Thu, 14 Nov 2019 23:01:40 +0700 Subject: [PATCH 30/34] setup.py: Add setup-requires --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 603a73e..fd93f00 100755 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ class InstallCommand(install): setup( name='xmlschema', version='1.0.15', + setup_requires=['elementpath~=1.3.0'], install_requires=['elementpath~=1.3.0'], packages=['xmlschema'], include_package_data=True, From fc3141283de7ff5409885b4e9fea12456520689a Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Thu, 14 Nov 2019 11:20:26 +0100 Subject: [PATCH 31/34] Fix some W3C failed tests - fix inherited attrs composition in XSD elements - check single ID for element's attributes validation --- xmlschema/validators/attributes.py | 6 ++++++ xmlschema/validators/complex_types.py | 2 +- xmlschema/validators/elements.py | 9 ++++----- xmlschema/validators/groups.py | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/xmlschema/validators/attributes.py b/xmlschema/validators/attributes.py index 051d69d..cbdc1e9 100644 --- a/xmlschema/validators/attributes.py +++ b/xmlschema/validators/attributes.py @@ -600,6 +600,8 @@ class XsdAttributeGroup(MutableMapping, XsdComponent, ValidationMixin): kwargs['level'] = kwargs.get('level', 0) + 1 use_defaults = kwargs.get('use_defaults', True) + id_map = kwargs.get('id_map', '') + num_id = len(id_map) additional_attrs = [(k, v) for k, v in self.iter_predefined(use_defaults) if k not in attrs] if additional_attrs: @@ -644,6 +646,10 @@ class XsdAttributeGroup(MutableMapping, XsdComponent, ValidationMixin): result_list.append((name, result)) break + if 
self.xsd_version == '1.0' and len(id_map) - num_id > 1: + reason = "No more than one attribute of type ID should be present in an element" + yield self.validation_error(validation, reason, attrs, **kwargs) + if kwargs.get('fill_missing') is True: if filler is None: result_list.extend((k, None) for k in self._attribute_group diff --git a/xmlschema/validators/complex_types.py b/xmlschema/validators/complex_types.py index 86ef0cc..4010445 100644 --- a/xmlschema/validators/complex_types.py +++ b/xmlschema/validators/complex_types.py @@ -502,7 +502,7 @@ class XsdComplexType(XsdType, ValidationMixin): elif other.name == XSD_ANY_TYPE: return True elif self.base_type is other: - return derivation is None or self.base_type.derivation == derivation + return derivation is None # or self.base_type.derivation == derivation elif hasattr(other, 'member_types'): return any(self.is_derived(m, derivation) for m in other.member_types) elif self.base_type is None: diff --git a/xmlschema/validators/elements.py b/xmlschema/validators/elements.py index ba5a8b0..4b2d3c7 100644 --- a/xmlschema/validators/elements.py +++ b/xmlschema/validators/elements.py @@ -21,7 +21,7 @@ from ..exceptions import XMLSchemaAttributeError from ..qnames import XSD_ANNOTATION, XSD_GROUP, XSD_SEQUENCE, XSD_ALL, \ XSD_CHOICE, XSD_ATTRIBUTE_GROUP, XSD_COMPLEX_TYPE, XSD_SIMPLE_TYPE, \ XSD_ALTERNATIVE, XSD_ELEMENT, XSD_ANY_TYPE, XSD_UNIQUE, XSD_KEY, \ - XSD_KEYREF, XSI_NIL, XSI_TYPE, XSD_ID, XSD_ERROR, get_qname + XSD_KEYREF, XSI_NIL, XSI_TYPE, XSD_ERROR, get_qname from ..etree import etree_element from ..helpers import get_xsd_derivation_attribute, get_xsd_form_attribute, \ ParticleCounter, strictly_equal @@ -244,15 +244,13 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) if not self.type.is_valid(attrib['default']): msg = "'default' value {!r} is not compatible with the type {!r}" self.parse_error(msg.format(attrib['default'], self.type)) - elif self.xsd_version == '1.0' and ( - 
self.type.name == XSD_ID or self.type.is_derived(self.schema.meta_schema.types['ID'])): + elif self.xsd_version == '1.0' and self.type.is_key(): self.parse_error("'xs:ID' or a type derived from 'xs:ID' cannot has a 'default'") elif 'fixed' in attrib: if not self.type.is_valid(attrib['fixed']): msg = "'fixed' value {!r} is not compatible with the type {!r}" self.parse_error(msg.format(attrib['fixed'], self.type)) - elif self.xsd_version == '1.0' and ( - self.type.name == XSD_ID or self.type.is_derived(self.schema.meta_schema.types['ID'])): + elif self.xsd_version == '1.0' and self.type.is_key(): self.parse_error("'xs:ID' or a type derived from 'xs:ID' cannot has a 'default'") return 0 @@ -963,6 +961,7 @@ class Xsd11Element(XsdElement): if inherited: dummy = etree_element('_dummy_element', attrib=inherited) + dummy.attrib.update(elem.attrib) for alt in filter(lambda x: x.type is not None, self.alternatives): if alt.token is None or alt.test(elem) or alt.test(dummy): diff --git a/xmlschema/validators/groups.py b/xmlschema/validators/groups.py index 2684135..f3fa2ce 100644 --- a/xmlschema/validators/groups.py +++ b/xmlschema/validators/groups.py @@ -486,7 +486,7 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): if 'substitution' in model_element.block \ or xsd_element.type.is_blocked(model_element): raise XMLSchemaValidationError( - model_element, "substitution of %r is blocked" % model_element + model_element, elem, "substitution of %r is blocked" % model_element ) alternatives = () From 8207284c5a4f9f5a53e89aad6be57f8248686ee7 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Fri, 15 Nov 2019 09:25:04 +0100 Subject: [PATCH 32/34] Clean models.py module from unused code --- xmlschema/validators/models.py | 103 ++------------------------------- 1 file changed, 6 insertions(+), 97 deletions(-) diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index eb79aec..c26859d 100644 --- a/xmlschema/validators/models.py +++ 
b/xmlschema/validators/models.py @@ -443,17 +443,6 @@ class ModelVisitor(MutableSequence): :param match: provides current element match. """ - def get_choices(self, occurs): - max_group_occurs = max(1, occurs // (self.min_occurs or 1)) - if self.max_occurs is None: - return [x for x in range(1, max_group_occurs + 1)] - else: - delta_occurs = self.max_occurs - self.min_occurs + 1 - if occurs % max_group_occurs > delta_occurs: - return [] - else: - return [x for x in range(1, max_group_occurs + 1)] - def stop_item(item): """ Stops element or group matching, incrementing current group counter. @@ -496,12 +485,13 @@ class ModelVisitor(MutableSequence): if item is self.group[-1]: if any(occurs[x] for x in self if x is not item): - group_occurs = 1 + self.occurs[self.group] += 1 else: group_occurs = max(1, item_occurs // (item.min_occurs or 1)) if self.group.is_over(group_occurs): group_occurs = self.group.max_occurs - self.occurs[self.group] += max(1, group_occurs) + self.occurs[self.group] += max(1, group_occurs) + return item.is_missing(max(item_occurs, occurs[(item,)])) element, occurs = self.element, self.occurs @@ -513,11 +503,10 @@ class ModelVisitor(MutableSequence): self.match = True if self.group.model == 'all': self.items = (e for e in self.group.iter_elements() if not e.is_over(occurs[e])) - elif self.group.model == 'choice': # or len(self.group) == 1: - if not element.is_over(occurs[element]) or element.is_ambiguous(): - return elif not element.is_over(occurs[element]): return + elif self.group.model == 'choice' and element.is_ambiguous(): + return obj = None try: @@ -535,7 +524,7 @@ class ModelVisitor(MutableSequence): self.group = obj self.items = self.iter_group() self.match = False - occurs[obj] = 0 + occurs[obj] = occurs[(obj,)] = 0 elif obj is not None: # XsdElement or XsdAnyElement @@ -694,83 +683,3 @@ class ModelVisitor(MutableSequence): for name, values in unordered_content.items(): for v in values: yield name, v - - -class Occurrence(object): - """ 
- Class for XSD particles occurrence counting and comparison. - """ - def __init__(self, occurs): - self.occurs = occurs - - def add(self, occurs): - if self.occurs is None: - pass - elif occurs is None: - self.occurs = None - else: - self.occurs += occurs - - def sub(self, occurs): - if self.occurs is None: - pass - elif occurs is None: - self.occurs = 0 - else: - self.occurs -= occurs - - def mul(self, occurs): - if occurs == 0: - self.occurs = 0 - elif not self.occurs: - pass - elif occurs is None: - self.occurs = None - else: - self.occurs *= occurs - - def max(self, occurs): - if self.occurs is None: - pass - elif occurs is None: - self.occurs = occurs - else: - self.occurs = max(self.occurs, occurs) - - def __eq__(self, occurs): - return self.occurs == occurs - - def __ne__(self, occurs): - return self.occurs != occurs - - def __ge__(self, occurs): - if self.occurs is None: - return True - elif occurs is None: - return False - else: - return self.occurs >= occurs - - def __gt__(self, occurs): - if self.occurs is None: - return True - elif occurs is None: - return False - else: - return self.occurs > occurs - - def __le__(self, occurs): - if occurs is None: - return True - elif self.occurs is None: - return False - else: - return self.occurs <= occurs - - def __lt__(self, occurs): - if occurs is None: - return True - elif self.occurs is None: - return False - else: - return self.occurs < occurs From a60532a3ab01d71074964cf3bdd388933783f7ed Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Fri, 15 Nov 2019 17:49:46 +0100 Subject: [PATCH 33/34] Fix sequence model stop criteria --- CHANGELOG.rst | 8 ++++--- xmlschema/documents.py | 2 +- xmlschema/tests/test_models.py | 31 +++++++++++++++++++++------ xmlschema/validators/models.py | 39 +++++++++++++++++++++------------- 4 files changed, 55 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0af63c5..f884662 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,11 +2,12 @@ CHANGELOG 
********* -`v1.0.16`_ (2019-10-XX) +`v1.0.16`_ (2019-11-15) ======================= -* Improved XMLResource with zip files interface and lazy +* Improved XMLResource class for working with compressed files * Fix for validation with XSD wildcards and 'lax' process content -* Fix for issue #1... +* Fix ambiguous items validation for xs:choice and xs:sequence models +* Dozens of W3C's failed tests fixed `v1.0.15`_ (2019-10-13) ======================= @@ -271,3 +272,4 @@ v0.9.6 (2017-05-05) .. _v1.0.13: https://github.com/brunato/xmlschema/compare/v1.0.11...v1.0.13 .. _v1.0.14: https://github.com/brunato/xmlschema/compare/v1.0.13...v1.0.14 .. _v1.0.15: https://github.com/brunato/xmlschema/compare/v1.0.14...v1.0.15 +.. _v1.0.16: https://github.com/brunato/xmlschema/compare/v1.0.15...v1.0.16 diff --git a/xmlschema/documents.py b/xmlschema/documents.py index 439a8c9..02e6471 100644 --- a/xmlschema/documents.py +++ b/xmlschema/documents.py @@ -171,7 +171,7 @@ def from_json(source, schema, path=None, converter=None, json_options=None, **kw :param source: can be a string or a :meth:`read()` supporting file-like object \ containing the JSON document. - :param schema: an :class:`XMLSchema` instance. + :param schema: an :class:`XMLSchema` or an :class:`XMLSchema11` instance. :param path: is an optional XPath expression for selecting the element of the schema \ that matches the data that has to be encoded. For default the first global element of \ the schema is used. 
diff --git a/xmlschema/tests/test_models.py b/xmlschema/tests/test_models.py index b671cd6..a02b9b7 100644 --- a/xmlschema/tests/test_models.py +++ b/xmlschema/tests/test_models.py @@ -551,14 +551,14 @@ class TestModelValidation(XsdValidatorTestCase): - + """) - self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) schema = self.schema_class( """ @@ -577,6 +577,8 @@ class TestModelValidation(XsdValidatorTestCase): self.assertIsNone(schema.validate('')) self.assertIsNone(schema.validate('')) + def test_sequence_model_with_nested_choice_model(self): + schema = self.schema_class( """ @@ -589,7 +591,7 @@ class TestModelValidation(XsdValidatorTestCase): - + @@ -597,9 +599,26 @@ class TestModelValidation(XsdValidatorTestCase): """) - self.assertIsNone(schema.validate('')) - # self.assertIsNone(schema.validate('')) - # self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + self.assertIsNone(schema.validate('')) + + def test_sequence_model_with_optional_elements(self): + schema = self.schema_class( + """ + + + + + + + + + + + """) + + self.assertIsNone(schema.validate('')) def test_choice_model_with_extended_occurs(self): schema = self.schema_class( diff --git a/xmlschema/validators/models.py b/xmlschema/validators/models.py index c26859d..96cfaaf 100644 --- a/xmlschema/validators/models.py +++ b/xmlschema/validators/models.py @@ -453,12 +453,12 @@ class ModelVisitor(MutableSequence): if isinstance(item, ModelGroup): self.group, self.items, self.match = self.pop() - item_occurs = occurs[item] if self.group.model == 'choice': + item_occurs = occurs[item] if not item_occurs: return False - item_max_occurs = occurs[(item,)] or item_occurs + min_group_occurs = max(1, item_occurs // (item.max_occurs or item_occurs)) max_group_occurs = max(1, item_max_occurs // (item.min_occurs or 1)) @@ -468,31 +468,40 @@ class ModelVisitor(MutableSequence): self.items = self.iter_group() self.match = 
False - return item.is_missing(max(item_occurs, occurs[(item,)])) + return item.is_missing(item_max_occurs) elif self.group.model == 'all': return False - elif item_occurs: - self.match = True elif self.match: pass + elif occurs[item]: + self.match = True elif item.is_emptiable(): return False - elif self.group.min_occurs <= occurs[self.group] or self: + elif self.group.min_occurs <= max(occurs[self.group], occurs[(self.group,)]) or self: return stop_item(self.group) else: return True if item is self.group[-1]: - if any(occurs[x] for x in self if x is not item): - self.occurs[self.group] += 1 - else: - group_occurs = max(1, item_occurs // (item.min_occurs or 1)) - if self.group.is_over(group_occurs): - group_occurs = self.group.max_occurs - self.occurs[self.group] += max(1, group_occurs) + for k, item2 in enumerate(self.group, start=1): + item_occurs = occurs[item2] + if not item_occurs: + continue - return item.is_missing(max(item_occurs, occurs[(item,)])) + item_max_occurs = occurs[(item2,)] or item_occurs + if item_max_occurs == 1 or any(not x.is_emptiable() for x in self.group[k:]): + self.occurs[self.group] += 1 + break + + min_group_occurs = max(1, item_occurs // (item2.max_occurs or item_occurs)) + max_group_occurs = max(1, item_max_occurs // (item2.min_occurs or 1)) + + occurs[self.group] += min_group_occurs + occurs[(self.group,)] += max_group_occurs + break + + return item.is_missing(max(occurs[item], occurs[(item,)])) element, occurs = self.element, self.occurs if element is None: @@ -514,7 +523,7 @@ class ModelVisitor(MutableSequence): yield element, occurs[element], [element] while True: - while self.group.is_over(occurs[self.group]): + while self.group.is_over(max(occurs[self.group], occurs[(self.group,)])): stop_item(self.group) obj = next(self.items, None) From 92de835afa9bbd354ead7ad1e4b06051eef085c6 Mon Sep 17 00:00:00 2001 From: Davide Brunato Date: Mon, 18 Nov 2019 06:40:16 +0100 Subject: [PATCH 34/34] Fix defaultOpenContent and 
defaultAttributes parsing --- xmlschema/validators/complex_types.py | 118 ++++++++++++++++---------- xmlschema/validators/elements.py | 4 + xmlschema/validators/groups.py | 5 +- xmlschema/validators/identities.py | 2 +- xmlschema/validators/simple_types.py | 7 +- xmlschema/validators/wildcards.py | 4 +- xmlschema/validators/xsdbase.py | 4 +- 7 files changed, 92 insertions(+), 52 deletions(-) diff --git a/xmlschema/validators/complex_types.py b/xmlschema/validators/complex_types.py index 4010445..edfe0b1 100644 --- a/xmlschema/validators/complex_types.py +++ b/xmlschema/validators/complex_types.py @@ -13,8 +13,9 @@ from __future__ import unicode_literals from ..exceptions import XMLSchemaValueError from ..qnames import XSD_ANNOTATION, XSD_GROUP, XSD_ATTRIBUTE_GROUP, XSD_SEQUENCE, \ XSD_ALL, XSD_CHOICE, XSD_ANY_ATTRIBUTE, XSD_ATTRIBUTE, XSD_COMPLEX_CONTENT, \ - XSD_RESTRICTION, XSD_COMPLEX_TYPE, XSD_EXTENSION, XSD_ANY_TYPE, XSD_SIMPLE_CONTENT, \ - XSD_ANY_SIMPLE_TYPE, XSD_OPEN_CONTENT, XSD_ASSERT, get_qname, local_name + XSD_RESTRICTION, XSD_COMPLEX_TYPE, XSD_EXTENSION, XSD_ANY_TYPE, XSD_OVERRIDE, \ + XSD_SIMPLE_CONTENT, XSD_ANY_SIMPLE_TYPE, XSD_OPEN_CONTENT, XSD_ASSERT, \ + get_qname, local_name from ..helpers import get_xsd_derivation_attribute from .exceptions import XMLSchemaValidationError, XMLSchemaDecodeError @@ -52,6 +53,8 @@ class XsdComplexType(XsdType, ValidationMixin): mixed = False assertions = () open_content = None + content_type = None + default_open_content = None _block = None _ADMITTED_TAGS = {XSD_COMPLEX_TYPE, XSD_RESTRICTION} @@ -138,6 +141,10 @@ class XsdComplexType(XsdType, ValidationMixin): elif content_elem.tag in {XSD_GROUP, XSD_SEQUENCE, XSD_ALL, XSD_CHOICE}: self.content_type = self.schema.BUILDERS.group_class(content_elem, self.schema, self) + default_open_content = self.default_open_content + if default_open_content and \ + (self.mixed or self.content_type or default_open_content.applies_to_empty): + self.open_content = 
default_open_content self._parse_content_tail(elem) elif content_elem.tag == XSD_SIMPLE_CONTENT: @@ -179,6 +186,7 @@ class XsdComplexType(XsdType, ValidationMixin): self.base_type = base_type elif self.redefine: self.base_type = self.redefine + self.open_content = None if derivation_elem.tag == XSD_RESTRICTION: self._parse_complex_content_restriction(derivation_elem, base_type) @@ -344,9 +352,11 @@ class XsdComplexType(XsdType, ValidationMixin): "derived an empty content from base type that has not empty content.", elem ) - if not self.open_content and self.schema.default_open_content: - if content_type or self.schema.default_open_content.applies_to_empty: - self.open_content = self.schema.default_open_content + if not self.open_content: + default_open_content = self.default_open_content + if default_open_content and \ + (self.mixed or content_type or default_open_content.applies_to_empty): + self.open_content = default_open_content if self.open_content and content_type and \ not self.open_content.is_restriction(base_type.open_content): @@ -453,6 +463,8 @@ class XsdComplexType(XsdType, ValidationMixin): def is_empty(self): if self.name == XSD_ANY_TYPE: return False + elif self.open_content and self.open_content.mode != 'none': + return False return self.content_type.is_empty() def is_emptiable(self): @@ -571,6 +583,10 @@ class XsdComplexType(XsdType, ValidationMixin): :return: yields a 3-tuple (simple content, complex content, attributes) containing \ the decoded parts, eventually preceded by a sequence of validation or decoding errors. 
""" + if self.is_empty() and elem.text: + reason = "character data between child elements not allowed because the type's content is empty" + yield self.validation_error(validation, reason, elem, **kwargs) + # XSD 1.1 assertions for assertion in self.assertions: for error in assertion(elem, **kwargs): @@ -665,6 +681,32 @@ class Xsd11ComplexType(XsdComplexType): _CONTENT_TAIL_TAGS = {XSD_ATTRIBUTE_GROUP, XSD_ATTRIBUTE, XSD_ANY_ATTRIBUTE, XSD_ASSERT} + @property + def default_attributes(self): + if self.redefine is not None: + return self.schema.default_attributes + + for child in filter(lambda x: x.tag == XSD_OVERRIDE, self.schema.root): + if self.elem in child: + schema = self.schema.includes[child.attrib['schemaLocation']] + if schema.override is self.schema: + return schema.default_attributes + else: + return self.schema.default_attributes + + @property + def default_open_content(self): + if self.parent is not None: + return self.schema.default_open_content + + for child in filter(lambda x: x.tag == XSD_OVERRIDE, self.schema.root): + if self.elem in child: + schema = self.schema.includes[child.attrib['schemaLocation']] + if schema.override is self.schema: + return schema.default_open_content + else: + return self.schema.default_open_content + def _parse(self): super(Xsd11ComplexType, self)._parse() @@ -677,19 +719,12 @@ class Xsd11ComplexType(XsdComplexType): # Add open content to complex content type if isinstance(self.content_type, XsdGroup): - open_content = self.open_content - if open_content is not None: - pass - elif self.schema.default_open_content is not None: - if self.content_type or self.schema.default_open_content.applies_to_empty: - open_content = self.schema.default_open_content - - if open_content is None: - pass - elif open_content.mode == 'interleave': - self.content_type.interleave = self.content_type.suffix = open_content.any_element - elif open_content.mode == 'suffix': - self.content_type.suffix = open_content.any_element + if 
self.open_content is None: + assert self.content_type.interleave is None and self.content_type.suffix is None + elif self.open_content.mode == 'interleave': + self.content_type.interleave = self.content_type.suffix = self.open_content.any_element + elif self.open_content.mode == 'suffix': + self.content_type.suffix = self.open_content.any_element # Add inheritable attributes if hasattr(self.base_type, 'attributes'): @@ -707,19 +742,12 @@ class Xsd11ComplexType(XsdComplexType): self.default_attributes_apply = True # Add default attributes - if self.redefine is None: - default_attributes = self.schema.default_attributes - else: - default_attributes = self.redefine.schema.default_attributes - - if default_attributes is None: - pass - elif self.default_attributes_apply and not self.is_override(): - if self.redefine is None and any(k in self.attributes for k in default_attributes): - self.parse_error("at least a default attribute is already declared in the complex type") - self.attributes.update( - (k, v) for k, v in default_attributes.items() if k not in self.attributes - ) + if self.default_attributes_apply: + default_attributes = self.default_attributes + if default_attributes is not None: + if self.redefine is None and any(k in self.attributes for k in default_attributes): + self.parse_error("at least a default attribute is already declared in the complex type") + self.attributes.update((k, v) for k, v in default_attributes.items()) def _parse_complex_content_extension(self, elem, base_type): # Complex content extension with simple base is forbidden XSD 1.1. 
@@ -744,19 +772,6 @@ class Xsd11ComplexType(XsdComplexType): else: group_elem = None - if not self.open_content: - if self.schema.default_open_content: - self.open_content = self.schema.default_open_content - elif getattr(base_type, 'open_content', None): - self.open_content = base_type.open_content - - try: - if self.open_content and not base_type.open_content.is_restriction(self.open_content): - msg = "{!r} is not an extension of the base type {!r}" - self.parse_error(msg.format(self.open_content, base_type.open_content)) - except AttributeError: - pass - if not base_type.content_type: if not base_type.mixed: # Empty element-only model extension: don't create a nested sequence group. @@ -831,6 +846,21 @@ class Xsd11ComplexType(XsdComplexType): else: self.content_type = self.schema.create_empty_content_group(self) + if not self.open_content: + default_open_content = self.default_open_content + if default_open_content and \ + (self.mixed or self.content_type or default_open_content.applies_to_empty): + self.open_content = default_open_content + elif base_type.open_content: + self.open_content = base_type.open_content + + if base_type.open_content and self.open_content is not base_type.open_content: + if self.open_content.mode == 'none': + self.open_content = base_type.open_content + elif not base_type.open_content.is_restriction(self.open_content): + msg = "{!r} is not an extension of the base type {!r}" + self.parse_error(msg.format(self.open_content, base_type.open_content)) + self._parse_content_tail(elem, derivation='extension', base_attributes=base_type.attributes) def _parse_content_tail(self, elem, **kwargs): diff --git a/xmlschema/validators/elements.py b/xmlschema/validators/elements.py index 4b2d3c7..636c537 100644 --- a/xmlschema/validators/elements.py +++ b/xmlschema/validators/elements.py @@ -531,6 +531,10 @@ class XsdElement(XsdComponent, ValidationMixin, ParticleMixin, ElementPathMixin) yield converter.element_decode(element_data, self, level) return 
+ if xsd_type.is_empty() and elem.text: + reason = "character data is not allowed because the type's content is empty" + yield self.validation_error(validation, reason, elem, **kwargs) + if not xsd_type.has_simple_content(): for assertion in xsd_type.assertions: for error in assertion(elem, **kwargs): diff --git a/xmlschema/validators/groups.py b/xmlschema/validators/groups.py index f3fa2ce..23ccbad 100644 --- a/xmlschema/validators/groups.py +++ b/xmlschema/validators/groups.py @@ -526,7 +526,8 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): if model_element is not xsd_element and model_element.block: for derivation in model_element.block.split(): - if xsd_type.is_derived(model_element.type, derivation): + if xsd_type is not model_element.type and \ + xsd_type.is_derived(model_element.type, derivation): reason = "usage of %r with type %s is blocked by head element" raise XMLSchemaValidationError(self, reason % (xsd_element, derivation)) @@ -578,7 +579,7 @@ class XsdGroup(XsdComponent, ModelGroup, ValidationMixin): if len(self) == 1 and isinstance(self[0], XsdAnyElement): pass # [XsdAnyElement()] equals to an empty complexType declaration else: - reason = "character data between child elements not allowed!" 
+ reason = "character data between child elements not allowed" yield self.validation_error(validation, reason, elem, **kwargs) cdata_index = 0 # Do not decode CDATA diff --git a/xmlschema/validators/identities.py b/xmlschema/validators/identities.py index 1e51d95..65b4fd3 100644 --- a/xmlschema/validators/identities.py +++ b/xmlschema/validators/identities.py @@ -201,7 +201,7 @@ class XsdIdentity(XsdComponent): yield XMLSchemaValidationError(self, e, "{!r} is not an element".format(xsd_element)) xsd_fields = self.get_fields(xsd_element) - if all(fld is None for fld in xsd_fields): + if not xsd_fields or all(fld is None for fld in xsd_fields): continue try: diff --git a/xmlschema/validators/simple_types.py b/xmlschema/validators/simple_types.py index 182015a..e6e5a3d 100644 --- a/xmlschema/validators/simple_types.py +++ b/xmlschema/validators/simple_types.py @@ -334,6 +334,10 @@ class XsdSimpleType(XsdType, ValidationMixin): else: return self.base_type.is_derived(other, derivation) + def is_dynamic_consistent(self, other): + return other is self.any_type or other is self.any_simple_type or self.is_derived(other) or \ + hasattr(other, 'member_types') and any(self.is_derived(mt) for mt in other.member_types) + def normalize(self, text): """ Normalize and restrict value-space with pre-lexical and lexical facets. 
@@ -867,7 +871,8 @@ class XsdUnion(XsdSimpleType): return all(mt.is_list() for mt in self.member_types) def is_dynamic_consistent(self, other): - return other.is_derived(self) or hasattr(other, 'member_types') and \ + return other is self.any_type or other is self.any_simple_type or \ + other.is_derived(self) or hasattr(other, 'member_types') and \ any(mt1.is_derived(mt2) for mt1 in other.member_types for mt2 in self.member_types) def iter_components(self, xsd_classes=None): diff --git a/xmlschema/validators/wildcards.py b/xmlschema/validators/wildcards.py index fe2e448..82c2071 100644 --- a/xmlschema/validators/wildcards.py +++ b/xmlschema/validators/wildcards.py @@ -782,8 +782,8 @@ class XsdOpenContent(XsdComponent): return True def is_restriction(self, other): - if self.mode == 'none' or other is None or other.mode == 'none': - return True + if other is None or other.mode == 'none': + return self.mode == 'none' elif self.mode == 'interleave' and other.mode == 'suffix': return False else: diff --git a/xmlschema/validators/xsdbase.py b/xmlschema/validators/xsdbase.py index aab0b89..a1af296 100644 --- a/xmlschema/validators/xsdbase.py +++ b/xmlschema/validators/xsdbase.py @@ -701,8 +701,8 @@ class XsdType(XsdComponent): return any(self.is_derived(xsd_type, derivation) for derivation in block) def is_dynamic_consistent(self, other): - return self.is_derived(other) or hasattr(other, 'member_types') and \ - any(self.is_derived(mt) for mt in other.member_types) + return other is self.any_type or self.is_derived(other) or \ + hasattr(other, 'member_types') and any(self.is_derived(mt) for mt in other.member_types) def is_key(self): return self.name == XSD_ID or self.is_derived(self.maps.types[XSD_ID])