From 4085e8daa5600fd880cdc02b768dcfe3d6f91fcd Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 5 Jul 2019 00:06:34 +1000 Subject: [PATCH 1/2] Add failing test when reloading xml from zipfile content. This problem comes from the way `XMLResource` re-opens files if it wants to get more information from them. `XMLResource` is deriving the source location of file like objects from their `name` attr. When that attr doesn't correspond to a file on disk (zipfile contents, Django files) an error is raised when `XMLResource.open()` is called. --- xmlschema/tests/test_resources.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 038e05f..4022c08 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -15,6 +15,7 @@ This module runs tests concerning resources. import unittest import os import platform +import zipfile try: from pathlib import PureWindowsPath, PurePath @@ -26,7 +27,7 @@ from xmlschema import ( load_xml_resource, XMLResource, XMLSchemaURLError ) from xmlschema.tests import XMLSchemaTestCase, SKIP_REMOTE_TESTS -from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO +from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO, BytesIO from xmlschema.etree import ElementTree, PyElementTree, lxml_etree, is_etree_element, etree_element, py_etree_element @@ -383,6 +384,27 @@ class TestResources(XMLSchemaTestCase): for schema in vh_schema.maps.iter_schemas(): self.assertIsInstance(schema.root, etree_element) + def test_fid_with_name_attr(self): + """XMLResource gets correct data when passed a file like object + with a name attribute that isn't on disk. + + These file descriptors appear when working with the contents from a + zip using the zipfile module and with Django files in some + instances. + """ + zipname = "not__on____disk.xml" + bytes_fid = BytesIO() + with zipfile.ZipFile(bytes_fid, 'w') as zf: + with open(self.vh_xml_file) as fid: + zf.writestr(zipname, fid.read()) + + bytes_fid.seek(0) + with zipfile.ZipFile(bytes_fid) as zf: + with zf.open(zipname) as fid: + resource = XMLResource(fid) + # This should not cause an error. + resource.load() + if __name__ == '__main__': from xmlschema.tests import print_test_header From 61e1f609fcfe1b4acb44d75c416227ef9b56474d Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 5 Jul 2019 22:23:28 +1000 Subject: [PATCH 2/2] Stop reading `name` and `url` from file object attrs These attrs shouldn't be used to reopen the file object as: - they may not reflect the original file or resource (file objects opened from a zipfile will have a name that doesn't correspond to any file on disk). - Depending on how the fid was opened, these attrs could be crafted to read arbitrary files from disk. If the creator of a .zip gives a file inside the zip file a path of `/etc/passwd` we may end up opening that file. Instead of reopening the file, we keep track of the file object and seek to the beginning of the file. This means (for most operations) the file object must be seekable. On Python 2 urlopen returns an unseekable object for 'file://' paths. One test had to be skipped in Python 2 for this reason. --- xmlschema/resources.py | 98 ++++++++++++++++++------------- xmlschema/tests/test_resources.py | 61 +++++++++++++------ 2 files changed, 100 insertions(+), 59 deletions(-) diff --git a/xmlschema/resources.py b/xmlschema/resources.py index b2898f7..be9f1e8 100644 --- a/xmlschema/resources.py +++ b/xmlschema/resources.py @@ -245,7 +245,7 @@ class XMLResource(object): if base_url is not None and not isinstance(base_url, string_base_type): raise XMLSchemaValueError(u"'base_url' argument has to be a string: {!r}".format(base_url)) - self._root = self._document = self._url = self._text = None + self._root = self._document = self._url = self._text = self._fid = None self._base_url = base_url self.defuse = defuse self.timeout = timeout @@ -274,7 +274,7 @@ class XMLResource(object): def __setattr__(self, name, value): if name == 'source': - self._root, self._document, self._text, self._url = self._fromsource(value) + self._root, self._document, self._text, self._url, self._fid = self._fromsource(value) elif name == 'defuse' and value not in DEFUSE_MODES: raise XMLSchemaValueError(u"'defuse' attribute: {!r} is not a defuse mode.".format(value)) elif name == 'timeout' and (not isinstance(value, int) or value <= 0): @@ -287,16 +287,16 @@ class XMLResource(object): url, lazy = None, self._lazy if is_etree_element(source): self._lazy = False - return source, None, None, None # Source is already an Element --> nothing to load + return source, None, None, None, None # Source is already an Element --> nothing to load elif isinstance(source, string_base_type): _url, self._url = self._url, None try: if lazy: # check if source is a string containing a valid XML root for _, root in self.iterparse(StringIO(source), events=('start',)): - return root, None, source, None + return root, None, source, None, None else: - return self.fromstring(source), None, source, None + return self.fromstring(source), None, source, None, None except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError): if '\n' in source: raise @@ -309,33 +309,24 @@ class XMLResource(object): try: if lazy: for _, root in self.iterparse(source, events=('start',)): - return root, None, source.getvalue(), None + return root, None, source.getvalue(), None, None else: document = self.parse(source) - return document.getroot(), document, source.getvalue(), None + return document.getroot(), document, source.getvalue(), None, None finally: self._url = _url elif hasattr(source, 'read'): - # source should be a file-like object + _url, self._url = self._url, url try: - if hasattr(source, 'url'): - url = source.url + if lazy: + for _, root in self.iterparse(source, events=('start',)): + return root, None, None, url, source else: - url = normalize_url(source.name) - except AttributeError: - pass - else: - _url, self._url = self._url, url - try: - if lazy: - for _, root in self.iterparse(source, events=('start',)): - return root, None, None, url - else: - document = self.parse(source) - return document.getroot(), document, None, url - finally: - self._url = _url + document = self.parse(source) + return document.getroot(), document, None, url, source + finally: + self._url = _url else: # Try ElementTree object at last @@ -346,7 +337,7 @@ class XMLResource(object): else: if is_etree_element(root): self._lazy = False - return root, source, None, None + return root, source, None, None, None if url is None: raise XMLSchemaTypeError( @@ -359,11 +350,11 @@ class XMLResource(object): try: if lazy: for _, root in self.iterparse(resource, events=('start',)): - return root, None, None, url + return root, None, None, url, None else: document = self.parse(resource) root = document.getroot() - return root, document, None, url + return root, document, None, url, None finally: self._url = _url resource.close() @@ -482,6 +473,10 @@ class XMLResource(object): def open(self): """Returns a opened resource reader object for the instance URL.""" + if self._fid is not None: + self._fid.seek(0) + return self._fid + if self._url is None: raise XMLSchemaValueError("can't open, the resource has no URL associated.") try: @@ -494,7 +489,7 @@ class XMLResource(object): Loads the XML text from the data source. If the data source is an Element the source XML text can't be retrieved. """ - if self._url is None: + if self._url is None and self._fid is None: return # Created from Element or text source --> already loaded resource = self.open() @@ -503,16 +498,25 @@ class XMLResource(object): except (OSError, IOError) as err: raise XMLSchemaOSError("cannot load data from %r: %s" % (self._url, err)) finally: - resource.close() + # We don't want to close the file obj if it wasn't originally + # opened by `XMLResource`. That is the concern of the code + # where the file obj came from. + if self._fid is None: + resource.close() - try: - self._text = data.decode('utf-8') if PY3 else data.encode('utf-8') - except UnicodeDecodeError: - if PY3: - self._text = data.decode('iso-8859-1') - else: - with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f: - self._text = f.read().encode('iso-8859-1') + if isinstance(data, bytes): + try: + text = data.decode('utf-8') if PY3 else data.encode('utf-8') + except UnicodeDecodeError: + if PY3: + text = data.decode('iso-8859-1') + else: + with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f: + text = f.read().encode('iso-8859-1') + else: + text = data + + self._text = text def is_lazy(self): """Returns `True` if the XML resource is lazy.""" @@ -528,6 +532,9 @@ class XMLResource(object): for elem in self._root.iter(tag): yield elem return + elif self._fid is not None: + self._fid.seek(0) + resource = self._fid elif self._url is not None: resource = urlopen(self._url, timeout=self.timeout) else: @@ -539,7 +546,8 @@ class XMLResource(object): yield elem elem.clear() finally: - resource.close() + if self._fid is None: + resource.close() def iterfind(self, path=None, namespaces=None): """XML resource tree iterfind selector.""" @@ -550,6 +558,9 @@ class XMLResource(object): for e in iter_select(self._root, path, namespaces, strict=False): yield e return + elif self._fid is not None: + self._fid.seek(0) + resource = self._fid elif self._url is not None: resource = urlopen(self._url, timeout=self.timeout) else: @@ -587,7 +598,8 @@ class XMLResource(object): elif level == 0: elem.clear() finally: - resource.close() + if self._fid is None: + resource.close() def iter_location_hints(self): """Yields schema location hints from the XML tree.""" @@ -639,7 +651,7 @@ class XMLResource(object): local_root = self.root.tag[0] != '{' nsmap = {} - if self._url is not None: + if self._url is not None or self._fid is not None: resource = self.open() try: for event, node in self.iterparse(resource, events=('start-ns', 'end')): @@ -650,7 +662,11 @@ class XMLResource(object): except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError): pass finally: - resource.close() + # We don't want to close the file obj if it wasn't + # originally opened by `XMLResource`. That is the concern + # of the code where the file obj came from. + if self._fid is None: + resource.close() elif isinstance(self._text, string_base_type): try: for event, node in self.iterparse(StringIO(self._text), events=('start-ns', 'end')): diff --git a/xmlschema/tests/test_resources.py b/xmlschema/tests/test_resources.py index 4022c08..b259c5c 100644 --- a/xmlschema/tests/test_resources.py +++ b/xmlschema/tests/test_resources.py @@ -15,7 +15,6 @@ This module runs tests concerning resources. import unittest import os import platform -import zipfile try: from pathlib import PureWindowsPath, PurePath @@ -27,7 +26,7 @@ from xmlschema import ( load_xml_resource, XMLResource, XMLSchemaURLError ) from xmlschema.tests import XMLSchemaTestCase, SKIP_REMOTE_TESTS -from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO, BytesIO +from xmlschema.compat import urlopen, urlsplit, uses_relative, StringIO from xmlschema.etree import ElementTree, PyElementTree, lxml_etree, is_etree_element, etree_element, py_etree_element @@ -196,17 +195,22 @@ class TestResources(XMLSchemaTestCase): resource.load() self.assertIsNone(resource.text) + @unittest.skipIf( + platform.python_version_tuple()[0] < '3', + "Skip: urlopen on Python 2 can't seek 'file://' paths." + ) def test_xml_resource_from_resource(self): xml_file = urlopen('file://{}'.format(add_leading_slash(self.vh_xml_file))) try: resource = XMLResource(xml_file) self.assertEqual(resource.source, xml_file) self.assertEqual(resource.root.tag, '{http://example.com/vehicles}vehicles') - self.check_url(resource.url, self.vh_xml_file) + self.assertIsNone(resource.url) self.assertIsNone(resource.document) self.assertIsNone(resource.text) resource.load() self.assertTrue(resource.text.startswith('