passerelle/passerelle/utils/xml.py

383 lines
15 KiB
Python

# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2018 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import copy
from collections import OrderedDict
import jsonschema
from .logging import ignore_loggers
with ignore_loggers('xmlschema'):
import xmlschema
try:
import xmlschema.names as xmlschema_names # xmschema >= 1.1
except ModuleNotFoundError:
import xmlschema.qnames as xmlschema_names # xmschema < 1.1
def text_content(node):
    """Return the concatenated text of *node* and of all its descendants.

    The element's own text comes first; then, for every child in document
    order, the child's text content followed by the child's tail. Intended
    as an equivalent of xmlNodeGetContent from libxml.

    An empty string is returned when *node* is None.
    """
    if node is None:
        return ''

    def fragments(element):
        # Depth-first walk yielding every non-empty text and tail fragment.
        if element.text:
            yield element.text
        for sub in element:
            yield from fragments(sub)
            if sub.tail:
                yield sub.tail

    return ''.join(fragments(node))
def to_json(root):
    """Convert an XML document (a rooted tree) into a dictionary compatible
    with JSON serialization, following these rules:

    - root is converted into a dictionary whose keys are the tag names of
      its children,
    - a child without children is considered to be only text and is
      converted to a JSON string (it is skipped when it has no text),
    - a child with children is converted to an array; each of its own
      children contributes its text when it is a leaf with text, otherwise
      the result of a new XML to JSON conversion rooted at it (empty
      results are skipped).

    Ex.:

        <root>
            <child1>wtv</child1>
            <rows>
                <row>
                    <child2>2</child2>
                </row>
                <row>
                    <child3>3</child3>
                </row>
            </rows>
        </root>

    is converted to:

        {
            "child1": "wtv",
            "rows": [
                {"child2": "2"},
                {"child3": "3"}
            ]
        }
    """
    result = {}
    for node in root:
        if len(node) > 0:
            # Container node: accumulate rows under its tag; containers
            # sharing the same tag feed the same array.
            rows = result.setdefault(node.tag, [])
            for item in node:
                is_text_leaf = len(item) == 0 and item.text
                converted = item.text if is_text_leaf else to_json(item)
                if converted:
                    rows.append(converted)
            continue
        # Leaf node: with no children its whole text content is its own
        # text, kept only when non-empty.
        if node.text:
            result[node.tag] = node.text
    return result
class TransformConverter(xmlschema.UnorderedConverter):
    """UnorderedConverter letting an optional transformer rewrite values.

    The transformer is any object which may expose:

    - ``type_map``: a mapping from XSD type names to short names,
    - ``decode_<short name>`` / ``encode_<short name>`` callables.

    When the type name of the element being processed is a key of
    ``type_map`` and the matching callable exists, the value goes through
    that callable before being handed over to the parent converter.
    """

    def __init__(self, *args, **kwargs):
        # 'transformer' is specific to this class: take it out of the
        # keywords before delegating to the parent constructor.
        self.transformer = kwargs.pop('transformer', None)
        super().__init__(*args, **kwargs)

    def copy(self, **kwargs):
        """Return a copy of the converter, keeping the current transformer
        unless a ``transformer`` keyword is given."""
        transformer = kwargs.pop('transformer', self.transformer)
        duplicate = super().copy(**kwargs)
        duplicate.transformer = transformer
        return duplicate

    def _apply_transformer(self, direction, value, xsd_element):
        """Return *value* passed through the transformer's
        ``<direction>_<short name>`` callable, or *value* unchanged when
        there is no transformer, no mapping for the element's type or no
        such callable."""
        transformer = self.transformer
        if not transformer:
            return value
        type_name = xsd_element.type.name
        if type_name not in getattr(transformer, 'type_map', {}):
            return value
        method_name = '%s_%s' % (direction, transformer.type_map[type_name])
        if not hasattr(transformer, method_name):
            return value
        return getattr(transformer, method_name)(value)

    def element_decode(self, data, xsd_element, *args):
        """Apply the transformer's decode hook, if any, then decode with
        the parent converter."""
        data = self._apply_transformer('decode', data, xsd_element)
        return super().element_decode(data, xsd_element, *args)

    def element_encode(self, obj, xsd_element, *args):
        """Apply the transformer's encode hook, if any, then encode with
        the parent converter."""
        obj = self._apply_transformer('encode', obj, xsd_element)
        return super().element_encode(obj, xsd_element, *args)
class JSONSchemaFromXMLSchema:
    """Derive a JSON schema from an XML schema and convert data both ways.

    The JSON schema describing ``root_element`` is computed at construction
    time and stored in ``json_schema``. ``validate`` checks an instance
    against it, while ``encode`` and ``decode`` delegate to the XML schema
    through TransformConverter, with this object as transformer.

    Subclasses may define a ``type_map`` attribute mapping XSD type names to
    short names, together with ``schema_<short name>`` attributes callable
    without argument from the class and returning the JSON schema to use for
    the mapped type.
    """

    # JSON schema primitive type used for each supported XSD builtin type.
    SIMPLE_TYPE_MAPPING = {
        xmlschema_names.XSD_STRING: 'string',
        xmlschema_names.XSD_INTEGER: 'integer',
        xmlschema_names.XSD_INT: 'integer',
        xmlschema_names.XSD_POSITIVE_INTEGER: 'integer',
        xmlschema_names.XSD_UNSIGNED_INT: 'integer',
        xmlschema_names.XSD_NON_NEGATIVE_INTEGER: 'integer',
        xmlschema_names.XSD_BOOLEAN: 'boolean',
        xmlschema_names.XSD_DOUBLE: 'number',
        xmlschema_names.XSD_DECIMAL: 'number',
    }

    def __init__(self, xml_schema, root_element):
        """Load the XML schema and build the JSON schema of *root_element*.

        *xml_schema* is either an ``xmlschema.XMLSchema`` instance, used as
        is, or any source accepted by the ``xmlschema.XMLSchema``
        constructor. *root_element* is the name of the global element used
        as root of the documents.
        """
        if not isinstance(xml_schema, xmlschema.XMLSchema):
            with ignore_loggers('xmlschema'):
                xml_schema = xmlschema.XMLSchema(xml_schema, converter=TransformConverter(transformer=self))
        self.xml_schema = xml_schema
        self.root_element = root_element
        self.json_schema = {
            'type': 'object',
            'properties': {
                root_element: self.element_to_jsonschema(xml_schema.elements[root_element]),
            },
            'required': [root_element],
            'additionalProperties': False,
        }

    @classmethod
    def simpletype_to_jsonschema(cls, simple_type):
        """Return the JSON schema matching an XSD simple type.

        Types found in ``type_map`` with a matching ``schema_<short name>``
        attribute are delegated to it. Otherwise builtin atomic types,
        atomic restrictions and unions are handled here.

        Raises NotImplementedError for any type, facet or white space mode
        which is not supported.
        """
        assert isinstance(simple_type, xmlschema.validators.XsdSimpleType)

        def add_patterns(schema):
            # An XSD pattern facet always matches the whole value whereas
            # the JSON schema 'pattern' keyword is not implicitly anchored;
            # group the alternatives and anchor them so the JSON schema is
            # not looser than the XML schema.
            if simple_type.patterns:
                schema['pattern'] = '^(?:%s)$' % '|'.join(simple_type.patterns.regexps)

        if simple_type.name in getattr(cls, 'type_map', {}):
            mapped = cls.type_map[simple_type.name]
            if hasattr(cls, 'schema_%s' % mapped):
                return getattr(cls, 'schema_%s' % mapped)()

        if isinstance(simple_type, xmlschema.validators.XsdAtomicBuiltin):
            if (
                simple_type.min_length
                or simple_type.max_length
                or simple_type.white_space not in ('collapse', 'preserve')
            ):
                raise NotImplementedError(simple_type)

            if simple_type.name in cls.SIMPLE_TYPE_MAPPING:
                schema = {'type': cls.SIMPLE_TYPE_MAPPING[simple_type.name]}
            else:
                raise NotImplementedError(simple_type)
            add_patterns(schema)
            return schema

        if isinstance(simple_type, xmlschema.validators.XsdAtomicRestriction):
            if simple_type.white_space not in ('collapse', 'preserve'):
                raise NotImplementedError(simple_type)
            # Start from the schema of the base type and add one keyword
            # per facet of the restriction.
            schema = OrderedDict(cls.simpletype_to_jsonschema(simple_type.base_type))
            for validator in simple_type.validators:
                if isinstance(validator, xmlschema.validators.XsdEnumerationFacets):
                    schema['enum'] = validator.enumeration
                elif (
                    isinstance(validator, xmlschema.validators.XsdMinLengthFacet)
                    and simple_type.base_type.name == xmlschema_names.XSD_STRING
                ):
                    schema['minLength'] = validator.value
                elif (
                    isinstance(validator, xmlschema.validators.XsdMaxLengthFacet)
                    and simple_type.base_type.name == xmlschema_names.XSD_STRING
                ):
                    schema['maxLength'] = validator.value
                elif (
                    isinstance(validator, xmlschema.validators.XsdLengthFacet)
                    and simple_type.base_type.name == xmlschema_names.XSD_STRING
                ):
                    # A fixed length is expressed as equal bounds.
                    schema['minLength'] = validator.value
                    schema['maxLength'] = validator.value
                elif isinstance(validator, xmlschema.validators.XsdMinInclusiveFacet):
                    schema['minimum'] = validator.value
                elif isinstance(validator, xmlschema.validators.XsdMaxInclusiveFacet):
                    schema['maximum'] = validator.value
                elif (
                    isinstance(validator, xmlschema.validators.XsdTotalDigitsFacet)
                    and simple_type.base_type.name == xmlschema_names.XSD_DECIMAL
                ):
                    schema['exclusiveMaximum'] = 10**validator.value
                elif (
                    isinstance(validator, xmlschema.validators.XsdFractionDigitsFacet)
                    and simple_type.base_type.name == xmlschema_names.XSD_DECIMAL
                ):
                    schema['multipleOf'] = 1 / 10.0**validator.value
                else:
                    raise NotImplementedError(validator)
            add_patterns(schema)
            return schema

        if isinstance(simple_type, xmlschema.validators.XsdUnion):
            # NOTE(review): 'oneOf' requires exactly one member schema to
            # match; member types with overlapping value spaces would make
            # it reject some values - confirm whether 'anyOf' is wanted.
            return {'oneOf': [cls.simpletype_to_jsonschema(m) for m in simple_type.member_types]}

        raise NotImplementedError(simple_type)

    @classmethod
    def attributegroup_to_jsonschema(cls, attributegroup, schema, required=None):
        """Add the attributes of *attributegroup* as properties of *schema*.

        *schema* is modified in place. Prohibited attributes are skipped.
        Non-optional attributes are added to the 'required' list only when
        *required* is not None; the calls made from this class do not pass
        it.
        """
        assert isinstance(attributegroup, xmlschema.validators.XsdAttributeGroup)

        properties = schema.setdefault('properties', OrderedDict())
        for component in attributegroup.values():
            if component.use == 'prohibited':
                continue
            if required is not None and component.use != 'optional':
                if component.name not in schema.get('required', []):
                    schema.setdefault('required', []).append(component.name)
            properties[component.name] = cls.simpletype_to_jsonschema(component.type)

    @classmethod
    def group_to_alternatives(cls, group, alternatives=None):
        """Expand *group* into the list of its possible element sequences.

        Each alternative is a list of XsdElement. When *alternatives* is
        given it is extended in place (and also returned), otherwise the
        expansion starts from a single empty alternative.

        Raises NotImplementedError for an unsupported group model.
        """
        # Only replace a missing argument: a caller-provided list, even an
        # empty one, must stay the object being updated in place.
        if alternatives is None:
            alternatives = [[]]

        if group.model == 'choice':
            cls.choice_to_alternatives(group, alternatives=alternatives)
        elif group.model in ('sequence', 'all'):
            cls.sequence_to_alternatives(group, alternatives=alternatives)
        else:
            raise NotImplementedError(group)
        return alternatives

    @classmethod
    def choice_to_alternatives(cls, group, alternatives):
        """Replace, in place, the content of *alternatives* by one set of
        alternatives per branch of the choice *group*, each branch extending
        copies of the alternatives present on entry."""
        new_alternatives = alternatives
        # Keep a snapshot of the incoming alternatives, then empty the
        # caller's list so that it can receive the expanded ones.
        alternatives = list(alternatives)
        new_alternatives[:] = []

        for component in group:
            if isinstance(component, xmlschema.validators.XsdElement):
                for alternative in alternatives:
                    alternative = alternative + [component]
                    new_alternatives.append(alternative)
            elif isinstance(component, xmlschema.validators.XsdGroup):
                sub_alternatives = [list(alternative) for alternative in alternatives]
                cls.group_to_alternatives(component, alternatives=sub_alternatives)
                for alternative in sub_alternatives:
                    new_alternatives.append(alternative)
            else:
                raise NotImplementedError(component)

    @classmethod
    def sequence_to_alternatives(cls, group, alternatives):
        """Append, in place, every component of the sequence *group* to each
        of the *alternatives*; nested groups are expanded recursively."""
        for component in group:
            if isinstance(component, xmlschema.validators.XsdElement):
                for alternative in alternatives:
                    alternative.append(component)
            elif isinstance(component, xmlschema.validators.XsdGroup):
                cls.group_to_alternatives(component, alternatives=alternatives)
            else:
                raise NotImplementedError(component)

    @classmethod
    def group_to_jsonschema(cls, group, schema, base_schema=None):
        """Describe the content model *group* inside *schema*, in place.

        With a single alternative its elements become properties of
        *schema*. With several, *schema* is emptied and receives a 'oneOf'
        list holding one copy of the previous schema per alternative.

        *base_schema* is ignored: any value passed is discarded.
        """
        assert isinstance(group, xmlschema.validators.XsdGroup)

        alternatives = cls.group_to_alternatives(group)

        def fill_schema_with_alternative(schema, alternative):
            for component in alternative:
                properties = schema.setdefault('properties', OrderedDict())
                properties[component.name] = cls.element_to_jsonschema(component)
                if component.min_occurs > 0 and component.name not in schema.get('required', []):
                    schema.setdefault('required', []).append(component.name)

        if len(alternatives) == 1:
            fill_schema_with_alternative(schema, alternatives[0])
        elif len(alternatives) > 1:
            # Everything already set on the schema (type, attributes...) is
            # moved into each alternative.
            template = copy.deepcopy(schema)
            schema.clear()
            one_of = []
            schema['oneOf'] = one_of
            for alternative in alternatives:
                new_schema = copy.deepcopy(template)
                fill_schema_with_alternative(new_schema, alternative)
                one_of.append(new_schema)

    @classmethod
    def type_to_jsonschema(cls, xmltype):
        """Return the JSON schema matching an XSD type, simple or complex.

        Raises NotImplementedError for mixed content and for xs:anyType.
        """
        assert isinstance(xmltype, xmlschema.validators.XsdType)

        if xmltype.is_simple():
            base_schema = cls.simpletype_to_jsonschema(xmltype)
            # When the type accepts the empty string, null is accepted too.
            try:
                xmltype.decode('')
            except xmlschema.XMLSchemaValidationError:
                return base_schema
            else:
                if base_schema.get('oneOf'):
                    base_schema['oneOf'].append({'type': 'null'})
                    return base_schema
                return {'oneOf': [{'type': 'null'}, base_schema]}
        elif xmltype.has_simple_content():
            base_schema = cls.type_to_jsonschema(xmltype.base_type)
            if not xmltype.attributes:
                schema = base_schema
            else:
                # Simple content with attributes: the text value is stored
                # under the '$' property, next to the attributes.
                schema = OrderedDict({'type': 'object', 'properties': OrderedDict()})
                schema['properties']['$'] = base_schema
                cls.attributegroup_to_jsonschema(xmltype.attributes, schema)
            return schema
        else:
            if xmltype.has_mixed_content() or xmltype.name == xmlschema_names.XSD_ANY_TYPE:
                raise NotImplementedError(xmltype)

            schema = OrderedDict({'type': 'object'})
            schema['additionalProperties'] = False
            if xmltype.attributes:
                cls.attributegroup_to_jsonschema(xmltype.attributes, schema)
            cls.group_to_jsonschema(xmltype.content_type, schema)
            return schema

    @classmethod
    def element_to_jsonschema(cls, element):
        """Return the JSON schema of an XSD element.

        An element which may occur more than once (``max_occurs`` is None or
        greater than 1) is described as an array of its type's schema, with
        'minItems' and, when bounded, 'maxItems'.
        """
        assert isinstance(element, xmlschema.validators.XsdElement)

        is_array = element.max_occurs is None or element.max_occurs > 1
        item_schema = cls.type_to_jsonschema(element.type)
        if is_array:
            item_schema = {
                'type': 'array',
                'items': item_schema,
                'minItems': element.min_occurs,
            }
            if element.max_occurs is not None:
                item_schema['maxItems'] = element.max_occurs
        return item_schema

    def validate(self, instance):
        """Validate *instance* against the generated JSON schema; errors are
        those raised by ``jsonschema.validate``."""
        return jsonschema.validate(instance=instance, schema=self.json_schema)

    def encode(self, instance):
        """Encode the value found under the root element key of *instance*
        with the XML schema, using TransformConverter and this object as
        transformer."""
        return self.xml_schema.elements[self.root_element].encode(
            instance[self.root_element], converter=TransformConverter, transformer=self
        )

    def decode(self, source):
        """Decode *source* with the root element of the XML schema, using
        TransformConverter and this object as transformer."""
        return self.xml_schema.elements[self.root_element].decode(
            source, converter=TransformConverter, transformer=self
        )