# wcs/wcs/data_sources.py
#
# 859 lines
# 30 KiB
# Python

# w.c.s. - web application for online forms
# Copyright (C) 2005-2012 Entr'ouvert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
import collections
import hashlib
import xml.etree.ElementTree as ET
from django.template import TemplateSyntaxError, VariableDoesNotExist
from django.utils import six
from django.utils.encoding import force_text, force_bytes
from django.utils.six.moves.urllib import parse as urllib
from django.utils.six.moves.urllib import parse as urlparse
from quixote import get_publisher, get_request, get_session
from quixote.html import TemplateIO
from .qommon import _, N_, force_str
from .qommon import misc
from .qommon import get_logger
from .qommon.cron import CronJob
from .qommon.form import *
from .qommon.humantime import seconds2humanduration
from .qommon.misc import get_variadic_url
from .qommon.publisher import get_publisher_class
from .qommon.afterjobs import AfterJob
from .qommon.storage import StorableObject
from .qommon.template import Template
from .qommon.xml_storage import XmlStorableObject
from .api_utils import sign_url_auto_orig
# Registry of callables made available to Python-expression data sources,
# keyed by the name under which they are exposed.
data_source_functions = {}


def register_data_source_function(function, function_name=None):
    """Register *function* in ``data_source_functions``.

    The function is stored under *function_name*, or under its ``__name__``
    when no (or an empty) name is given.  A name that is already registered
    is left untouched: the first registration wins.
    """
    key = function_name or function.__name__
    data_source_functions.setdefault(key, function)
class DataSourceSelectionWidget(CompositeWidget):
    """Composite widget used to pick a data source.

    It is made of a ``type`` select (cards, named data sources, generic
    JSON/JSONP/GeoJSON/Python sources) and a ``value`` string input whose
    display is tied to the generic types through ``data-dynamic-display-*``
    attributes.  The parsed value is a dict with ``type`` and ``value`` keys
    (only the non-empty ones), or None when no type is selected.
    """

    def __init__(
        self, name, value=None, allow_jsonp=True, allow_geojson=False, allow_named_sources=True, **kwargs
    ):
        CompositeWidget.__init__(self, name, value, **kwargs)

        current = value or {}

        def by_label(option):
            # options are sorted on their simplified label (second element)
            return misc.simplify(option[1])

        options = [(None, _('None'), None)]

        if allow_named_sources:
            from wcs.carddef import CardDef

            card_choices = sorted(
                [(card[2], card[1], card[2]) for card in CardDef.get_carddefs_as_data_source()],
                key=by_label,
            )
            if card_choices:
                options.append(OptGroup(_('Cards')))
                options.extend(card_choices)

            # named data sources are split in two groups: those flagged as
            # coming from agendas, and all the others.
            agenda_choices = []
            manual_choices = []
            for source in NamedDataSource.select():
                choice = (
                    source.slug,
                    source.name,
                    source.slug,
                    {
                        'data-type': source.type,
                        'data-maybe-datetimes': 'true' if source.maybe_datetimes() else 'false',
                    },
                )
                if source.external == 'agenda':
                    agenda_choices.append(choice)
                else:
                    manual_choices.append(choice)

            agenda_choices = sorted(agenda_choices, key=by_label)
            if agenda_choices:
                options.append(OptGroup(_('Agendas')))
                options.extend(agenda_choices)

            manual_choices = sorted(manual_choices, key=by_label)
            if manual_choices:
                options.append(OptGroup(_('Manually Configured Data Sources')))
                options.extend(manual_choices)

        if len(options) > 1:
            # only add a group heading when other groups/options precede it
            options.append(OptGroup(_('Generic Data Sources')))
        options.append(('json', _('JSON URL'), 'json', {'data-maybe-datetimes': 'true'}))
        if allow_jsonp:
            options.append(('jsonp', _('JSONP URL'), 'jsonp'))
        if allow_geojson:
            options.append(('geojson', _('GeoJSON URL'), 'geojson'))
        options.append(('formula', _('Python Expression'), 'python'))

        self.add(
            SingleSelectWidget,
            'type',
            options=options,
            value=current.get('type'),
            attrs={'data-dynamic-display-parent': 'true'},
        )

        # parse once the type widget exists, before the value widget is added
        self.parse()
        if not self.value:
            self.value = {}

        self.add(
            StringWidget,
            'value',
            value=current.get('value'),
            size=80,
            attrs={
                'data-dynamic-display-child-of': 'data_source$type',
                'data-dynamic-display-value-in': 'json|jsonp|geojson|python',
            },
        )

        self._parsed = False

    def _parse(self, request):
        """Set ``self.value`` from the ``type`` and ``value`` sub-widgets."""
        collected = {}
        for key in ('type', 'value'):
            widget_value = self.get(key)
            if widget_value:
                collected[key] = widget_value
        if collected.get('type', '') in ('none', ''):
            # no type selected: whatever was typed as value is discarded
            collected = None
        self.value = collected or None

    def render_content(self):
        """Return the concatenated rendering of all sub-widgets."""
        output = TemplateIO(html=True)
        for sub_widget in self.get_widgets():
            output += sub_widget.render_content()
        return output.getvalue()
def get_items(data_source, include_disabled=False, mode=None):
    """Return the items of *data_source* as a list of tuples.

    Each tuple is ``(id, text, key, item)`` where the first three elements
    are converted to strings, ``key`` falls back to the item id, and ``item``
    is the structured item itself.  Items with a truthy ``disabled`` entry
    are left out unless *include_disabled* is set.
    """
    return [
        (str(entry['id']), str(entry['text']), str(entry.get('key', entry['id'])), entry)
        for entry in get_structured_items(data_source, mode=mode)
        if not entry.get('disabled') or include_disabled
    ]
def get_json_from_url(url, data_source=None, log_message_part='JSON data source'):
    """Fetch *url* and return the decoded JSON document, or None on error.

    The URL is passed through ``sign_url_auto_orig`` before being opened.
    The response must be a JSON object, with an ``err`` member that is
    absent, null, ``0`` or ``"0"``.  For a ``geojson`` data source it must
    have a ``features`` list; otherwise it must have a list under the
    configured ``data_attribute`` (``data`` by default).

    Connection errors and invalid content are logged as warnings, using
    *log_message_part* to name the source, and None is returned.
    """
    signed_url = sign_url_auto_orig(url)
    settings = data_source or {}
    data_key = settings.get('data_attribute') or 'data'
    is_geojson = settings.get('type') == 'geojson'
    try:
        payload = misc.json_loads(misc.urlopen(signed_url).read())
        if not isinstance(payload, dict):
            raise ValueError('not a json dict')
        if payload.get('err') not in (None, 0, "0"):
            raise ValueError('err %s' % payload['err'])
        if is_geojson and not isinstance(payload.get('features'), list):
            raise ValueError('bad geojson format')
        if not is_geojson and not isinstance(payload.get(data_key), list):
            raise ValueError('not a json dict with a %s list attribute' % data_key)
        return payload
    except misc.ConnectionError as e:
        get_logger().warning('Error loading %s (%s)' % (log_message_part, str(e)))
    except (ValueError, TypeError) as e:
        get_logger().warning('Error reading %s output (%s)' % (log_message_part, str(e)))
    return None
def request_json_items(url, data_source):
    """Load the items of a JSON data source from *url*.

    Returns None when the document could not be loaded, otherwise the list
    of usable entries found under the configured ``data_attribute``.  Entries
    that are not dicts, or whose id attribute is missing, null or an empty
    string, are skipped.  Kept entries are modified in place: ``id`` is set
    from the configured ``id_attribute`` and ``text`` from the configured
    ``text_attribute`` (or from the id, as a string, when it is absent).
    """
    payload = get_json_from_url(url, data_source)
    if payload is None:
        return None
    data_key = data_source.get('data_attribute') or 'data'
    id_key = data_source.get('id_attribute') or 'id'
    text_key = data_source.get('text_attribute') or 'text'

    kept = []
    for entry in payload.get(data_key):
        # skip malformed items
        if not isinstance(entry, dict):
            continue
        identifier = entry.get(id_key)
        if identifier is None or identifier == '':
            continue
        entry['id'] = identifier
        # checked after 'id' has been set, so text_attribute may be 'id'
        if text_key in entry:
            entry['text'] = entry[text_key]
        else:
            entry['text'] = str(identifier)
        kept.append(entry)
    return kept
def request_geojson_items(url, data_source):
    """Load the items of a GeoJSON data source from *url*.

    Returns None when the document could not be loaded, otherwise the list
    of usable features.  Kept features are modified in place: ``id`` is set
    from the configured ``id_property`` (unless the feature already carries
    a top-level ``id`` and the id property is ``id``) and ``text`` is the
    rendering of ``label_template_property`` (``{{ text }}`` by default)
    against the feature properties, falling back to the id.

    Features that are not JSON objects or that have no usable id are
    skipped.  A null (or otherwise non-object) ``properties`` member, which
    RFC 7946 allows, is handled as an empty set of properties instead of
    raising.
    """
    entries = get_json_from_url(url, data_source)
    if entries is None:
        return None
    id_property = data_source.get('id_property') or 'id'
    label_template = data_source.get('label_template_property') or '{{ text }}'
    items = []
    for item in entries.get('features'):
        # skip malformed features
        if not isinstance(item, dict):
            continue
        properties = item.get('properties')
        if not isinstance(properties, dict):
            # "properties": null is valid GeoJSON
            properties = {}
        if id_property == 'id' and 'id' in item:
            # If a Feature has a commonly used identifier, that identifier
            # SHOULD be included as a member of the Feature object with the
            # name "id", and the value of this member is either a JSON string
            # or number.
            # -- https://tools.ietf.org/html/rfc7946#section-3.2
            pass
        elif properties.get(id_property):
            item['id'] = properties[id_property]
        else:
            # missing id property, skip entry
            continue
        try:
            item['text'] = Template(label_template).render(properties)
        except (TemplateSyntaxError, VariableDoesNotExist):
            # invalid label template: fall back to the id below
            pass
        if not item.get('text'):
            item['text'] = item['id']
        items.append(item)
    return items
def get_structured_items(data_source, mode=None):
    """Return the items of *data_source* as a list of dicts.

    *data_source* is a dict with a ``type`` key and, for generic sources, a
    ``value`` key.  Supported types:

    - ``carddef:...``: items are provided by ``CardDef``;
    - ``formula``: ``value`` is a Python expression, evaluated with the
      substitution variables obtained for *mode*;
    - ``json`` / ``geojson``: ``value`` is a URL (possibly a template);
    - ``jsonp``: no items can be computed, an empty list is returned;
    - anything else is the slug of a named data source, which is loaded and
      replaced by its extended definition; its ``cache_duration`` then
      applies to the json/geojson results.

    An empty list is returned on any error (errors are logged).
    """
    cache_duration = 0
    if data_source.get('type') and data_source.get('type').startswith('carddef:'):
        # cards
        from wcs.carddef import CardDef

        return CardDef.get_data_source_items(data_source['type'])
    if data_source.get('type') not in ('json', 'jsonp', 'geojson', 'formula'):
        # named data source
        named_data_source = NamedDataSource.get_by_slug(data_source['type'])
        if named_data_source.cache_duration:
            cache_duration = int(named_data_source.cache_duration)
        data_source = named_data_source.extended_data_source

    if data_source.get('type') == 'formula':
        # the result of a python expression, it must be a list.
        # - of strings
        # - of dictionaries, in which case it has to have both a "id" and a
        #   "text" keys
        # - of lists or tuples, in which case it may have up to three elements:
        #   - three elements, (id, text, key)
        #   - two elements, (id, text)
        #   - a single element, (id,)

        # collections.Iterable is gone since Python 3.10; with the catch-all
        # handler below that would silently turn every result into [].
        from collections.abc import Iterable

        variables = get_publisher().substitutions.get_context_variables(mode=mode)
        global_eval_dict = get_publisher().get_global_eval_dict()
        global_eval_dict.update(data_source_functions)
        try:
            # NOTE(review): eval() of a configured expression; this is only
            # acceptable as long as the expression is written by trusted
            # administrators -- confirm where 'value' can come from.
            value = eval(data_source.get('value'), global_eval_dict, variables)
            if not isinstance(value, Iterable):
                get_logger().warning(
                    'Python data source (%r) gave a non-iterable result' % data_source.get('value')
                )
                return []
            if len(value) == 0:
                return []
            value = misc.json_encode_helper(value, get_publisher().site_charset)
            # the shape of the first element decides how all are converted
            first = value[0]
            if isinstance(first, (list, tuple)):
                if len(first) >= 3:
                    return [{'id': x[0], 'text': x[1], 'key': x[2]} for x in value]
                elif len(first) == 2:
                    return [{'id': x[0], 'text': x[1]} for x in value]
                elif len(first) == 1:
                    return [{'id': x[0], 'text': x[0]} for x in value]
                return value
            elif isinstance(first, six.string_types):
                return [{'id': x, 'text': x} for x in value]
            return value
        except Exception:
            # deliberately broad (the expression can fail in any way) but no
            # longer swallowing KeyboardInterrupt/SystemExit.
            get_logger().warning('Failed to eval() Python data source (%r)' % data_source.get('value'))
            return []
    elif data_source.get('type') in ['json', 'geojson']:
        # the content available at a json URL, it must answer with a dict with
        # a 'data' key holding the list of items, each of them being a dict
        # with at least both an "id" and a "text" key.
        geojson = data_source.get('type') == 'geojson'
        url = data_source.get('value')
        if not url:
            if geojson:
                get_logger().warning('Empty URL in GeoJSON data source')
            else:
                get_logger().warning('Empty URL in JSON data source')
            return []
        url = url.strip()
        if Template.is_template_string(url):
            context = get_publisher().substitutions.get_context_variables(mode='lazy')
            url = get_variadic_url(url, context)

        # first level of cache: per-request, keyed on the final URL
        request = get_request()
        if hasattr(request, 'datasources_cache') and url in request.datasources_cache:
            return request.datasources_cache[url]

        # second level of cache: django cache, only for named data sources
        # configured with a cache duration
        if cache_duration:
            cache_key = 'data-source-%s' % force_str(hashlib.md5(force_bytes(url)).hexdigest())
            from django.core.cache import cache

            items = cache.get(cache_key)
            if items is not None:
                return items

        if geojson:
            items = request_geojson_items(url, data_source)
        else:
            items = request_json_items(url, data_source)
        if items is None:
            # errors are not cached
            return []
        if hasattr(request, 'datasources_cache'):
            request.datasources_cache[url] = items

        if cache_duration:
            cache.set(cache_key, items, cache_duration)

        return items
    return []
def get_real(data_source):
    """Return the concrete data source definition behind *data_source*.

    Generic (json, jsonp, geojson, formula) and ``carddef:`` definitions are
    returned as is; any other type is looked up as a named data source slug
    and the definition stored on it is returned.  An empty or missing
    *data_source* gives None.
    """
    if not data_source:
        return None
    ds_type = data_source.get('type')
    is_generic = ds_type in ('json', 'jsonp', 'geojson', 'formula')
    is_card = bool(ds_type) and ds_type.startswith('carddef:')
    if is_generic or is_card:
        return data_source
    return NamedDataSource.get_by_slug(ds_type).data_source
def get_object(data_source, ignore_errors=True):
    """Return a ``NamedDataSource`` object for *data_source*.

    Generic and ``carddef:`` definitions are wrapped in a new, unnamed
    ``NamedDataSource``; other types are looked up by slug, *ignore_errors*
    being passed on to ``NamedDataSource.get_by_slug``.  None is returned
    when *data_source* is empty or has no type.
    """
    if not data_source:
        return None
    ds_type = data_source.get('type')
    if ds_type is None:
        return None
    if ds_type not in ('json', 'jsonp', 'geojson', 'formula') and not ds_type.startswith('carddef:'):
        return NamedDataSource.get_by_slug(ds_type, ignore_errors=ignore_errors)
    wrapper = NamedDataSource()
    wrapper.data_source = data_source
    return wrapper
class NamedDataSource(XmlStorableObject):
    """A data source definition stored under a name and a slug.

    ``data_source`` holds the base definition (a dict with ``type`` and
    ``value`` keys); the other attributes are additional settings that are
    serialized according to ``XML_NODES``.
    """

    _names = 'datasources'
    _indexes = ['slug']
    xml_root_node = 'datasource'

    name = None
    slug = None
    description = None
    data_source = None
    cache_duration = None
    query_parameter = None
    id_parameter = None
    data_attribute = None
    id_attribute = None
    text_attribute = None
    id_property = None
    label_template_property = None
    external = None
    external_status = None

    # declarations for serialization
    XML_NODES = [
        ('name', 'str'),
        ('slug', 'str'),
        ('description', 'str'),
        ('cache_duration', 'str'),
        ('query_parameter', 'str'),
        ('id_parameter', 'str'),
        ('data_attribute', 'str'),
        ('id_attribute', 'str'),
        ('text_attribute', 'str'),
        ('id_property', 'str'),
        ('label_template_property', 'str'),
        ('external', 'str'),
        ('external_status', 'str'),
        ('data_source', 'data_source'),
    ]

    def __init__(self, name=None):
        StorableObject.__init__(self)
        self.name = name

    @property
    def type(self):
        """Type of the underlying definition (``data_source['type']``)."""
        return self.data_source.get('type')

    @property
    def extended_data_source(self):
        """Base definition completed with the type-specific settings.

        For ``geojson`` and ``json`` sources a copy of ``data_source`` is
        returned with the relevant attributes of this object added; for other
        types ``data_source`` itself is returned.
        """
        if self.type == 'geojson':
            data_source = self.data_source.copy()
            data_source.update(
                {
                    'id_property': self.id_property,
                    'label_template_property': self.label_template_property,
                }
            )
            return data_source
        if self.type == 'json':
            data_source = self.data_source.copy()
            data_source.update(
                {
                    'data_attribute': self.data_attribute,
                    'id_attribute': self.id_attribute,
                    'text_attribute': self.text_attribute,
                }
            )
            return data_source
        return self.data_source

    def _get_stripped_url(self):
        """Return the configured ``value`` stripped of surrounding whitespace.

        A missing (None) value gives an empty string instead of raising.
        """
        return (self.data_source.get('value') or '').strip()

    def can_jsonp(self):
        """Tell if ``get_jsonp_url`` can give a URL for this data source."""
        if self.type == 'jsonp':
            return True
        if self.type == 'json' and self.query_parameter:
            return True
        if self.type and self.type.startswith('carddef:'):
            return True
        return False

    def maybe_datetimes(self):
        """Tell if this is a JSON source whose URL contains 'datetimes'."""
        return self.type == 'json' and 'datetimes' in (self.data_source.get('value') or '')

    def migrate(self):
        """Store the object again if it has no slug yet."""
        changed = False
        if not self.slug:
            # .store() will take care of setting the slug
            changed = True
        if changed:
            self.store()

    def store(self, comment=None):
        """Store the object, computing its slug first if needed.

        When the publisher has a snapshot class, a snapshot is recorded with
        *comment*.
        """
        assert not self.is_readonly()
        if self.slug is None:
            # set slug if it's not yet there
            self.slug = self.get_new_slug()
        super(NamedDataSource, self).store()
        if get_publisher().snapshot_class:
            get_publisher().snapshot_class.snap(instance=self, comment=comment)

    def get_new_slug(self):
        """Return a slug derived from the name, not used by another object.

        A numeric suffix (``-1``, ``-2``, ...) is added until the slug is
        free or belongs to this very object.
        """
        new_slug = misc.simplify(self.name, space='_')
        base_new_slug = new_slug
        suffix_no = 0
        while True:
            try:
                obj = self.get_on_index(new_slug, 'slug', ignore_migration=True)
            except KeyError:
                # slug is free
                break
            if obj.id == self.id:
                break
            suffix_no += 1
            new_slug = '%s-%s' % (base_new_slug, suffix_no)
        return new_slug

    def get_admin_url(self):
        """Return the backoffice URL of this data source.

        The first accessible section among settings, forms and workflows is
        used; settings is the fallback.
        """
        base_url = get_publisher().get_backoffice_url()
        for section in ('settings', 'forms', 'workflows'):
            if get_publisher().get_backoffice_root().is_accessible(section):
                return '%s/%s/data-sources/%s/' % (base_url, section, self.id)
        # fallback to settings section
        section = 'settings'
        return '%s/%s/data-sources/%s/' % (base_url, section, self.id)

    def export_data_source_to_xml(self, element, attribute_name, charset):
        """Add ``type`` and ``value`` sub-elements to *element*."""
        data_source = getattr(self, attribute_name)
        ET.SubElement(element, 'type').text = data_source.get('type')
        ET.SubElement(element, 'value').text = force_text(data_source.get('value') or '', charset)

    def import_data_source_from_xml(self, element, charset):
        """Build a definition dict from the sub-elements of *element*."""
        return {
            'type': force_str(element.find('type').text),
            'value': force_str(element.find('value').text or ''),
        }

    @classmethod
    def get_by_slug(cls, slug, ignore_errors=True):
        """Return the data source with the given *slug*.

        When there is none, ``KeyError`` is raised if *ignore_errors* is
        false; otherwise a warning is logged and a ``StubNamedDataSource`` is
        returned.
        """
        objects = [x for x in cls.select() if x.slug == slug]
        if objects:
            return objects[0]
        if not ignore_errors:
            raise KeyError(slug)
        get_logger().warning("data source '%s' does not exist" % slug)
        return StubNamedDataSource(name=slug)

    def get_json_query_url(self):
        """Return the source URL ending with ``<query_parameter>=``.

        An empty string is returned when there is no URL.
        """
        url = self._get_stripped_url()
        if Template.is_template_string(url):
            context = get_publisher().substitutions.get_context_variables(mode='lazy')
            url = get_variadic_url(url, context)
        if not url:
            return ''
        separator = '&' if '?' in url else '?'
        return url + separator + self.query_parameter + '='

    def get_jsonp_url(self):
        """Return the URL to use for autocompletion, or None.

        For ``jsonp`` sources this is the configured URL; for ``json``
        sources with a query parameter and for ``carddef:`` sources it is an
        ``/api/autocomplete/<token>`` URL, the token being obtained from the
        session.
        """
        if self.type == 'jsonp':
            return self.data_source.get('value')
        if self.type == 'json' and self.query_parameter:
            json_url = self.get_json_query_url()
            info = None
            if json_url:
                info = {'url': json_url}
            return '/api/autocomplete/%s' % (get_session().get_data_source_query_info_token(info))
        if self.type and self.type.startswith('carddef:'):
            parts = self.type.split(':')
            if len(parts) > 2:
                # custom view, check if it's dynamic
                from wcs.carddef import CardDef
                from wcs.workflows import WorkflowStatusItem

                custom_view = CardDef.get_data_source_custom_view(self.type)
                had_template = False
                for filter_key, filter_value in custom_view.filters.items():
                    if not Template.is_template_string(filter_value):
                        continue
                    # existing key is replaced, the dict size doesn't change
                    custom_view.filters[filter_key] = WorkflowStatusItem.compute(filter_value)
                    had_template = True
                if had_template:
                    # keep altered custom view in session
                    return '/api/autocomplete/%s' % (
                        get_session().get_data_source_query_info_token(
                            {'carddef_ref': self.type, 'dynamic_custom_view': custom_view}
                        )
                    )
            return '/api/autocomplete/%s' % (
                get_session().get_data_source_query_info_token(
                    {
                        'carddef_ref': self.type,
                    }
                )
            )
        return None

    def get_geojson_url(self):
        """Return the local API URL serving this GeoJSON data source."""
        assert self.type == 'geojson'
        return '/api/geojson/%s' % self.slug

    def get_geojson_data(self):
        """Return the GeoJSON document of this source, or None on error.

        Every feature gets ``_id`` and ``_text`` properties, computed from
        ``id_property`` (``id`` by default) and ``label_template_property``
        (``{{ text }}`` by default, falling back to the id).  Results are
        kept in the per-request cache and, when a cache duration is set, in
        the django cache.

        None is returned when the document could not be loaded or is not
        valid (a warning is logged by ``get_json_from_url``); such a result
        is not cached.
        """
        url = self._get_stripped_url()
        if Template.is_template_string(url):
            context = get_publisher().substitutions.get_context_variables(mode='lazy')
            url = get_variadic_url(url, context)

        # NOTE(review): get_structured_items() stores item *lists* in this
        # same per-request cache under the same URL key; if both are used on
        # the same URL within a request, the cached value may not be the
        # expected document -- to be confirmed.
        request = get_request()
        if hasattr(request, 'datasources_cache') and url in request.datasources_cache:
            return request.datasources_cache[url]

        cache_duration = 0
        if self.cache_duration:
            cache_duration = int(self.cache_duration)

        if cache_duration:
            cache_key = 'geojson-data-source-%s' % force_str(hashlib.md5(force_bytes(url)).hexdigest())
            from django.core.cache import cache

            data = cache.get(cache_key)
            if data is not None:
                return data

        data = get_json_from_url(url, self.data_source)
        if data is None:
            # loading failed; do not try to iterate over missing features
            return None
        id_property = self.id_property or 'id'
        label_template_property = self.label_template_property or '{{ text }}'

        for feature in data['features']:
            feature['properties']['_id'] = feature['properties'][id_property]
            try:
                feature['properties']['_text'] = Template(label_template_property).render(
                    feature['properties']
                )
            except (TemplateSyntaxError, VariableDoesNotExist):
                # invalid label template: fall back to the id below
                pass
            if not feature['properties'].get('_text'):
                feature['properties']['_text'] = feature['properties']['_id']

        if hasattr(request, 'datasources_cache'):
            request.datasources_cache[url] = data

        if cache_duration:
            cache.set(cache_key, data, cache_duration)

        return data

    def get_value_by_id(self, param_name, param_value):
        """Query the JSON source for a single item and return it, or None.

        ``param_name=param_value`` is added to the source URL and the item
        whose *param_name* entry matches *param_value* (compared as strings)
        is looked for in the result.  An error is recorded on the publisher
        when the result has items but none matches.
        """
        url = self._get_stripped_url()
        if Template.is_template_string(url):
            context = get_publisher().substitutions.get_context_variables(mode='lazy')
            url = get_variadic_url(url, context)

        if '?' not in url:
            url += '?'
        else:
            url += '&'
        url += param_name + '=' + urllib.quote(param_value)

        def find_item(items, name, value):
            for item in items:
                if str(item.get(name)) == str(value):
                    return item
            # not found
            get_publisher().record_error(_('Could not find element by id "%s"') % value)
            return None

        request = get_request()
        if hasattr(request, 'datasources_cache') and url in request.datasources_cache:
            items = request.datasources_cache[url]
            if not items:  # cache may contains empty list from get_structured_items
                return None
            return find_item(items, param_name, param_value)

        items = request_json_items(url, self.data_source)
        if not items:  # None or empty list are not valid
            return None
        if hasattr(request, 'datasources_cache'):
            request.datasources_cache[url] = items
        return find_item(items, param_name, param_value)

    def get_card_structured_value_by_id(self, option_id):
        """Return the card item matching *option_id*, or None.

        The lookup is done by id when *option_id* can be converted to an
        integer, then by text if that gave nothing.
        """
        from wcs.carddef import CardDef

        values = []
        try:
            int(option_id)
        except ValueError:
            pass
        else:
            values = CardDef.get_data_source_items(self.type, get_by_id=option_id)
        if not values:
            values = CardDef.get_data_source_items(self.type, get_by_text=option_id)
        if not values:
            return None
        return values[0]

    def get_display_value(self, option_id):
        """Return the ``text`` of the item matching *option_id*, or None."""
        value = self.get_structured_value(option_id)
        if value:
            return value.get('text')
        return None

    def get_structured_value(self, option_id):
        """Return the item (a dict) matching *option_id*, or None."""
        value = None
        if self.type and self.type.startswith('carddef:'):
            value = self.get_card_structured_value_by_id(option_id)
        elif self.type == 'json' and self.id_parameter:
            value = self.get_value_by_id(self.id_parameter, option_id)
        else:
            structured_items = get_structured_items(self.extended_data_source, mode='lazy')
            for item in structured_items:
                if str(item['id']) == str(option_id):
                    value = item
                    break
            else:
                # recheck in case option label was given instead of option id.
                for item in structured_items:
                    if str(item['text']) == str(option_id):
                        value = item
                        break
        return value

    @classmethod
    def get_substitution_variables(cls):
        """Return the substitution variables contributed by data sources."""
        return {'data_source': DataSourcesSubstitutionProxy()}

    def type_label(self):
        """Return the label of the source type, None for unlisted types."""
        data_source_labels = {
            'json': _('JSON'),
            'jsonp': _('JSONP'),
            'geojson': _('GeoJSON'),
            'formula': _('Python Expression'),
        }
        data_source_type = self.data_source.get('type')
        return data_source_labels.get(data_source_type)

    def humanized_cache_duration(self):
        """Return the cache duration as a human-readable duration."""
        return seconds2humanduration(int(self.cache_duration))

    def get_referenced_varnames(self, formdef):
        """Return the variable names of *formdef* this source refers to.

        Only ``json`` and ``carddef:`` sources are supported.
        """
        from .fields import Field

        if self.type == 'json':
            return Field.get_referenced_varnames(formdef, self.data_source.get('value'))
        # else: carddef
        assert self.type.startswith('carddef:'), 'data source must be carddef'
        from wcs.carddef import CardDef

        return CardDef.get_data_source_referenced_varnames(self.type, formdef=formdef)

    def get_variadic_url(self):
        """Return the source URL, with template variables substituted."""
        url = self._get_stripped_url()
        if url and Template.is_template_string(url):
            context = get_publisher().substitutions.get_context_variables(mode='lazy')
            # this is the module-level helper, not this method
            url = get_variadic_url(url, context)
        return url

    def is_used(self):
        """Tell if any formdef, of any kind, uses this data source."""
        from wcs.formdef import get_formdefs_of_all_kinds

        return any(self.is_used_in_formdef(formdef) for formdef in get_formdefs_of_all_kinds())

    def is_used_in_formdef(self, formdef):
        """Tell if a field of *formdef* has this source as data source."""
        for field in formdef.fields or []:
            data_source = getattr(field, 'data_source', None)
            if not data_source:
                continue
            if data_source.get('type') == self.slug:
                return True
        return False
class StubNamedDataSource(NamedDataSource):
    """Placeholder returned for a named data source that does not exist.

    It behaves as a Python-expression data source without any item and is
    never stored.
    """

    type = 'formula'
    # the value of a formula data source is an expression (a string) that
    # gets evaluated; '[]' evaluates to an empty list of items.
    data_source = {'type': 'formula', 'value': '[]'}
    cache_duration = None

    def __init__(self, name=None):
        # the parent initialisation is deliberately not called
        self.name = name

    def store(self, comment=None):
        # same signature as NamedDataSource.store(); a stub is never saved
        pass

    def __repr__(self):
        return '<StubNamedDataSource %r>' % self.name
class DataSourcesSubstitutionProxy(object):
    """Give access to the items of named data sources by attribute name.

    ``proxy.some_slug`` is the list of structured items of the named data
    source whose slug is ``some_slug``.
    """

    def __getattr__(self, attr):
        if attr.startswith('__') and attr.endswith('__'):
            # special names probed with getattr() (copy, pickle, ...) are
            # never data source slugs; answering with a list of items would
            # mislead the caller.
            raise AttributeError(attr)
        # use the extended definition so the data/id/text attribute settings
        # of the named data source are honoured, as get_structured_value does
        return get_structured_items(NamedDataSource.get_by_slug(attr).extended_data_source)

    def inspect_keys(self):
        return []
def has_chrono(publisher):
    """Tell if the ``chrono_url`` site option is set on *publisher*."""
    configured_url = publisher.get_site_option('chrono_url')
    return configured_url is not None
def chrono_url(publisher, url):
    """Return *url* joined to the ``chrono_url`` site option."""
    base_url = publisher.get_site_option('chrono_url')
    return urlparse.urljoin(base_url, url)
def collect_agenda_data(publisher):
    """Return the data sources to create for the agendas listed by chrono.

    The result is a list of dicts with ``text`` and ``url`` keys: one entry
    per agenda of kind ``events``; for agendas of kind ``meetings`` or
    ``virtual``, one entry for the slot types plus one per meeting type.
    None is returned as soon as one of the requests fails.
    """
    listing = get_json_from_url(chrono_url(publisher, 'api/agenda/'), log_message_part='agenda')
    if listing is None:
        return

    # build datasources from chrono
    collected = []
    for agenda in listing.get('data') or []:
        kind = agenda['kind']
        if kind == 'events':
            collected.append({'text': agenda['text'], 'url': agenda['api']['datetimes_url']})
            continue
        if kind not in ['meetings', 'virtual']:
            continue
        collected.append(
            {'text': _('%s - Slot types') % agenda['text'], 'url': agenda['api']['meetings_url']}
        )
        # get also meeting types
        meeting_types_url = chrono_url(publisher, 'api/agenda/%s/meetings/' % agenda['id'])
        meeting_types = get_json_from_url(meeting_types_url, log_message_part='agenda')
        if meeting_types is None:
            return
        collected.extend(
            {
                'text': _('%s - Slots of type %s') % (agenda['text'], meetingtype['text']),
                'url': meetingtype['api']['datetimes_url'],
            }
            for meetingtype in meeting_types.get('data') or []
        )
    return collected
def build_agenda_datasources(publisher):
    """Synchronise the agenda data sources with what chrono advertises.

    Nothing is done when chrono is not configured or when the agenda data
    could not be collected.  Otherwise a data source is created or updated
    for every collected URL; agenda data sources whose URL was not collected
    are removed, or marked with the ``not-found`` external status when they
    are still in use.
    """
    if not has_chrono(publisher):
        return

    agenda_data = collect_agenda_data(publisher)
    if agenda_data is None:
        return

    # fetch existing datasources, indexed by URL
    known_sources = {
        source.data_source['value']: source
        for source in NamedDataSource.select()
        if source.external == 'agenda'
    }
    seen_urls = set()

    # build datasources from chrono
    for agenda in agenda_data:
        url = agenda['url']
        source = known_sources.get(url)
        if source is None:
            source = NamedDataSource()
            source.external = 'agenda'
            source.data_source = {'type': 'json', 'value': url}
        source.external_status = None  # reset
        source.name = agenda['text']
        source.store()
        # maintain caches
        known_sources[url] = source
        seen_urls.add(url)

    # now check outdated agenda datasources
    for url, source in known_sources.items():
        if url in seen_urls:
            continue
        if source.is_used():
            source.external_status = 'not-found'
            source.store()
        else:
            source.remove_self()
class RefreshAgendas(AfterJob):
    """After-job rebuilding the agenda data sources."""

    label = N_('Refreshing agendas')

    def execute(self):
        publisher = get_publisher()
        build_agenda_datasources(publisher)
# Module-level registration: performed when the module is loaded, and only
# if get_publisher_class() returns a truthy value at that point.
if get_publisher_class():
    # every hour: check for agenda datasources
    get_publisher_class().register_cronjob(
        CronJob(build_agenda_datasources, name='build_agenda_datasources', minutes=[0])
    )