passerelle/passerelle/apps/opendatasoft/models.py

# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2020  Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import re
from urllib import parse as urlparse

from django.db import models
from django.shortcuts import get_object_or_404
from django.urls import reverse
from django.utils.translation import gettext_lazy as _
from requests import RequestException

from passerelle.base.models import BaseQuery, BaseResource
from passerelle.utils.api import endpoint
from passerelle.utils.jsonresponse import APIError
from passerelle.utils.templates import render_to_string, validate_template


class OpenDataSoft(BaseResource):
    """Connector resource querying the OpenDataSoft records search API.

    Searches are sent to ``api/records/1.0/search/`` relative to
    ``service_url``; stored ``Query`` objects are reachable through the
    ``queries`` related name.
    """

    service_url = models.CharField(
        _('Site URL'),
        max_length=256,
        blank=False,
        help_text=_('URL without ending "api/records/1.0/search/"'),
    )
    # optional; only sent (as the "apikey" parameter) when not empty
    api_key = models.CharField(
        _('API key'),
        max_length=128,
        blank=True,
        help_text=_('API key used as credentials'),
    )

    category = _('Data Sources')

    class Meta:
        verbose_name = _('OpenDataSoft Web Service')

    def export_json(self):
        """Return the base export extended with a 'queries' list of exported queries."""
        data = super().export_json()
        data['queries'] = [query.export_json() for query in self.queries.all()]
        return data

    @classmethod
    def import_json_real(cls, overwrite, instance, d, **kwargs):
        """Import the resource, then re-create its queries from ``d['queries']``.

        The 'queries' entry is removed from ``d`` before delegating to the
        parent import. When an existing instance is overwritten, its current
        queries are deleted before the imported ones are created.
        """
        data_queries = d.pop('queries', [])
        instance = super().import_json_real(overwrite, instance, d, **kwargs)
        queries = []
        if instance and overwrite:
            Query.objects.filter(resource=instance).delete()
        for data_query in data_queries:
            query = Query.import_json(data_query)
            query.resource = instance
            queries.append(query)
        Query.objects.bulk_create(queries)
        return instance

    def call_search(
        self, dataset=None, text_template='', filter_expression='', sort=None, limit=None, id=None, q=None
    ):
        """Run a search against the remote API and return the formatted records.

        Parameters:
            dataset: value sent as the "dataset" parameter.
            text_template: template rendered for each record to build its 'text'.
            filter_expression: query-string formatted extra parameters
                ("a=b&c=d"); they are merged last and may override the
                parameters computed here.
            sort: sort field, only sent when neither ``id`` nor ``q`` is given.
            limit: sent as the "rows" parameter when truthy.
            id: record identifier; takes precedence over ``q``.
            q: full text query; non-word characters, one-character terms and
                the and/or/not operators are removed before sending.

        Returns a list of dicts, one per record, holding the record fields
        (fields named 'id' or 'text' are renamed 'original_id' and
        'original_text'), plus 'id' (the record "recordid") and 'text'.

        Raises APIError on request failure, on an error reported in the JSON
        payload, on HTTP error status or when the payload is not a non-empty
        JSON object.
        """
        scheme, netloc, path, params, query, fragment = urlparse.urlparse(self.service_url)
        # NOTE(review): urljoin() resolves against the last "/" of the configured
        # path, so a final path segment without trailing slash is replaced by
        # the API path - confirm this is the intended behaviour.
        path = urlparse.urljoin(path, 'api/records/1.0/search/')
        url = urlparse.urlunparse((scheme, netloc, path, params, query, fragment))

        params = {'dataset': dataset}
        if id is not None:
            params['q'] = 'recordid:%s' % id
        elif q is not None:
            # remove query language operators
            operators = ('and', 'or', 'not')
            terms = [
                term for term in re.split(r'[^\w]', q) if len(term) > 1 and term.lower() not in operators
            ]
            params['q'] = ' '.join(terms)
        elif sort:
            params['sort'] = sort
        if self.api_key:
            params['apikey'] = self.api_key
        if limit:
            params['rows'] = limit
        params.update(urlparse.parse_qs(filter_expression))

        try:
            response = self.requests.get(url, params=params)
        except RequestException as e:
            raise APIError('OpenDataSoft error: %s' % e) from e
        try:
            json_response = response.json()
        except ValueError:
            json_response = None
        if not isinstance(json_response, dict):
            # valid JSON which is not an object (list, string, number...) cannot
            # be a search result: handle it like an undecodable payload
            json_response = None
        # an error reported by the service takes precedence over the HTTP status
        if json_response and json_response.get('error'):
            raise APIError(json_response.get('error'))
        try:
            response.raise_for_status()
        except RequestException as e:
            raise APIError('OpenDataSoft error: %s' % e) from e
        if not json_response:
            raise APIError('OpenDataSoft error: bad JSON response')

        result = []
        # tolerate a missing/null "records" list or "fields" mapping
        for record in json_response.get('records') or []:
            data = {}
            for key, value in (record.get('fields') or {}).items():
                # keep 'id' and 'text' available for the values computed below
                if key in ('id', 'text'):
                    key = 'original_%s' % key
                data[key] = value
            data['id'] = record.get('recordid')
            data['text'] = render_to_string(text_template, data).strip()
            result.append(data)

        return result

    @endpoint(
        description=_('Search'),
        parameters={
            'dataset': {'description': _('Dataset')},
            'text_template': {'description': _('Text template')},
            'sort': {'description': _('Sort field')},
            'limit': {'description': _('Maximum items')},
            'id': {'description': _('Record identifier')},
            'q': {'description': _('Full text query')},
        },
    )
    def search(
        self, request, dataset=None, text_template='', sort=None, limit=None, id=None, q=None, **kwargs
    ):
        """Generic search endpoint: no filter expression, extra kwargs are ignored."""
        result = self.call_search(dataset, text_template, '', sort, limit, id, q)
        return {'data': result}

    @endpoint(
        name='q',
        description=_('Query'),
        pattern=r'^(?P<query_slug>[\w:_-]+)/$',
        show=False,
    )
    def q(self, request, query_slug, **kwargs):
        """Run the stored query of this resource identified by ``query_slug``.

        The lookup goes through get_object_or_404, so an unknown slug ends in
        a 404 response.
        """
        query = get_object_or_404(Query, resource=self, slug=query_slug)
        result = query.q(request, **kwargs)
        return {'data': result}

    def create_query_url(self):
        """Return the URL of the 'opendatasoft-query-new' view for this resource."""
        return reverse('opendatasoft-query-new', kwargs={'slug': self.slug})


class Query(BaseQuery):
    """Stored search on an OpenDataSoft resource, exposed under its ``q`` endpoint."""

    resource = models.ForeignKey(
        to=OpenDataSoft, related_name='queries', verbose_name=_('Resource'), on_delete=models.CASCADE
    )
    dataset = models.CharField(
        verbose_name=_('Dataset'),
        max_length=128,
        blank=False,
        help_text=_('dataset to query'),
    )
    text_template = models.TextField(
        verbose_name=_('Text template'),
        help_text=_("Use Django's template syntax. Attributes can be accessed through {{ attributes.name }}"),
        validators=[validate_template],
        blank=True,
    )
    # one expression per line; lines are joined with "&" when the query runs
    filter_expression = models.TextField(
        verbose_name=_('filter'),
        help_text=_('Specify refine and exclude facet expressions separated lines'),
        blank=True,
    )
    sort = models.CharField(
        verbose_name=_('Sort field'),
        help_text=_(
            'Sorts results by the specified field. A minus sign - may be used to perform an ascending sort.'
        ),
        max_length=256,
        blank=True,
    )
    limit = models.PositiveIntegerField(
        default=10,
        verbose_name='Limit',
        help_text=_('Number of results to return in a single call'),
    )

    delete_view = 'opendatasoft-query-delete'
    edit_view = 'opendatasoft-query-edit'

    def q(self, request, **kwargs):
        """Run this query through the resource and return the formatted records.

        Only 'id' and 'q' are read from ``kwargs``; dataset, template, filter,
        sort and limit come from the stored query.
        """
        # strip every line of the filter and drop the ones left empty
        stripped_lines = map(str.strip, str(self.filter_expression).splitlines())
        search_arguments = {
            'dataset': self.dataset,
            'text_template': self.text_template,
            'filter_expression': '&'.join(filter(None, stripped_lines)),
            'sort': self.sort,
            'limit': self.limit,
            'id': kwargs.get('id'),
            'q': kwargs.get('q'),
        }
        return self.resource.call_search(**search_arguments)

    def as_endpoint(self):
        """Build the endpoint description of this query from the generic search one."""
        query_endpoint = super().as_endpoint(path=self.resource.q.endpoint_info.name)

        search_info = self.resource.search.endpoint_info
        query_endpoint.func = search_info.func
        query_endpoint.show_undocumented_params = False

        # Generic parameter descriptions are copied from the search endpoint,
        # except for the ones the query overloads with a value of its own
        overloadable = ('dataset', 'text_template')
        for name in search_info.parameters:
            is_overloaded = name in overloadable and getattr(self, name)
            if not is_overloaded:
                query_endpoint.parameters[name] = search_info.parameters[name]
        return query_endpoint