general: redo full text search using querysets (#33632)

2020-01-20 16:31:56 +01:00 · 2020-01-20 16:31:56 +01:00 · 7698d8a398
parent 06417b1ff9
commit 7698d8a398
21 changed files with 355 additions and 307 deletions
--- a/combo/apps/search/init.py
+++ b/combo/apps/search/init.py
@ -15,6 +15,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

 import django.apps
+from django.core.urlresolvers import reverse
 from django.utils.translation import ugettext_lazy as _

 from .engines import engines
@ -28,4 +29,22 @@ class AppConfig(django.apps.AppConfig):
        from . import urls
        return urls.urlpatterns

+    def hourly(self):
+        """Rebuild the site search index.
+
+        NOTE(review): presumably invoked by the hourly cron command, going by
+        the method name -- confirm against the cron dispatcher.
+        """
+        # local import; presumably to avoid loading models at app import time
+        from .utils import index_site
+        index_site()
+
+    def ready(self):
+        """Register the built-in search engines with the engines registry."""
+        # register built-in search engine for page contents; the bound method
+        # itself is registered, not the dictionary it returns.
+        engines.register(self.get_search_engines)
+
+    def get_search_engines(self):
+        """Return the built-in search engine definitions, keyed by slug.
+
+        The single '_text' engine is described by a 'function' entry
+        (search_site) and a translatable label.
+        """
+        from .utils import search_site
+        return {
+            '_text': {
+                'function': search_site,
+                'label': _('Page Contents'),
+            }
+        }
+
+
 default_app_config = 'combo.apps.search.AppConfig'
--- a/combo/apps/search/management/init.py
+++ b/combo/apps/search/management/init.py
--- a/combo/apps/search/management/commands/init.py
+++ b/combo/apps/search/management/commands/init.py
--- a/combo/apps/search/management/commands/update_index.py
+++ b/combo/apps/search/management/commands/update_index.py
@ -1,78 +0,0 @@
-# combo - content management system
-# Copyright (C) 2017  Entr'ouvert
-#
-# This program is free software: you can redistribute it and/or modify it
-# under the terms of the GNU Affero General Public License as published
-# by the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-from django.utils.timezone import now
-
-from haystack.management.commands.update_index import Command as UpdateIndexCommand
-
-from combo.data.models import Page, ExternalLinkSearchItem
-from combo.apps.search.models import SearchCell
-
-
-class Command(UpdateIndexCommand):
-
-    def add_arguments(self, parser):
-        super(Command, self).add_arguments(parser)
-        parser.add_argument(
-            '--skip-external-links-collection', action='store_true', default=False,
-            dest='skip_external_links_collection')
-
-    def handle(self, **options):
-        if not any(SearchCell.get_cells_by_search_service('_text')):
-            # do not index site if there's no matching search cell
-            return
-        if not options.get('skip_external_links_collection', False):
-            self.collect_external_links(options)
-        return super(Command, self).handle(**options)
-
-    def collect_external_links(self, options):
-        start_time = now()
-
-        if options.get('remove'):
-            ExternalLinkSearchItem.objects.all().delete()
-
-        # assemble external links data
-        links = {}
-        for page in Page.objects.filter(sub_slug=''):
-            if not page.is_visible(user=None):
-                continue
-            for cell in page.get_cells():
-                if not cell.is_visible(user=None):
-                    continue
-                for link_data in cell.get_external_links_data():
-                    if not link_data['url'] in links:
-                        # create an entry for that link.
-                        links[link_data['url']] = {}
-                        links[link_data['url']]['title'] = link_data['title']
-                        links[link_data['url']]['all_texts'] = []
-                    else:
-                        # if that link already exists, just keep the title as
-                        # text.
-                        links[link_data['url']]['all_texts'].append(link_data['title'])
-                    # additional texts will be assembled and indexed
-                    links[link_data['url']]['all_texts'].append(link_data.get('text') or '')
-
-        # save data as ExternalLinkSearchItem objects
-        for link_url, link_data in links.items():
-            link_object, created = ExternalLinkSearchItem.objects.get_or_create(
-                    url=link_url,
-                    defaults={'title': link_data['title']})
-            link_object.title = link_data['title']
-            link_object.text = '\n'.join(link_data['all_texts'])
-            link_object.save()
-
-        # remove obsolete objects
-        ExternalLinkSearchItem.objects.filter(last_update_timestamp__lt=start_time).delete()
--- a/combo/apps/search/migrations/0006_indexedcell.py
+++ b/combo/apps/search/migrations/0006_indexedcell.py
@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11.17 on 2020-01-20 15:30
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    """Create the IndexedCell model used to store the site search index."""
+
+    dependencies = [
+        ('data', '0043_delete_externallinksearchitem'),
+        ('auth', '0008_alter_user_username_max_length'),
+        ('contenttypes', '0002_remove_content_type_name'),
+        ('search', '0005_searchcell_autofocus'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='IndexedCell',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('cell_pk', models.PositiveIntegerField(null=True)),
+                ('url', models.CharField(blank=True, max_length=500, null=True)),
+                ('title', models.CharField(blank=True, max_length=500, null=True)),
+                ('indexed_text', models.TextField(blank=True, null=True)),
+                ('public_access', models.BooleanField(default=False)),
+                ('last_update_timestamp', models.DateTimeField(auto_now=True)),
+                ('cell_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='contenttypes.ContentType')),
+                ('excluded_groups', models.ManyToManyField(blank=True, related_name='_indexedcell_excluded_groups_+', to='auth.Group')),
+                ('page', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='data.Page')),
+                ('restricted_groups', models.ManyToManyField(blank=True, related_name='_indexedcell_restricted_groups_+', to='auth.Group')),
+            ],
+        ),
+    ]
--- a/combo/apps/search/models.py
+++ b/combo/apps/search/models.py
@ -16,21 +16,21 @@

 import os

-from django.conf import settings
+from django.contrib.auth.models import Group
+from django.contrib.contenttypes import fields
+from django.contrib.contenttypes.models import ContentType
 from django.db import models
 from django.utils.translation import ugettext_lazy as _
 from django import template
 from django.http import HttpResponse
 from django.core.exceptions import PermissionDenied
-from django.core.urlresolvers import reverse
 from django.utils.http import quote
 from django.template import RequestContext, Template

 from jsonfield import JSONField
-from haystack import connections

 from combo.utils import requests
-from combo.data.models import CellBase
+from combo.data.models import CellBase, Page
 from combo.data.library import register_cell_class
 from combo.utils import get_templated_url

@ -69,7 +69,7 @@ class SearchCell(CellBase):
        services = []
        for service_slug in self._search_services.get('data') or []:
            service = engines.get(service_slug)
-            if service and service.get('url'):
+            if service and (service.get('url') or service.get('function')):
                service['slug'] = service_slug
                services.append(service)
        return services
@ -141,30 +141,33 @@ class SearchCell(CellBase):
        if not query:
            return render_response(service)

-        url = get_templated_url(service['url'],
-                context={'request': request, 'q': query, 'search_service': service})
-        url = url % {'q': quote(query.encode('utf-8'))}  # if url contains %(q)s
-        if url.startswith('/'):
-            url = request.build_absolute_uri(url)
+        if service.get('function'):  # internal search engine
+            results = {'data': service['function'](request, query)}
+        else:
+            url = get_templated_url(service['url'],
+                    context={'request': request, 'q': query, 'search_service': service})
+            url = url % {'q': quote(query.encode('utf-8'))}  # if url contains %(q)s
+            if url.startswith('/'):
+                url = request.build_absolute_uri(url)

-        if not url:
-            return render_response(service)
+            if not url:
+                return render_response(service)

-        kwargs = {}
-        kwargs['cache_duration'] = service.get('cache_duration', 0)
-        kwargs['remote_service'] = 'auto' if service.get('signature') else None
-        # don't automatically add user info to query string, if required it can
-        # be set explicitely in the URL template in the engine definition (via
-        # {{user_nameid}} or {{user_email}}).
-        kwargs['without_user'] = True
-        # don't send error traces on HTTP errors
-        kwargs['log_errors'] = 'warn'
+            kwargs = {}
+            kwargs['cache_duration'] = service.get('cache_duration', 0)
+            kwargs['remote_service'] = 'auto' if service.get('signature') else None
+            # don't automatically add user info to query string, if required it can
+            # be set explicitly in the URL template in the engine definition (via
+            # {{user_nameid}} or {{user_email}}).
+            kwargs['without_user'] = True
+            # don't send error traces on HTTP errors
+            kwargs['log_errors'] = 'warn'

-        response = requests.get(url, **kwargs)
-        try:
-            results = response.json()
-        except ValueError:
-            return render_response(service)
+            response = requests.get(url, **kwargs)
+            try:
+                results = response.json()
+            except ValueError:
+                return render_response(service)

        if service.get('data_key'):
            results['data'] = results.get(service['data_key']) or []
@ -179,10 +182,25 @@ class SearchCell(CellBase):
            for hit in results.get('data') or []:
                for k, v in hit_templates.items():
                    hit[k] = v.render(RequestContext(request, hit))
+
        return render_response(service, results)

    def has_text_search_service(self):
        return '_text' in self._search_services.get('data', [])

    def missing_index(self):
-        return not os.path.exists(connections['default'].get_backend().path)
+        """Tell whether the search index is still empty (nothing indexed yet)."""
+        # exists() only checks for a first row instead of counting them all
+        return not IndexedCell.objects.exists()
+
+
+class IndexedCell(models.Model):
+    """Search index entry, built from a cell or from an external link of a cell."""
+    # generic reference (content type + primary key) to the source cell
+    cell_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
+    cell_pk = models.PositiveIntegerField(null=True)
+    cell = fields.GenericForeignKey('cell_type', 'cell_pk')
+    # page of the cell; optional, as the column is nullable
+    page = models.ForeignKey(Page, on_delete=models.CASCADE, blank=True, null=True)
+    # target and label of the search hit
+    url = models.CharField(max_length=500, blank=True, null=True)
+    title = models.CharField(max_length=500, blank=True, null=True)
+    # text the search is run against, along with the title
+    indexed_text = models.TextField(blank=True, null=True)
+    # access control data: public flag, groups allowed to see the entry and
+    # groups for which it is hidden
+    public_access = models.BooleanField(default=False)
+    restricted_groups = models.ManyToManyField(Group, blank=True, related_name='+')
+    excluded_groups = models.ManyToManyField(Group, blank=True, related_name='+')
+    # automatically set on every save (auto_now)
+    last_update_timestamp = models.DateTimeField(auto_now=True)
--- a/combo/apps/search/utils.py
+++ b/combo/apps/search/utils.py
@ -0,0 +1,111 @@
+# combo - content management system
+# Copyright (C) 2014-2020  Entr'ouvert
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from django.conf import settings
+from django.contrib.contenttypes.models import ContentType
+from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
+from combo.data.models import CellBase
+from django.db import connection
+from django.db.models import Q
+from django.db.transaction import atomic
+
+from .models import IndexedCell
+
+
+def set_cell_access(indexed_cell, cell):
+    """Copy the access rules of a cell (and of its page) to its index entry.
+
+    The entry is public when both the page and the cell are public.
+    Otherwise the groups of the cell are recorded as restricted groups, and
+    the groups of the page are recorded either as excluded groups (cell
+    restricted to unlogged users) or as additional restricted groups.
+    The entry is saved in all cases.
+    """
+    is_public = bool(cell.page.public and cell.public)
+    indexed_cell.public_access = is_public
+    # always start again from empty group sets
+    indexed_cell.excluded_groups.clear()
+    indexed_cell.restricted_groups.clear()
+    if is_public:
+        indexed_cell.save()
+        return
+    indexed_cell.restricted_groups.set(cell.groups.all())
+    page_groups = cell.page.groups.all()
+    if cell.restricted_to_unlogged:
+        indexed_cell.excluded_groups.set(page_groups)
+    else:
+        for page_group in page_groups:
+            indexed_cell.restricted_groups.add(page_group)
+    indexed_cell.save()
+
+
+@atomic
+def index_site():
+    """Rebuild the whole search index from the cells of the site.
+
+    All existing IndexedCell rows are deleted, then every cell of a
+    non-snapshot page, outside placeholders whose name starts with "_", is
+    indexed in two ways:
+
+    * its search rendering, when not empty, becomes an entry pointing to
+      the page of the cell;
+    * each external link it exposes becomes an entry pointing to the link
+      target; links sharing the same URL are merged into a single entry
+      that keeps the title and access rules of the first link found.
+
+    Cells failing to render are skipped altogether.  The whole rebuild runs
+    in a transaction.
+    """
+    IndexedCell.objects.all().delete()
+    external_urls = {}  # link URL -> index entry created for that URL
+    for klass in CellBase.get_cell_classes():
+        for cell in klass.objects.filter(page__snapshot__isnull=True).exclude(placeholder__startswith='_'):
+            cell_type = ContentType.objects.get_for_model(cell)
+            indexed_cell = IndexedCell(cell_type=cell_type, cell_pk=cell.id)
+            try:
+                indexed_cell.indexed_text = cell.render_for_search()
+            except Exception:  # ignore rendering error
+                continue
+            if indexed_cell.indexed_text:
+                indexed_cell.page_id = cell.page_id
+                indexed_cell.url = cell.page.get_online_url()
+                indexed_cell.title = cell.page.title
+                indexed_cell.save()
+                set_cell_access(indexed_cell, cell)
+
+            for link_data in cell.get_external_links_data():
+                # index external links; entries are looked up by the URL of
+                # the link itself, not by the URL of the page of the cell.
+                link_url = link_data['url']
+                link_text = link_data.get('text') or ''
+                indexed_link = external_urls.get(link_url)
+                if indexed_link is None:
+                    # create an entry for that link; it has to be saved
+                    # before its group relations can be set.
+                    indexed_link = IndexedCell(cell_type=cell_type, cell_pk=cell.id)
+                    indexed_link.save()
+                    set_cell_access(indexed_link, cell)
+                    indexed_link.url = link_url
+                    indexed_link.title = link_data['title']
+                    indexed_link.indexed_text = link_text
+                    external_urls[link_url] = indexed_link
+                else:
+                    # if that link already exists, add detailed texts
+                    indexed_link.indexed_text += ' ' + link_data['title']
+                    indexed_link.indexed_text += ' ' + link_text
+                indexed_link.save()
+
+
+def search_site(request, query):
+    """Search the site index and return the hits visible to the request user.
+
+    With PostgreSQL, full text search is used, with the configuration set
+    in settings.POSTGRESQL_FTS_SEARCH_CONFIG, and hits are ordered by
+    decreasing rank; with other databases a case-insensitive substring
+    match is done on titles and indexed texts.
+
+    Anonymous users only get public entries; other users get entries
+    without restricted groups or restricted to one of their groups, minus
+    entries excluded for one of their groups.
+
+    Returns a list of at most 10 dictionaries with "text", "rank" (None
+    when no rank was computed) and "url" keys, with a single hit per URL.
+    """
+    if connection.vendor == 'postgresql':
+        config = settings.POSTGRESQL_FTS_SEARCH_CONFIG
+        vector = SearchVector('title', config=config, weight='A') + SearchVector('indexed_text', config=config, weight='A')
+        # parse the query with the same configuration as the vector, so both
+        # sides are stemmed the same way.
+        search_query = SearchQuery(query, config=config)
+        qs = IndexedCell.objects.annotate(rank=SearchRank(vector, search_query)).filter(rank__gte=0.3).order_by('-rank')
+    else:
+        qs = IndexedCell.objects.filter(
+                Q(indexed_text__icontains=query) | Q(title__icontains=query))
+    if request.user.is_anonymous:
+        qs = qs.exclude(public_access=False)
+    else:
+        user_groups = request.user.groups.all()
+        qs = qs.filter(
+                Q(restricted_groups=None) |
+                Q(restricted_groups__in=user_groups))
+        qs = qs.exclude(excluded_groups__in=user_groups)
+
+    hits = []
+    seen_urls = set()
+    for hit in qs:
+        # several index entries may point to the same URL, keep the first
+        if hit.url in seen_urls:
+            continue
+        seen_urls.add(hit.url)
+        hits.append({
+            'text': hit.title,
+            'rank': getattr(hit, 'rank', None),
+            'url': hit.url,
+        })
+        if len(hits) == 10:
+            break
+
+    return hits
--- a/combo/data/apps.py
+++ b/combo/data/apps.py
@ -15,23 +15,8 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

 from django.apps import AppConfig
-from django.core.urlresolvers import reverse
-from django.utils.translation import ugettext_lazy as _


 class DataConfig(AppConfig):
    name = 'combo.data'
    verbose_name = 'data'
-
-    def ready(self):
-        # register built-in search engine for page contents
-        from combo.apps.search import engines
-        engines.register(self.get_search_engines)
-
-    def get_search_engines(self):
-        return {
-            '_text': {
-                'url': reverse('api-search') + '?q=%(q)s',
-                'label': _('Page Contents'),
-            }
-        }
--- a/combo/data/migrations/0043_delete_externallinksearchitem.py
+++ b/combo/data/migrations/0043_delete_externallinksearchitem.py
@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11.17 on 2020-01-20 15:30
+from __future__ import unicode_literals
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    """Delete the ExternalLinkSearchItem model."""
+
+    dependencies = [
+        ('data', '0042_page_creation_timestamp'),
+    ]
+
+    operations = [
+        migrations.DeleteModel(
+            name='ExternalLinkSearchItem',
+        ),
+    ]
+    ]
--- a/combo/data/models.py
+++ b/combo/data/models.py
@ -755,10 +755,6 @@ class CellBase(six.with_metaclass(CellMeta, models.Model)):
            return ''
        if self.user_dependant:
            return ''
-        if not self.page.is_visible(user=None):
-            return ''
-        if not self.is_visible(user=None):
-            return ''
        request = RequestFactory().get(self.page.get_online_url())
        request.user = None  # compat
        context = {
@ -1474,18 +1470,6 @@ class ConfigJsonCell(JsonCellBase):
        return context


-class ExternalLinkSearchItem(models.Model):
-    # Link to an external site.
-    #
-    # Those are automatically collected during by the "update_index" command,
-    # that calls get_external_links_data from all available cells, to be used
-    # by the general search engine.
-    title = models.CharField(_('Title'), max_length=150)
-    text = models.TextField(blank=True)
-    url = models.CharField(_('URL'), max_length=200, blank=True)
-    last_update_timestamp = models.DateTimeField(auto_now=True)
-
-
@receiver(pre_save, sender=Page)
 def create_redirects(sender, instance, raw, **kwargs):
    if raw or not instance.id or instance.snapshot_id:
--- a/combo/data/search_indexes.py
+++ b/combo/data/search_indexes.py
@ -1,46 +0,0 @@
-# combo - content management system
-# Copyright (C) 2014-2017  Entr'ouvert
-#
-# This program is free software: you can redistribute it and/or modify it
-# under the terms of the GNU Affero General Public License as published
-# by the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-from haystack import indexes
-from haystack.exceptions import SkipDocument
-
-from .models import Page, CellBase, ExternalLinkSearchItem
-
-class PageIndex(indexes.SearchIndex, indexes.Indexable):
-    title = indexes.CharField(model_attr='title', boost=1.5)
-    text = indexes.CharField(document=True, use_template=True,
-            template_name='combo/search/page.txt')
-    url = indexes.CharField(indexed=False)
-
-    def get_model(self):
-        return Page
-
-    def prepare_url(self, obj):
-        return obj.get_online_url()
-
-    def prepare(self, obj):
-        if not obj.is_visible(user=None):
-            raise SkipDocument()
-        return super(PageIndex, self).prepare(obj)
-
-
-class ExternalLinkSearchIndex(indexes.SearchIndex, indexes.Indexable):
-    title = indexes.CharField(model_attr='title', boost=1.5)
-    text = indexes.CharField(model_attr='text', document=True)
-    url = indexes.CharField(model_attr='url', indexed=False)
-
-    def get_model(self):
-        return ExternalLinkSearchItem
--- a/combo/data/templates/combo/search/page.txt
+++ b/combo/data/templates/combo/search/page.txt
@ -1,7 +0,0 @@
-{% autoescape off %}
-{% for cell in object.get_cells %}
- {% if cell.placeholder|first != '_' %} {# ignore technical placeholders #}
-  {{ cell.render_for_search }}
- {% endif %}
-{% endfor %}
-{% endautoescape %}
--- a/combo/public/urls.py
+++ b/combo/public/urls.py
@ -21,7 +21,6 @@ from . import views

 urlpatterns = [
    url(r'^api/menu-badges/$', views.menu_badges),
-    url(r'^api/search/$', views.api_search, name='api-search'),
    url(r'^ajax/cell/(?P<page_pk>\w+)/(?P<cell_reference>[\w_-]+)/$',
        views.ajax_page_cell, name='combo-public-ajax-page-cell'),
    url(r'^snapshot/(?P<pk>\w+)/$', manager_required(views.snapshot), name='combo-snapshot-view'),
--- a/combo/public/views.py
+++ b/combo/public/views.py
@ -40,9 +40,6 @@ from django.views.decorators.csrf import csrf_exempt
 from django.utils.translation import ugettext as _
 from django.forms.widgets import Media

-from haystack.inputs import AutoQuery
-from haystack.query import SearchQuerySet, SQ
-
 if 'mellon' in settings.INSTALLED_APPS:
    from mellon.utils import get_idps
 else:
@ -577,31 +574,6 @@ def menu_badges(request):
 menu_badges.mellon_no_passive = True


-def api_search(request):
-    for cell in SearchCell.get_cells_by_search_service('_text'):
-        if not cell.is_visible(request.user):
-            continue
-        break
-    else:
-        raise Http404()
-    query = request.GET.get('q') or ''
-    sqs = SearchQuerySet().filter(SQ(content=AutoQuery(query)) | SQ(title=AutoQuery(query)))
-    sqs = sqs.highlight()
-    sqs.load_all()
-    hits = []
-    for hit in sqs:
-        description = None
-        if hit.model_name == 'page' and hit.highlighted['text']:
-            description = '<p>%s</p>' % hit.highlighted['text'][0]
-        hits.append({
-            'text': hit.title,
-            'url': hit.url,
-            'description': description,
-        })
-
-    return HttpResponse(json.dumps({'data': hits}), content_type='application/json')
-
-
 def snapshot(request, *args, **kwargs):
    snapshot = PageSnapshot.objects.get(id=kwargs['pk'])
    return publish_page(request, snapshot.get_page())
--- a/combo/settings.py
+++ b/combo/settings.py
@ -77,7 +77,6 @@ INSTALLED_APPS = (
    'combo.apps.pwa',
    'combo.apps.gallery',
    'combo.apps.kb',
-    'haystack',
    'xstatic.pkg.josefinsans',
    'xstatic.pkg.leaflet',
    'xstatic.pkg.opensans',
@ -189,13 +188,6 @@ CKEDITOR_CONFIGS = {
 CKEDITOR_CONFIGS['small'] = copy.copy(CKEDITOR_CONFIGS['default'])
 CKEDITOR_CONFIGS['small']['height'] = 150

-HAYSTACK_CONNECTIONS = {
-    'default': {
-        'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine',
-        'PATH': os.path.join(BASE_DIR, 'whoosh_index'),
-    },
-}
-
 # from solr.thumbnail -- https://sorl-thumbnail.readthedocs.io/en/latest/reference/settings.html
 THUMBNAIL_PRESERVE_FORMAT = True
 THUMBNAIL_FORCE_OVERWRITE = False
@ -264,6 +256,7 @@ MELLON_IDENTITY_PROVIDERS = []

 # search services
 COMBO_SEARCH_SERVICES = {}
+POSTGRESQL_FTS_SEARCH_CONFIG = 'french'

 # mapping of payment modes
 LINGO_NO_ONLINE_PAYMENT_REASONS = {}
--- a/debian/combo.cron.hourly
+++ b/debian/combo.cron.hourly
@ -2,5 +2,3 @@

 /sbin/runuser -u combo /usr/bin/combo-manage -- tenant_command cron --all-tenants
 /sbin/runuser -u combo /usr/bin/combo-manage -- tenant_command clearsessions --all-tenants
-# update_index cannot be used due to some bug in haystack/whoosh (#30509)
-/sbin/runuser -u combo /usr/bin/combo-manage -- tenant_command rebuild_index --noinput --all-tenants -v0
--- a/debian/control
+++ b/debian/control
@ -21,14 +21,13 @@ Depends: ${misc:Depends}, ${python3:Depends},
    python3-xstatic-opensans,
    python3-xstatic-roboto-fontface (>= 0.5.0.0),
    python3-eopayment (>= 1.35),
-    python3-django-haystack (>= 2.4.0),
    python3-django-ratelimit,
    python3-sorl-thumbnail,
    python3-pil,
    python3-pywebpush,
    python3-pygal,
    python3-lxml
-Recommends: python3-django-mellon, python3-whoosh
+Recommends: python3-django-mellon
 Conflicts: python-lingo
 Breaks: combo (<< 2.34.post2)
 Description: Portal Management System (Python module)
--- a/requirements.txt
+++ b/requirements.txt
@ -11,7 +11,5 @@ XStatic_roboto-fontface
 eopayment>=1.13
 python-dateutil
 djangorestframework>=3.3, <3.7
-django-haystack
-whoosh
 sorl-thumbnail
 pyproj
--- a/setup.py
+++ b/setup.py
@ -163,9 +163,7 @@ setup(
        'eopayment>=1.41',
        'python-dateutil',
        'djangorestframework>=3.3, <3.7',
-        'django-haystack',
        'django-ratelimit<3',
-        'whoosh',
        'sorl-thumbnail',
        'Pillow',
        'pyproj',
--- a/tests/settings.py
+++ b/tests/settings.py
@ -44,9 +44,6 @@ COMBO_DASHBOARD_ENABLED = True
 import tempfile
 MEDIA_ROOT = tempfile.mkdtemp('combo-test')

-HAYSTACK_CONNECTIONS['default']['PATH'] = os.path.join(
-        tempfile.mkdtemp('combo-test-whoosh'))
-
 if 'DISABLE_MIGRATIONS' in os.environ:
    class DisableMigrations(object):
        def __contains__(self, item):
--- a/tests/test_search.py
+++ b/tests/test_search.py
@ -6,17 +6,16 @@ import shutil
 import mock

 from django.conf import settings
+from django.contrib.auth.models import AnonymousUser, User, Group
 from django.test import override_settings
 from django.test.client import RequestFactory
 from django.core.management import call_command
 from django.core.urlresolvers import reverse

-from haystack.exceptions import SkipDocument
-
 from combo.apps.search.engines import engines
-from combo.apps.search.models import SearchCell
+from combo.apps.search.models import SearchCell, IndexedCell
+from combo.apps.search.utils import index_site, search_site
 from combo.data.models import Page, JsonCell, TextCell, MenuCell, LinkCell
-from combo.data.search_indexes import PageIndex

 from .test_manager import login

@ -229,9 +228,9 @@ def test_search_contents():
    page = Page(title='example page', slug='example-page')
    page.save()

-    # no indexation of private cells (is_visible check)
+    # private cells are indexed
    cell = TextCell(page=page, text='foobar', public=False, order=0)
-    assert cell.render_for_search() == ''
+    assert cell.render_for_search().strip() == 'foobar'

    # no indexation of empty cells (is_relevant check)
    cell = TextCell(page=page, text='', order=0)
@ -247,25 +246,20 @@ def test_search_contents():

 def test_search_contents_index():
    page = Page(title='example page', slug='example-page')
+    page.public = True
    page.save()

-    page_index = PageIndex()
-    assert page_index.get_model() is Page
-
-    assert page_index.prepare_url(page) == '/example-page/'
-
-    page_index.prepare(page)
-
-    page.public = False
-    with pytest.raises(SkipDocument):
-        page_index.prepare(page)
-
-    page.public = True
    cell = TextCell(page=page, text='<p>foobar</p>', order=0)
    cell.save()

-    prepared_data = page_index.prepare(page)
-    assert 'foobar' in prepared_data['text']
+    request = RequestFactory().get('/')
+    request.user = AnonymousUser()
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 0
+    index_site()
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 1
+

 def test_search_contents_technical_placeholder():
    page = Page(title='example page', slug='example-page')
@ -274,10 +268,14 @@ def test_search_contents_technical_placeholder():
    TextCell(page=page, text='<p>foobar</p>', order=0, placeholder='_off').save()
    TextCell(page=page, text='<p>barfoo</p>', order=0, placeholder='on').save()

-    page_index = PageIndex()
-    prepared_data = page_index.prepare(page)
-    assert 'barfoo' in prepared_data['text']
-    assert not 'foobar' in prepared_data['text']
+    request = RequestFactory().get('/')
+    request.user = AnonymousUser()
+    index_site()
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 0
+    hits = search_site(request, 'barfoo')
+    assert len(hits) == 1
+

 def test_search_api(app):
    page = Page(title='example page', slug='example-page')
@ -291,70 +289,61 @@ def test_search_api(app):

    cell = TextCell(page=second_page, text='<p>other baz</p>', order=0)
    cell.save()
-
-    page_index = PageIndex()
-    page_index.reindex()
-
-    resp = app.get('/api/search/?q=foobar', status=404)
+    index_site()

    cell = SearchCell(page=page, _search_services={'data': ['_text']}, order=0)
    cell.save()

-    resp = app.get('/api/search/?q=foobar', status=200)
-    assert len(resp.json['data']) == 1
-    assert resp.json['data'][0]['text'] == 'example page'
+    resp = app.get('/ajax/search/%s/_text/?q=foobar' % cell.id, status=200)
+    assert resp.text.count('<li') == 1
+    assert 'example page' in resp.text

-    resp = app.get('/api/search/?q=other', status=200)
-    assert len(resp.json['data']) == 1
-    assert resp.json['data'][0]['text'] == 'second page'
+    resp = app.get('/ajax/search/%s/_text/?q=other' % cell.id, status=200)
+    assert resp.text.count('<li') == 1
+    assert 'second page' in resp.text

-    resp = app.get('/api/search/?q=baz', status=200)
-    assert len(resp.json['data']) == 2
+    resp = app.get('/ajax/search/%s/_text/?q=baz' % cell.id, status=200)
+    assert resp.text.count('<li') == 2

-    resp = app.get('/api/search/?q=quux', status=200)
-    assert len(resp.json['data']) == 0
+    resp = app.get('/ajax/search/%s/_text/?q=quux' % cell.id, status=200)
+    assert resp.text.count('<li') == 0

-def test_update_index_command(app):
-    call_command('clear_index', interactive=False)
-    call_command('update_index') # empty site

+def test_search_external_links(app):
    page = Page(title='example page', slug='example-page')
    page.save()

    cell = SearchCell(page=page, _search_services={'data': ['_text']}, order=0)
    cell.save()

-    call_command('update_index')
-    resp = app.get('/api/search/?q=foobar', status=200)
-    assert len(resp.json['data']) == 0
+    index_site()
+    request = RequestFactory().get('/')
+    request.user = AnonymousUser()
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 0

    LinkCell(title='foobar', url='http://example.net', page=page, order=0).save()
-    call_command('update_index')
+    index_site()

-    resp = app.get('/api/search/?q=foobar', status=200)
-    assert len(resp.json['data']) == 1
-    assert resp.json['data'][0]['text'] == 'foobar'
-    assert resp.json['data'][0]['description'] is None
-    assert resp.json['data'][0]['url'] == 'http://example.net'
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 1
+    assert hits[0]['text'] == 'foobar'
+    assert hits[0]['url'] == 'http://example.net'

+    # second link with same target
    LinkCell(title='baz', url='http://example.net', page=page, order=0).save()
-    call_command('update_index')
-
-    resp = app.get('/api/search/?q=baz', status=200)
-    assert len(resp.json['data']) == 1
-    assert resp.json['data'][0]['url'] == 'http://example.net'
+    index_site()

    # add a second link with the same target
-    LinkCell(title='bar', url='http://example.net', page=page, order=0).save()
-    call_command('update_index')
+    hits = search_site(request, 'baz')
+    assert len(hits) == 1
+    assert hits[0]['text'] in ('foobar', 'baz')
+    assert hits[0]['url'] == 'http://example.net'
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 1
+    assert hits[0]['text'] in ('foobar', 'baz')
+    assert hits[0]['url'] == 'http://example.net'

-    resp = app.get('/api/search/?q=baz', status=200)
-    assert len(resp.json['data']) == 1
-    assert resp.json['data'][0]['url'] == 'http://example.net'
-
-    resp = app.get('/api/search/?q=bar', status=200)
-    assert len(resp.json['data']) == 1
-    assert resp.json['data'][0]['url'] == 'http://example.net'

 def test_manager_search_cell(app, admin_user):
    Page.objects.all().delete()
@ -399,9 +388,6 @@ def test_manager_search_cell(app, admin_user):


 def test_manager_waiting_index_message(app, admin_user):
-    from haystack import connections
-    shutil.rmtree(connections['default'].get_backend().path)
-
    Page.objects.all().delete()
    page = Page(title='One', slug='one', template_name='standard')
    page.save()
@ -417,8 +403,7 @@ def test_manager_waiting_index_message(app, admin_user):
    resp = resp.form.submit().follow()
    assert 'Content indexing has been scheduled' in resp.text

-    os.mkdir(connections['default'].get_backend().path)
-    call_command('update_index')
+    index_site()
    resp = app.get('/manage/pages/%s/' % page.id)
    assert 'Content indexing has been scheduled' not in resp.text

@ -455,3 +440,73 @@ def test_profile_search_engines(app):
        page.save()
        search_engines = engines.get_engines()
        assert 'users' in search_engines.keys()
+
+
+def test_private_search(app):
+    # Check that search_site() results depend on the cell's "public" flag and
+    # on the user attached to the request.
+    page = Page(title='example page', slug='example-page')
+    page.save()
+
+    # two text cells on the same page: 'foobar' is not public, 'barfoo' is
+    TextCell(page=page, text='<p>foobar</p>', order=0, public=False).save()
+    TextCell(page=page, text='<p>barfoo</p>', order=0, public=True).save()
+
+    # anonymous request: only the public cell is expected to be found
+    request = RequestFactory().get('/')
+    request.user = AnonymousUser()
+    index_site()
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 0
+    hits = search_site(request, 'barfoo')
+    assert len(hits) == 1
+
+    # same request object, now carrying a freshly created user: both cells
+    # are expected to be found (the index is not rebuilt in between)
+    request.user = User.objects.create_user(username='normal-user')
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 1
+    hits = search_site(request, 'barfoo')
+    assert len(hits) == 1
+
+
+def test_restricted_search(app):
+    # Check that search_site() results follow the group restrictions set on
+    # cells and on pages, and the cell's restricted_to_unlogged flag.
+    group = Group(name='plop')
+    group.save()
+
+    page = Page(title='example page', slug='example-page')
+    page.save()
+
+    # 'foobar' cell is non-public and tied to the group; 'barfoo' cell is
+    # non-public with no group set
+    cell = TextCell(page=page, text='<p>foobar</p>', order=0, public=False)
+    cell.save()
+    cell.groups.set([group])
+    TextCell(page=page, text='<p>barfoo</p>', order=0, public=False).save()
+    index_site()
+
+    # first cell is restricted, it's not found
+    # (the user created here has not been added to any group yet)
+    request = RequestFactory().get('/')
+    request.user = User.objects.create_user(username='normal-user')
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 0
+    hits = search_site(request, 'barfoo')
+    assert len(hits) == 1
+
+    # tie the page itself to the group, then rebuild the index
+    page.groups.set([group])
+    index_site()
+
+    # page is restricted, no cell is found
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 0
+    hits = search_site(request, 'barfoo')
+    assert len(hits) == 0
+
+    # user is in group, gets a result
+    # (no index_site() call here: only the user's groups changed)
+    request.user.groups.set([group])
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 1
+    hits = search_site(request, 'barfoo')
+    assert len(hits) == 1
+
+    # cell is excluded from group view
+    cell.restricted_to_unlogged = True
+    cell.save()
+    index_site()
+
+    # the flagged 'foobar' cell is no longer found, 'barfoo' still is
+    hits = search_site(request, 'foobar')
+    assert len(hits) == 0
+    hits = search_site(request, 'barfoo')
+    assert len(hits) == 1