general: add external links to search results (#20376)

This commit is contained in:
Frédéric Péters 2017-12-26 16:22:29 +01:00
parent c6a915d95f
commit 273f1b0032
8 changed files with 180 additions and 8 deletions

View File

@ -14,15 +14,58 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
from haystack.management.commands.update_index import Command as UpdateIndexCommand
from combo.data.models import Page, ExternalLinkSearchItem
from combo.apps.search.models import SearchCell
class Command(UpdateIndexCommand):
    """Haystack's update_index command, extended to collect external links.

    Before delegating to haystack, every visible cell of every visible page
    is asked for its external links (get_external_links_data()); those are
    persisted as ExternalLinkSearchItem objects so the general search engine
    can index them alongside pages.
    """

    def add_arguments(self, parser):
        # keep the parent command's own options (--age, --batch-size, ...)
        super(Command, self).add_arguments(parser)
        parser.add_argument(
            '--skip-external-links-collection', action='store_true',
            default=False, dest='skip_external_links_collection')

    def handle(self, **options):
        # do not index the site if there's no matching search cell
        if not SearchCell.objects.filter(_search_service='_text').exists():
            return
        if not options.get('skip_external_links_collection', False):
            self.collect_external_links()
        return super(Command, self).handle(**options)

    def collect_external_links(self):
        """Gather links from all visible cells into ExternalLinkSearchItem rows.

        Links are merged by URL: the first seen title becomes the entry
        title, while all titles and additional texts are concatenated into
        the indexed text.  Rows not refreshed by this run are deleted.
        """
        start_time = datetime.datetime.now()
        # assemble external links data, keyed by URL so duplicates merge
        links = {}
        for page in Page.objects.all():
            if not page.is_visible(user=None):
                continue
            for cell in page.get_cells():
                if not cell.is_visible(user=None):
                    continue
                for link_data in cell.get_external_links_data():
                    url = link_data['url']
                    if url not in links:
                        # create an entry for that link; first title wins
                        links[url] = {'title': link_data['title'],
                                      'all_titles': []}
                    # all titles and additional texts will be assembled
                    # and indexed
                    links[url]['all_titles'].append(link_data['title'])
                    links[url]['all_titles'].append(
                            link_data.get('text') or '')
        # save data as ExternalLinkSearchItem objects; saving always touches
        # last_update_timestamp (auto_now), marking the row as still alive
        for link_url, link_data in links.items():
            link_object, created = ExternalLinkSearchItem.objects.get_or_create(
                url=link_url,
                defaults={'title': link_data['title']})
            link_object.title = link_data['title']
            link_object.text = '\n'.join(link_data['all_titles'])
            link_object.save()
        # remove obsolete objects (not refreshed during this run)
        # NOTE(review): start_time is naive; if USE_TZ is enabled auto_now
        # stores aware datetimes -- confirm this comparison is consistent.
        ExternalLinkSearchItem.objects.filter(
                last_update_timestamp__lt=start_time).delete()

View File

@ -82,6 +82,22 @@ class WcsFormCell(CellBase):
return
return self.cached_title
def render_for_search(self):
    # this cell contributes no page text to the full-text index; its
    # content is exposed through get_external_links_data() instead
    return ''
def get_external_links_data(self):
    """Expose this form as a single external link for search indexing.

    Returns an empty list when the cached URL or title is missing;
    description and keywords from the cached JSON feed the indexed text.
    """
    if not (self.cached_url and self.cached_title):
        return []
    extra_text = ''
    if self.cached_json:
        description = self.cached_json.get('description', '')
        keywords = ' '.join(self.cached_json.get('keywords', []))
        extra_text = ('%s %s' % (description, keywords)).strip()
    return [{
        'url': self.cached_url,
        'title': self.cached_title,
        'text': extra_text,
    }]
class WcsCommonCategoryCell(CellBase):
is_enabled = classmethod(is_wcs_enabled)
@ -375,6 +391,18 @@ class WcsFormsOfCategoryCell(WcsCommonCategoryCell, WcsBlurpMixin):
return super(WcsFormsOfCategoryCell, self).render(context)
def render_for_search(self):
    # this cell contributes no page text to the full-text index; its
    # content is exposed through get_external_links_data() instead
    return ''
def get_external_links_data(self):
    """Yield one external-link entry per form of the category.

    Entries come from the synchronous remote data; used by the
    update_index command to index links.
    """
    formdefs = self.get_data({'synchronous': True})
    for site in formdefs.values():
        # 'data' may be missing or None (e.g. remote site did not
        # answer); treat that as no forms instead of crashing
        for formdef in site.get('data') or []:
            yield {
                'url': formdef['url'],
                'title': formdef['title'],
                # description may be absent from the remote payload
                'text': formdef.get('description') or ''}
@register_cell_class
class CategoriesCell(WcsDataBaseCell):

View File

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Create the ExternalLinkSearchItem table in the data application."""

    dependencies = [
        ('data', '0029_auto_20171022_1242'),
    ]

    operations = [
        migrations.CreateModel(
            name='ExternalLinkSearchItem',
            fields=[
                ('id', models.AutoField(
                    verbose_name='ID', serialize=False,
                    auto_created=True, primary_key=True)),
                ('title', models.CharField(
                    max_length=150, verbose_name='Title')),
                ('text', models.TextField(blank=True)),
                ('url', models.CharField(
                    max_length=200, verbose_name='URL', blank=True)),
                ('last_update_timestamp', models.DateTimeField(auto_now=True)),
            ],
        ),
    ]

View File

@ -537,6 +537,9 @@ class CellBase(models.Model):
from HTMLParser import HTMLParser
return HTMLParser().unescape(strip_tags(self.render(context)))
def get_external_links_data(self):
    # default implementation: a cell exposes no external links; cell
    # subclasses (e.g. LinkCell) override this to feed the search index
    return []
@register_cell_class
class TextCell(CellBase):
@ -707,6 +710,17 @@ class LinkCell(CellBase):
from forms import LinkCellForm
return LinkCellForm
def render_for_search(self):
    # this cell contributes no page text to the full-text index; its
    # content is exposed through get_external_links_data() instead
    return ''
def get_external_links_data(self):
    """Expose this link cell as a single external-link entry.

    Returns an empty list when the cell has no URL, or when the computed
    extra context lacks a title or url.
    """
    if not self.url:
        return []
    context = self.get_cell_extra_context({})
    has_required_fields = context.get('title') and context.get('url')
    return [context] if has_required_fields else []
@register_cell_class
class FeedCell(CellBase):
@ -1150,3 +1164,15 @@ class ConfigJsonCell(JsonCellBase):
extra = super(ConfigJsonCell, self).get_cell_extra_context(ctx, **kwargs)
ctx.update(extra)
return ctx
class ExternalLinkSearchItem(models.Model):
    """Link to an external site.

    These objects are collected automatically by the "update_index"
    command, which calls get_external_links_data() on all available
    cells; they are then used by the general search engine.
    """
    title = models.CharField(_('Title'), max_length=150)
    text = models.TextField(blank=True)
    url = models.CharField(_('URL'), max_length=200, blank=True)
    # auto_now: refreshed on every save, so a collection run marks live
    # links and stale rows can be pruned by timestamp afterwards
    last_update_timestamp = models.DateTimeField(auto_now=True)

View File

@ -17,7 +17,7 @@
from haystack import indexes
from haystack.exceptions import SkipDocument
from .models import Page, CellBase
from .models import Page, CellBase, ExternalLinkSearchItem
class PageIndex(indexes.SearchIndex, indexes.Indexable):
title = indexes.CharField(model_attr='title', boost=1.5)
@ -35,3 +35,12 @@ class PageIndex(indexes.SearchIndex, indexes.Indexable):
if not obj.is_visible(user=None):
raise SkipDocument()
return super(PageIndex, self).prepare(obj)
class ExternalLinkSearchIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack index exposing collected external links to search."""

    # boost title matches above plain text matches, same factor as pages
    title = indexes.CharField(model_attr='title', boost=1.5)
    text = indexes.CharField(model_attr='text', document=True)
    # stored for display only, never matched against queries
    url = indexes.CharField(model_attr='url', indexed=False)

    def get_model(self):
        return ExternalLinkSearchItem

View File

@ -409,12 +409,13 @@ def api_search(request):
sqs = searchqueryset.auto_query(query).highlight()
sqs.load_all()
hits = []
for page in sqs:
if page.highlighted['text']:
description = '<p>%s</p>' % page.highlighted['text'][0]
for hit in sqs:
description = None
if hit.model_name == 'page' and hit.highlighted['text']:
description = '<p>%s</p>' % hit.highlighted['text'][0]
hits.append({
'text': page.title,
'url': page.url,
'text': hit.title,
'url': hit.url,
'description': description,
})

View File

@ -6,13 +6,13 @@ import mock
from django.conf import settings
from django.test import Client, override_settings
from django.test.client import RequestFactory
from django.core.management import call_command
from django.core.urlresolvers import reverse
from haystack.exceptions import SkipDocument
from combo.apps.search.management.commands.update_index import Command
from combo.apps.search.models import SearchCell
from combo.data.models import Page, JsonCell, TextCell, MenuCell
from combo.data.models import Page, JsonCell, TextCell, MenuCell, LinkCell
from combo.data.search_indexes import PageIndex
pytestmark = pytest.mark.django_db
@ -224,3 +224,33 @@ def test_search_api(app):
resp = app.get('/api/search/?q=quux', status=200)
assert len(resp.json['data']) == 0
def test_update_index_command(app):
    """update_index collects link cells and serves them via /api/search/."""
    call_command('clear_index', interactive=False)
    call_command('update_index')  # empty site

    def search(query):
        # run a search query and return the list of hits
        return app.get('/api/search/?q=%s' % query, status=200).json['data']

    page = Page(title='example page', slug='example-page')
    page.save()
    SearchCell(page=page, _search_service='_text', order=0).save()
    call_command('update_index')
    assert len(search('foobar')) == 0

    LinkCell(title='foobar', url='http://example.net', page=page, order=0).save()
    call_command('update_index')
    hits = search('foobar')
    assert len(hits) == 1
    assert hits[0]['text'] == 'foobar'
    assert hits[0]['description'] is None
    assert hits[0]['url'] == 'http://example.net'

    # a second cell pointing at the same URL must not duplicate the entry
    LinkCell(title='baz', url='http://example.net', page=page, order=0).save()
    call_command('update_index')
    hits = search('baz')
    assert len(hits) == 1
    assert hits[0]['url'] == 'http://example.net'

View File

@ -243,6 +243,13 @@ def test_form_cell_save_cache():
# make sure cached attributes are removed from serialized pages
assert not 'cached_' in json.dumps(page.get_serialized_page())
# check content provided to search engine
assert cell.render_for_search() == ''
assert cell.get_external_links_data() == [
{'title': 'form title',
'url': 'http://127.0.0.1:8999/form-title/',
'text': ''}]
@wcsctl_present
def test_category_cell_save_cache():
page = Page(title='xxx', slug='test_category_cell_save_cache', template_name='standard')
@ -416,6 +423,10 @@ def test_forms_of_category_cell_render(context):
assert result.index('form title') > result.index('a second form title')
assert 'form title' in result and 'a second form title' in result
# check content provided to search engine
assert cell.render_for_search() == ''
assert len(list(cell.get_external_links_data())) == 2
@wcsctl_present
def test_current_drafts_cell_render_unlogged(context):
page = Page(title='xxx', slug='test_current_drafts_cell_render', template_name='standard')