general: add external links to search results (#20376)

This commit is contained in:
Frédéric Péters 2017-12-26 16:22:29 +01:00
parent c6a915d95f
commit 273f1b0032
8 changed files with 180 additions and 8 deletions

View File

@ -14,15 +14,58 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
from haystack.management.commands.update_index import Command as UpdateIndexCommand
from combo.data.models import Page, ExternalLinkSearchItem
from combo.apps.search.models import SearchCell
class Command(UpdateIndexCommand):
    """Haystack's update_index command, extended to collect external links.

    Before delegating to haystack, every visible cell of every visible page
    is asked for its external links (get_external_links_data()); those are
    persisted as ExternalLinkSearchItem objects so the general search engine
    can index them alongside pages.
    """

    def add_arguments(self, parser):
        # keep the parent command's own options (--age, --batch-size, ...)
        super(Command, self).add_arguments(parser)
        parser.add_argument(
            '--skip-external-links-collection', action='store_true',
            default=False, dest='skip_external_links_collection')

    def handle(self, **options):
        # do not index the site if there's no matching search cell
        if not SearchCell.objects.filter(_search_service='_text').exists():
            return
        if not options.get('skip_external_links_collection', False):
            self.collect_external_links()
        return super(Command, self).handle(**options)

    def collect_external_links(self):
        """Gather links from all visible cells into ExternalLinkSearchItem rows.

        Links are merged by URL: the first seen title becomes the entry
        title, while all titles and additional texts are concatenated into
        the indexed text.  Rows not refreshed by this run are deleted.
        """
        start_time = datetime.datetime.now()
        # assemble external links data, keyed by URL so duplicates merge
        links = {}
        for page in Page.objects.all():
            if not page.is_visible(user=None):
                continue
            for cell in page.get_cells():
                if not cell.is_visible(user=None):
                    continue
                for link_data in cell.get_external_links_data():
                    url = link_data['url']
                    if url not in links:
                        # create an entry for that link; first title wins
                        links[url] = {'title': link_data['title'],
                                      'all_titles': []}
                    # all titles and additional texts will be assembled
                    # and indexed
                    links[url]['all_titles'].append(link_data['title'])
                    links[url]['all_titles'].append(
                            link_data.get('text') or '')
        # save data as ExternalLinkSearchItem objects; saving always touches
        # last_update_timestamp (auto_now), marking the row as still alive
        for link_url, link_data in links.items():
            link_object, created = ExternalLinkSearchItem.objects.get_or_create(
                url=link_url,
                defaults={'title': link_data['title']})
            link_object.title = link_data['title']
            link_object.text = '\n'.join(link_data['all_titles'])
            link_object.save()
        # remove obsolete objects (not refreshed during this run)
        # NOTE(review): start_time is naive; if USE_TZ is enabled auto_now
        # stores aware datetimes -- confirm this comparison is consistent.
        ExternalLinkSearchItem.objects.filter(
                last_update_timestamp__lt=start_time).delete()

View File

@ -82,6 +82,22 @@ class WcsFormCell(CellBase):
return
return self.cached_title
def render_for_search(self):
    # this cell contributes no page text to the full-text index; its
    # content is exposed through get_external_links_data() instead
    return ''
def get_external_links_data(self):
    """Expose this form as a single external link for search indexing.

    Returns an empty list when the cached URL or title is missing;
    description and keywords from the cached JSON feed the indexed text.
    """
    if not (self.cached_url and self.cached_title):
        return []
    extra_text = ''
    if self.cached_json:
        description = self.cached_json.get('description', '')
        keywords = ' '.join(self.cached_json.get('keywords', []))
        extra_text = ('%s %s' % (description, keywords)).strip()
    return [{
        'url': self.cached_url,
        'title': self.cached_title,
        'text': extra_text,
    }]
class WcsCommonCategoryCell(CellBase):
is_enabled = classmethod(is_wcs_enabled)
@ -375,6 +391,18 @@ class WcsFormsOfCategoryCell(WcsCommonCategoryCell, WcsBlurpMixin):
return super(WcsFormsOfCategoryCell, self).render(context)
def render_for_search(self):
    # this cell contributes no page text to the full-text index; its
    # content is exposed through get_external_links_data() instead
    return ''
def get_external_links_data(self):
    """Yield one external-link entry per form of the category.

    Entries come from the synchronous remote data; used by the
    update_index command to index links.
    """
    formdefs = self.get_data({'synchronous': True})
    for site in formdefs.values():
        # 'data' may be missing or None (e.g. remote site did not
        # answer); treat that as no forms instead of crashing
        for formdef in site.get('data') or []:
            yield {
                'url': formdef['url'],
                'title': formdef['title'],
                # description may be absent from the remote payload
                'text': formdef.get('description') or ''}
@register_cell_class
class CategoriesCell(WcsDataBaseCell):

View File

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Create the ExternalLinkSearchItem table in the data application."""

    dependencies = [
        ('data', '0029_auto_20171022_1242'),
    ]

    operations = [
        migrations.CreateModel(
            name='ExternalLinkSearchItem',
            fields=[
                ('id', models.AutoField(
                    verbose_name='ID', serialize=False,
                    auto_created=True, primary_key=True)),
                ('title', models.CharField(
                    max_length=150, verbose_name='Title')),
                ('text', models.TextField(blank=True)),
                ('url', models.CharField(
                    max_length=200, verbose_name='URL', blank=True)),
                ('last_update_timestamp', models.DateTimeField(auto_now=True)),
            ],
        ),
    ]

View File

@ -537,6 +537,9 @@ class CellBase(models.Model):
from HTMLParser import HTMLParser
return HTMLParser().unescape(strip_tags(self.render(context)))
def get_external_links_data(self):
    # default implementation: a cell exposes no external links; cell
    # subclasses (e.g. LinkCell) override this to feed the search index
    return []
@register_cell_class
class TextCell(CellBase):
@ -707,6 +710,17 @@ class LinkCell(CellBase):
from forms import LinkCellForm
return LinkCellForm
def render_for_search(self):
    # this cell contributes no page text to the full-text index; its
    # content is exposed through get_external_links_data() instead
    return ''
def get_external_links_data(self):
    """Expose this link cell as a single external-link entry.

    Returns an empty list when the cell has no URL, or when the computed
    extra context lacks a title or url.
    """
    if not self.url:
        return []
    context = self.get_cell_extra_context({})
    has_required_fields = context.get('title') and context.get('url')
    return [context] if has_required_fields else []
@register_cell_class
class FeedCell(CellBase):
@ -1150,3 +1164,15 @@ class ConfigJsonCell(JsonCellBase):
extra = super(ConfigJsonCell, self).get_cell_extra_context(ctx, **kwargs)
ctx.update(extra)
return ctx
class ExternalLinkSearchItem(models.Model):
    """Link to an external site.

    These objects are collected automatically by the "update_index"
    command, which calls get_external_links_data() on all available
    cells; they are then used by the general search engine.
    """
    title = models.CharField(_('Title'), max_length=150)
    text = models.TextField(blank=True)
    url = models.CharField(_('URL'), max_length=200, blank=True)
    # auto_now: refreshed on every save, so a collection run marks live
    # links and stale rows can be pruned by timestamp afterwards
    last_update_timestamp = models.DateTimeField(auto_now=True)

View File

@ -17,7 +17,7 @@
from haystack import indexes
from haystack.exceptions import SkipDocument
from .models import Page, CellBase
from .models import Page, CellBase, ExternalLinkSearchItem
class PageIndex(indexes.SearchIndex, indexes.Indexable):
title = indexes.CharField(model_attr='title', boost=1.5)
@ -35,3 +35,12 @@ class PageIndex(indexes.SearchIndex, indexes.Indexable):
if not obj.is_visible(user=None):
raise SkipDocument()
return super(PageIndex, self).prepare(obj)
class ExternalLinkSearchIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack index exposing collected external links to search."""

    # boost title matches above plain text matches, same factor as pages
    title = indexes.CharField(model_attr='title', boost=1.5)
    text = indexes.CharField(model_attr='text', document=True)
    # stored for display only, never matched against queries
    url = indexes.CharField(model_attr='url', indexed=False)

    def get_model(self):
        return ExternalLinkSearchItem

View File

@ -409,12 +409,13 @@ def api_search(request):
sqs = searchqueryset.auto_query(query).highlight()
sqs.load_all()
hits = []
for page in sqs:
if page.highlighted['text']:
description = '<p>%s</p>' % page.highlighted['text'][0]
for hit in sqs:
description = None
if hit.model_name == 'page' and hit.highlighted['text']:
description = '<p>%s</p>' % hit.highlighted['text'][0]
hits.append({
'text': page.title,
'url': page.url,
'text': hit.title,
'url': hit.url,
'description': description,
})

View File

@ -6,13 +6,13 @@ import mock
from django.conf import settings
from django.test import Client, override_settings
from django.test.client import RequestFactory
from django.core.management import call_command
from django.core.urlresolvers import reverse
from haystack.exceptions import SkipDocument
from combo.apps.search.management.commands.update_index import Command
from combo.apps.search.models import SearchCell
from combo.data.models import Page, JsonCell, TextCell, MenuCell
from combo.data.models import Page, JsonCell, TextCell, MenuCell, LinkCell
from combo.data.search_indexes import PageIndex
pytestmark = pytest.mark.django_db
@ -224,3 +224,33 @@ def test_search_api(app):
resp = app.get('/api/search/?q=quux', status=200)
assert len(resp.json['data']) == 0
def test_update_index_command(app):
    """update_index collects link cells and serves them via /api/search/."""
    call_command('clear_index', interactive=False)
    call_command('update_index')  # empty site

    def search(query):
        # run a search query and return the list of hits
        return app.get('/api/search/?q=%s' % query, status=200).json['data']

    page = Page(title='example page', slug='example-page')
    page.save()
    SearchCell(page=page, _search_service='_text', order=0).save()
    call_command('update_index')
    assert len(search('foobar')) == 0

    LinkCell(title='foobar', url='http://example.net', page=page, order=0).save()
    call_command('update_index')
    hits = search('foobar')
    assert len(hits) == 1
    assert hits[0]['text'] == 'foobar'
    assert hits[0]['description'] is None
    assert hits[0]['url'] == 'http://example.net'

    # a second cell pointing at the same URL must not duplicate the entry
    LinkCell(title='baz', url='http://example.net', page=page, order=0).save()
    call_command('update_index')
    hits = search('baz')
    assert len(hits) == 1
    assert hits[0]['url'] == 'http://example.net'

View File

@ -243,6 +243,13 @@ def test_form_cell_save_cache():
# make sure cached attributes are removed from serialized pages
assert not 'cached_' in json.dumps(page.get_serialized_page())
# check content provided to search engine
assert cell.render_for_search() == ''
assert cell.get_external_links_data() == [
{'title': 'form title',
'url': 'http://127.0.0.1:8999/form-title/',
'text': ''}]
@wcsctl_present
def test_category_cell_save_cache():
page = Page(title='xxx', slug='test_category_cell_save_cache', template_name='standard')
@ -416,6 +423,10 @@ def test_forms_of_category_cell_render(context):
assert result.index('form title') > result.index('a second form title')
assert 'form title' in result and 'a second form title' in result
# check content provided to search engine
assert cell.render_for_search() == ''
assert len(list(cell.get_external_links_data())) == 2
@wcsctl_present
def test_current_drafts_cell_render_unlogged(context):
page = Page(title='xxx', slug='test_current_drafts_cell_render', template_name='standard')