search: better queries for index_site (#40252)

This commit is contained in:
Lauréline Guérin 2020-03-27 17:42:42 +01:00
parent a83ce2c5ee
commit 3464f6d023
No known key found for this signature in database
GPG Key ID: 1FAB9B9B4F93D473
2 changed files with 46 additions and 21 deletions

View File

@ -17,34 +17,54 @@
from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from combo.data.models import CellBase
from django.db import connection
from django.db.models import Q
from django.db.models import Q, Prefetch
from django.db.transaction import atomic
from combo.data.models import Page, CellBase, ValidityInfo
from .models import IndexedCell
# NOTE(review): this span is a rendered diff hunk. Removed and added lines are
# interleaved without +/- markers and indentation has been stripped, so it is
# not runnable as shown. Which side of the diff a line belongs to is inferred
# from the paired statements below -- confirm against the real file.
#
# set_cell_access (presumably the pre-commit helper): derives public_access
# from the page and cell `public` flags, clears both group relations, refills
# them from the cell.groups / cell.page.groups querysets, then saves
# indexed_cell.
def set_cell_access(indexed_cell, cell):
indexed_cell.public_access = bool(cell.page.public and cell.public)
indexed_cell.excluded_groups.clear()
indexed_cell.restricted_groups.clear()
# set_cell_groups (presumably the post-commit helper): gathers groups in plain
# lists read from the `prefetched_groups` attributes and writes each relation
# with a single add(*groups) call. It neither sets public_access nor calls
# save(); it only reads indexed_cell.public_access.
def set_cell_groups(indexed_cell, cell):
restricted_groups = []
excluded_groups = []
# Group restrictions are only recorded for entries that are not public.
if not indexed_cell.public_access:
indexed_cell.restricted_groups.set(cell.groups.all())
restricted_groups = cell.prefetched_groups
# For cells restricted to unlogged users the page's groups are recorded as
# excluded groups; otherwise they are added to the restricted groups.
if cell.restricted_to_unlogged:
indexed_cell.excluded_groups.set(cell.page.groups.all())
excluded_groups = cell.page.prefetched_groups
else:
for group in cell.page.groups.all():
indexed_cell.restricted_groups.add(group)
indexed_cell.save()
# NOTE(review): restricted_groups was bound to cell.prefetched_groups itself
# above, so this append() would extend the cell's own prefetched list (and
# repeat the page groups if the helper runs again for the same cell). Copying
# first, e.g. list(cell.prefetched_groups), would avoid that -- confirm.
for group in cell.page.prefetched_groups:
restricted_groups.append(group)
# Relations are only written when there is something to add.
if restricted_groups:
indexed_cell.restricted_groups.add(*restricted_groups)
if excluded_groups:
indexed_cell.excluded_groups.add(*excluded_groups)
# index_site: rebuilds the search index under @atomic. All IndexedCell rows are
# deleted, then cells of every class returned by CellBase.get_cell_classes()
# are indexed again, followed by entries for the external links each cell
# reports through get_external_links_data().
#
# NOTE(review): this span is a rendered diff hunk. Removed and added lines are
# interleaved without +/- markers and indentation has been stripped, so some
# statements appear twice in their old and new form.
@atomic
def index_site():
cell_classes = list(CellBase.get_cell_classes())
# populate ContentType cache
ContentType.objects.get_for_models(*cell_classes)
IndexedCell.objects.all().delete()
# Entries created for external links, keyed by URL when they are stored below.
external_urls = {}
for klass in CellBase.get_cell_classes():
for cell in klass.objects.filter(page__snapshot__isnull=True, page__sub_slug='').exclude(placeholder__startswith='_'):
# ValidityInfo rows are loaded once here and matched to cells in memory below.
validity_info_list = list(ValidityInfo.objects.select_related('content_type'))
# Pages are loaded once, with their groups prefetched into
# `prefetched_groups`, and indexed by primary key.
pages_by_pk = {
p.pk: p for p in (
Page.objects
.prefetch_related(Prefetch('groups', to_attr='prefetched_groups')))}
for klass in cell_classes:
# Same filters as the single-line queryset above, plus a prefetch of the
# cell's groups into `prefetched_groups`.
queryset = (
klass.objects
.filter(page__snapshot__isnull=True, page__sub_slug='')
.exclude(placeholder__startswith='_')
.prefetch_related(
Prefetch('groups', to_attr='prefetched_groups')))
for cell in queryset:
# Attach the validity rows whose object_id and content type match this cell.
cell.prefetched_validity_info = [
v for v in validity_info_list
if v.object_id == cell.pk and v.content_type.model_class() == cell.__class__]
# Use the page object loaded above (it carries prefetched_groups).
cell.page = pages_by_pk.get(cell.page_id)
cell_type = ContentType.objects.get_for_model(cell)
indexed_cell = IndexedCell(cell_type=cell_type, cell_pk=cell.id)
try:
# (diff hunk boundary: the body of the try block is not shown in this view)
@ -52,29 +72,34 @@ def index_site():
except Exception: # ignore rendering error
continue
# Only cells that produced some indexed text get an entry for their page.
if indexed_cell.indexed_text:
indexed_cell.public_access = bool(cell.page.public and cell.public)
indexed_cell.page_id = cell.page_id
indexed_cell.url = cell.page.get_online_url()
indexed_cell.title = cell.page.title
indexed_cell.save()
set_cell_access(indexed_cell, cell)
set_cell_groups(indexed_cell, cell)
for link_data in cell.get_external_links_data():
# index external links
# NOTE(review): the lookup key is indexed_cell.url, i.e. the URL of whichever
# entry was handled just before, while entries are stored under the link URL
# further down. The key was probably meant to be link_data['url'] -- confirm.
indexed_cell = external_urls.get(indexed_cell.url)
if indexed_cell is None:
# create an entry for that link.
indexed_cell = IndexedCell(cell_type=cell_type, cell_pk=cell.id)
indexed_cell = IndexedCell(
cell_type=cell_type,
cell_pk=cell.id,
public_access=bool(cell.page.public and cell.public),
url=link_data['url'],
title=link_data['title'],
indexed_text=link_data.get('text') or '',
)
indexed_cell.save()
set_cell_access(indexed_cell, cell)
indexed_cell.url = link_data['url']
indexed_cell.title = link_data['title']
indexed_cell.indexed_text = link_data.get('text') or ''
set_cell_groups(indexed_cell, cell)
external_urls[indexed_cell.url] = indexed_cell
else:
# if that link already exists, add detailed texts
indexed_cell.indexed_text += ' ' + link_data['title']
# NOTE(review): `+` binds tighter than `or`, so the next line evaluates
# (' ' + link_data.get('text')) or ''. A missing or None text raises TypeError
# instead of falling back to ''. The intended form is probably
# ' ' + (link_data.get('text') or '').
indexed_cell.indexed_text += ' ' + link_data.get('text') or ''
indexed_cell.save()
indexed_cell.save()
def search_site(request, query):

View File

@ -544,4 +544,4 @@ def test_index_site_num_queries(app):
index_site() # populate cache
with CaptureQueriesContext(connection) as ctx:
index_site()
assert len(ctx.captured_queries) == 591
assert len(ctx.captured_queries) == 195