Compare commits

...

6 Commits

Author SHA1 Message Date
Pierre Ducroquet af28be9910 sql: test purge of search tokens (#86527)
gitea/wcs/pipeline/head This commit looks good Details
2024-03-19 18:22:59 +01:00
Pierre Ducroquet 475dffdcb5 wcs_search_tokens: new FTS mechanism with fuzzy-match (#86527)
introduce a new mechanism to implement FTS with fuzzy-match.
This is made possible by adding and maintaining a table of the
FTS tokens, wcs_search_tokens, fed with searchable_formdefs
and wcs_all_forms.
When a query is issued, its tokens are matched against the
tokens with a fuzzy match when no direct match is found, and
the query is then rebuilt.
2024-03-19 18:22:59 +01:00
Pierre Ducroquet 7e887c268c tests: add a test for new FTS on formdefs (#86527) 2024-03-19 18:22:59 +01:00
Frédéric Péters 6de8f10127 tests: adjust users datasources to check against correct id (#88364)
gitea/wcs/pipeline/head This commit looks good Details
2024-03-19 17:36:30 +01:00
Frédéric Péters 0ed6455a65 misc: extend default list of forbidden file types (#88352)
gitea/wcs/pipeline/head There was a failure building this commit Details
2024-03-19 17:20:11 +01:00
Frédéric Péters 76b94d7ee8 data sources: export detailed roles infos (#84889)
gitea/wcs/pipeline/head This commit looks good Details
2024-03-19 15:35:22 +01:00
10 changed files with 398 additions and 63 deletions

View File

@ -429,6 +429,9 @@ def test_backoffice_submission_formdef_list_search(pub, local_user, access, auth
resp = get_url('/api/formdefs/?backoffice-submission=on&q=test')
assert len(resp.json['data']) == 2
resp = get_url('/api/formdefs/?backoffice-submission=on&q=tes')
assert len(resp.json['data']) == 2
resp = get_url('/api/formdefs/?backoffice-submission=on&q=xyz')
assert len(resp.json['data']) == 0

View File

@ -1,3 +1,5 @@
import xml.etree.ElementTree as ET
import pytest
from wcs import data_sources
@ -57,11 +59,11 @@ def test_datasource_users(pub):
assert data_sources.get_items({'type': datasource.slug}) == [
(
'1',
str(users[0].id),
'John Doe 0',
'1',
str(users[0].id),
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -72,11 +74,11 @@ def test_datasource_users(pub):
},
),
(
'2',
str(users[1].id),
'John Doe 1',
'2',
str(users[1].id),
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -89,11 +91,11 @@ def test_datasource_users(pub):
]
assert data_sources.get_items(datasource.extended_data_source) == [
(
'1',
str(users[0].id),
'John Doe 0',
'1',
str(users[0].id),
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -104,11 +106,11 @@ def test_datasource_users(pub):
},
),
(
'2',
str(users[1].id),
'John Doe 1',
'2',
str(users[1].id),
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -121,7 +123,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -131,7 +133,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -143,7 +145,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -153,7 +155,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -169,7 +171,7 @@ def test_datasource_users(pub):
datasource.store()
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -179,7 +181,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -191,7 +193,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -201,7 +203,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -223,7 +225,7 @@ def test_datasource_users(pub):
users[0].store()
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -235,7 +237,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -254,7 +256,7 @@ def test_datasource_users(pub):
datasource.store()
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -266,7 +268,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -291,7 +293,7 @@ def test_datasource_users(pub):
assert not datasource.include_disabled_users
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -303,7 +305,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -319,7 +321,7 @@ def test_datasource_users(pub):
datasource.store()
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -329,7 +331,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -341,7 +343,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -351,7 +353,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -365,7 +367,7 @@ def test_datasource_users(pub):
# by uuid
assert datasource.get_structured_value('abc0') == {
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -376,7 +378,7 @@ def test_datasource_users(pub):
}
assert datasource.get_display_value('abc0') == 'John Doe 0'
assert datasource.get_structured_value('abc1') == {
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -388,8 +390,8 @@ def test_datasource_users(pub):
assert datasource.get_display_value('abc1') == 'John Doe 1'
# by id
assert datasource.get_structured_value('1') == {
'id': 1,
assert datasource.get_structured_value(str(users[0].id)) == {
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -398,9 +400,9 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 0',
'user_email': None,
}
assert datasource.get_display_value('1') == 'John Doe 0'
assert datasource.get_structured_value('2') == {
'id': 2,
assert datasource.get_display_value(str(users[0].id)) == 'John Doe 0'
assert datasource.get_structured_value(str(users[1].id)) == {
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -409,11 +411,11 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 1',
'user_email': None,
}
assert datasource.get_display_value('2') == 'John Doe 1'
assert datasource.get_display_value(str(users[1].id)) == 'John Doe 1'
# by numeric id
assert datasource.get_structured_value(1) == {
'id': 1,
assert datasource.get_structured_value(users[0].id) == {
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -422,9 +424,9 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 0',
'user_email': None,
}
assert datasource.get_display_value(1) == 'John Doe 0'
assert datasource.get_structured_value(2) == {
'id': 2,
assert datasource.get_display_value(users[0].id) == 'John Doe 0'
assert datasource.get_structured_value(users[1].id) == {
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -433,7 +435,7 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 1',
'user_email': None,
}
assert datasource.get_display_value(2) == 'John Doe 1'
assert datasource.get_display_value(users[1].id) == 'John Doe 1'
datasource.users_included_roles = [role1.id]
datasource.users_excluded_roles = [role2.id]
@ -445,7 +447,7 @@ def test_datasource_users(pub):
assert datasource.get_structured_value('abc0') is None
assert datasource.get_display_value('abc0') is None
assert datasource.get_structured_value('abc1') == {
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -457,10 +459,10 @@ def test_datasource_users(pub):
assert datasource.get_display_value('abc1') == 'John Doe 1'
# by id
assert datasource.get_structured_value('1') is None
assert datasource.get_display_value('1') is None
assert datasource.get_structured_value('2') == {
'id': 2,
assert datasource.get_structured_value(str(users[0].id)) is None
assert datasource.get_display_value(str(users[0].id)) is None
assert datasource.get_structured_value(str(users[1].id)) == {
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -469,7 +471,7 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 1',
'user_email': None,
}
assert datasource.get_display_value('2') == 'John Doe 1'
assert datasource.get_display_value(str(users[1].id)) == 'John Doe 1'
datasource.include_disabled_users = False
datasource.store()
@ -483,10 +485,10 @@ def test_datasource_users(pub):
assert datasource.get_display_value('abc1') is None
# by id
assert datasource.get_structured_value('1') is None
assert datasource.get_display_value('1') is None
assert datasource.get_structured_value('2') is None
assert datasource.get_display_value('2') is None
assert datasource.get_structured_value(str(users[0].id)) is None
assert datasource.get_display_value(str(users[0].id)) is None
assert datasource.get_structured_value(str(users[1].id)) is None
assert datasource.get_display_value(str(users[1].id)) is None
def test_datasource_users_user_formdef(pub):
@ -509,9 +511,9 @@ def test_datasource_users_user_formdef(pub):
assert data_sources.get_items({'type': datasource.slug}) == [
(
'3',
str(user.id),
'John Doe',
'3',
str(user.id),
{
'user_display_name': 'John Doe',
'user_email': None,
@ -520,8 +522,67 @@ def test_datasource_users_user_formdef(pub):
'user_var_plop': 'Bar',
'user_admin_access': False,
'user_backoffice_access': False,
'id': 3,
'id': user.id,
'text': 'John Doe',
},
)
]
def test_legacy_format_import(pub):
    """Legacy serializations keep bare role ids as <item> children; they must import verbatim."""
    serialization = """<datasource id="255">
  <name>Agents de la ville</name>
  <slug>agents_de_la_ville</slug>
  <data_source>
    <type>wcs:users</type>
    <value />
  </data_source><users_included_roles>
    <item>8201764fc2c24b92bd691fd231a4cf76</item>
  </users_included_roles>
</datasource>"""
    imported = NamedDataSource.import_from_xml_tree(ET.fromstring(serialization))
    assert imported.users_included_roles == ['8201764fc2c24b92bd691fd231a4cf76']
def test_new_format_import(pub):
    """Check <role role-id=... role-slug=...> imports resolve by id, slug, then name.

    The same serialized payload is imported against different role_class
    contents to exercise each resolution path in turn.
    """
    data_source_xml = """<datasource id="255">
  <name>Agents de la ville</name>
  <slug>agents_de_la_ville</slug>
  <data_source>
    <type>wcs:users</type>
    <value />
  </data_source><users_included_roles>
    <role role-id="8201764fc2c24b92bd691fd231a4cf76" role-slug="agent">Agents</role>
  </users_included_roles>
</datasource>"""
    ds = NamedDataSource.import_from_xml_tree(ET.fromstring(data_source_xml))
    assert ds.users_included_roles == []  # role doesn't exist

    # import with id match
    pub.role_class.wipe()
    role1 = pub.role_class(name='role')
    role1.id = '8201764fc2c24b92bd691fd231a4cf76'
    role1.store()
    ds = NamedDataSource.import_from_xml_tree(ET.fromstring(data_source_xml), include_id=True)
    assert ds.users_included_roles == [role1.id]

    # import with slug match (include_id=False ignores the role-id attribute)
    pub.role_class.wipe()
    role1 = pub.role_class(name='Agents')
    role1.slug = 'agent'
    role1.store()
    ds = NamedDataSource.import_from_xml_tree(ET.fromstring(data_source_xml), include_id=False)
    assert ds.users_included_roles == [role1.id]

    # import with name match (role-slug attribute stripped from the payload)
    pub.role_class.wipe()
    role1 = pub.role_class(name='Agents')
    role1.slug = 'agent'
    role1.store()
    ds = NamedDataSource.import_from_xml_tree(
        ET.fromstring(data_source_xml.replace('role-slug="agent"', '')), include_id=False
    )
    assert ds.users_included_roles == [role1.id]

View File

@ -1183,6 +1183,45 @@ def test_sql_criteria_fts(pub):
assert data_class.select([st.FtsMatch(formdata1.id_display)])[0].id_display == formdata1.id_display
def test_search_tokens_purge(pub):
    """Check that purge_obsolete_search_tokens() drops tokens no longer referenced.

    The FTS triggers are append-only, so token counts only grow until the
    purge runs; this test tracks the count relative to a starting baseline.
    """
    _, cur = sql.get_connection_and_cursor()

    # purge garbage from other tests
    sql.purge_obsolete_search_tokens()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    start = cur.fetchone()[0]

    # define a new table; storing the formdef feeds its searchable name
    # into the tokens table (hence +1)
    test_formdef = FormDef()
    test_formdef.name = 'tableSelectFTStokens'
    test_formdef.fields = [fields.StringField(id='3', label='string')]
    test_formdef.store()
    data_class = test_formdef.data_class(mode='sql')

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 1

    # storing a formdata adds its field token (+1)
    t = data_class()
    t.data = {'3': 'foofortokensofcourse'}
    t.just_created()
    t.store()
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 2

    # updating the data adds the new token but keeps the stale one,
    # since the triggers never delete (+1 again)
    t.data = {'3': 'chaussettefortokensofcourse'}
    t.store()
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 3

    # the purge removes the token that is no longer referenced anywhere (-1)
    sql.purge_obsolete_search_tokens()
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 2
def table_exists(cur, table_name):
cur.execute(
'''SELECT COUNT(*) FROM information_schema.tables

View File

@ -717,8 +717,8 @@ class NamedDataSource(XmlStorableObject):
('data_source', 'data_source'),
('notify_on_errors', 'bool'),
('record_on_errors', 'bool'),
('users_included_roles', 'str_list'),
('users_excluded_roles', 'str_list'),
('users_included_roles', 'ds_roles'),
('users_excluded_roles', 'ds_roles'),
('include_disabled_users', 'bool'),
]

View File

@ -485,6 +485,7 @@ class WcsPublisher(QommonPublisher):
for _formdef in FormDef.select() + CardDef.select():
sql.do_formdef_tables(_formdef)
sql.migrate_global_views(conn, cur)
sql.init_search_tokens()
cur.close()
def record_deprecated_usage(self, *args, **kwargs):

View File

@ -1080,6 +1080,15 @@ class FileWithPreviewWidget(CompositeWidget):
'.pif',
'.php',
'.js',
'.pht',
'.phtml',
'.shtml',
'.asa',
'.asax',
'.cer',
'.swf',
'.xap',
'.ps1',
'application/x-ms-dos-executable',
'text/x-php',
]

View File

@ -692,6 +692,11 @@ class QommonPublisher(Publisher):
for error in self.loggederror_class.select(clause=clauses):
self.loggederror_class.remove_object(error.id)
def clean_search_tokens(self, **kwargs):
    """Cron handler: drop obsolete entries from the FTS tokens table.

    The search-token triggers are append-only, so a periodic purge is
    required to make sure no personal data is kept uselessly (see
    wcs.sql.purge_obsolete_search_tokens).
    """
    # local import to avoid a circular dependency at module load time
    from wcs import sql

    sql.purge_obsolete_search_tokens()
@classmethod
def register_cronjobs(cls):
cls.register_cronjob(CronJob(cls.clean_sessions, minutes=[0], name='clean_sessions'))
@ -704,6 +709,9 @@ class QommonPublisher(Publisher):
cls.register_cronjob(
CronJob(cls.clean_loggederrors, hours=[3], minutes=[0], name='clean_loggederrors')
)
cls.register_cronjob(
CronJob(cls.clean_search_tokens, weekdays=[0], hours=[1], minutes=[0], name='clean_search_tokens')
)
_initialized = False

View File

@ -20,7 +20,7 @@ import xml.etree.ElementTree as ET
from quixote import get_publisher
from .misc import indent_xml, xml_node_text
from .storage import Equal, Or, StorableObject
from .storage import Contains, Equal, Or, StorableObject
class XmlStorableObject(StorableObject):
@ -141,6 +141,8 @@ class XmlStorableObject(StorableObject):
def import_roles_from_xml(self, element, include_id=False, **kwargs):
criterias = []
for sub in element:
if sub.tag != 'role':
continue
if include_id and 'role-id' in sub.attrib:
criterias.append(Equal('id', sub.attrib['role-id']))
elif 'role-slug' in sub.attrib:
@ -156,3 +158,22 @@ class XmlStorableObject(StorableObject):
return get_publisher().role_class.select([Or(criterias)], order_by='name')
return lazy_roles
def import_ds_roles_from_xml(self, element, include_id=False, **kwargs):
    """Deserialize a data-source role list, returning the matching role ids.

    Delegates <role> element resolution to import_roles_from_xml(), then
    appends ids from legacy <item>{id}</item> children untouched.
    """
    resolved = self.import_roles_from_xml(element, include_id=include_id, **kwargs)
    if callable(resolved):  # import_roles_from_xml may return a lazy accessor
        resolved = resolved()
    ids = [role.id for role in resolved]
    # legacy support for <item>{id}</item>
    ids.extend(xml_node_text(child) for child in element if child.tag == 'item')
    return ids
def export_ds_roles_to_xml(self, element, attribute_name, include_id=False, **kwargs):
    """Serialize the role ids held in *attribute_name* as <role> sub-elements."""
    role_ids = getattr(self, attribute_name, None) or []
    matching_roles = get_publisher().role_class.select([Contains('id', role_ids)])
    for role in matching_roles:
        node = ET.SubElement(element, 'role')
        # the id is always written, regardless of include_id
        node.attrib['role-id'] = role.id
        node.attrib['role-slug'] = role.slug
        node.text = role.name

View File

@ -96,6 +96,20 @@ SQL_TYPE_MAPPING = {
}
def _table_exists(cur, table_name):
cur.execute('SELECT 1 FROM pg_class WHERE relname = %s', (table_name,))
rows = cur.fetchall()
return len(rows) > 0
def _trigger_exists(cur, table_name, trigger_name):
cur.execute(
'SELECT 1 FROM pg_trigger WHERE tgrelid = %s::regclass AND tgname = %s', (table_name, trigger_name)
)
rows = cur.fetchall()
return len(rows) > 0
class WcsPgConnection(psycopg2.extensions.connection):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -1582,6 +1596,8 @@ def do_global_views(conn, cur):
% (name, category.id)
)
init_search_tokens_triggers(cur)
def clean_global_views(conn, cur):
# Purge of any dead data
@ -1674,11 +1690,178 @@ def init_global_table(conn=None, cur=None):
endpoint_status=endpoint_status_filter,
)
)
init_search_tokens_data(cur)
if own_conn:
cur.close()
def init_search_tokens(conn=None, cur=None):
    """Initialize the search_tokens mechanism.

    It's based on three parts:
     - a token table
     - triggers to feed this table from the tsvectors used in the database
     - a search function that will leverage these tokens to extend the search query.

    So far, the sources used are wcs_all_forms and searchable_formdefs.

    Example: let's say the sources texts are "Tarif d'école" and "La cantine".
    This gives the following tsvectors: ('tarif', 'écol') and ('cantin').
    Our tokens table will have these three words.
    When the search function is launched, it splits the search query and will
    replace unavailable tokens by those close, if available.
    The search query 'tari' will be expanded to 'tarif'.
    The search query 'collège' will remain unchanged (and return nothing).
    If several tokens match or are close enough, the query will be expanded to
    an OR.

    :param conn: optional connection; only used when *cur* is not given.
    :param cur: optional cursor; a new one is opened (and closed) when absent.
    """
    own_cur = False
    if cur is None:
        own_cur = True
        conn, cur = get_connection_and_cursor()

    # Create table
    cur.execute('CREATE TABLE IF NOT EXISTS wcs_search_tokens(token TEXT PRIMARY KEY);')

    # Create triggers
    init_search_tokens_triggers(cur)

    # Fill table
    init_search_tokens_data(cur)

    # Index at the end, small performance trick... not that useful, but it's free...
    # (the trigram index is what makes the fuzzy `%` operator usable)
    cur.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm;')
    cur.execute(
        'CREATE INDEX IF NOT EXISTS wcs_search_tokens_trgm ON wcs_search_tokens USING gin(token gin_trgm_ops);'
    )

    # And last: functions to use this brand new table
    # These two aggregates make the search query far simpler to write
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_or (tsquery) (sfunc=tsquery_or, stype=tsquery);')
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_and (tsquery) (sfunc=tsquery_and, stype=tsquery);')
    cur.execute(
        r"""CREATE OR REPLACE FUNCTION public.wcs_tsquery(text)
 RETURNS tsquery
 LANGUAGE sql
 STABLE
AS $function$
WITH
  tokenized AS (SELECT unnest(regexp_split_to_array($1, '\s+')) w),
  super_tokenized AS (
    -- perfect: tokens that are found as is in table, thus no OR required
    -- partial: tokens found using distance search on tokens table (note: numbers are excluded here)
    -- otherwise: token as is and likely no search result later
    SELECT w,
           coalesce((select plainto_tsquery(perfect.token) FROM wcs_search_tokens AS perfect WHERE perfect.token = plainto_tsquery(w)::text),
                    tsquery_agg_or(plainto_tsquery(partial.token) order by partial.token <-> w desc),
                    plainto_tsquery(w)) tokens
    FROM tokenized
    LEFT JOIN wcs_search_tokens AS partial ON partial.token % w AND w not similar to '%[0-9]{2,}%'
    GROUP BY w)
SELECT tsquery_agg_and(tokens) FROM super_tokenized;
$function$;"""
    )
    if own_cur:
        cur.close()
def init_search_tokens_triggers(cur):
    """Install the append-only triggers that feed wcs_search_tokens.

    We define only appending triggers, ie on INSERT and UPDATE.
    It would be far heavier to maintain deletions here, and having extra data has
    no or marginal side effect on search performances, and absolutely no impact
    on search results.
    Instead, a weekly cron job will delete obsolete entries, thus making it sure no
    personal data is kept uselessly.
    """
    # First part: the appending function (shared by all triggers below)
    cur.execute(
        """CREATE OR REPLACE FUNCTION wcs_search_tokens_trigger_fn ()
 RETURNS trigger
 LANGUAGE plpgsql
AS $function$
BEGIN
    INSERT INTO wcs_search_tokens SELECT unnest(tsvector_to_array(NEW.fts)) ON CONFLICT(token) DO NOTHING;
    RETURN NEW;
END;
$function$;"""
    )

    if not (_table_exists(cur, 'wcs_search_tokens')):
        # abort trigger creation if tokens table doesn't exist yet
        return

    # NOTE: only the *_upd trigger is probed; the _ins trigger is assumed to
    # have been created at the same time
    if _table_exists(cur, 'wcs_all_forms') and not _trigger_exists(
        cur, 'wcs_all_forms', 'wcs_all_forms_fts_trg_upd'
    ):
        # Second part: insert and update triggers for wcs_all_forms
        cur.execute(
            """CREATE TRIGGER wcs_all_forms_fts_trg_ins
         AFTER INSERT ON wcs_all_forms
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
        cur.execute(
            """CREATE TRIGGER wcs_all_forms_fts_trg_upd
         AFTER UPDATE OF fts ON wcs_all_forms
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )

    if _table_exists(cur, 'searchable_formdefs') and not _trigger_exists(
        cur, 'searchable_formdefs', 'searchable_formdefs_fts_trg_upd'
    ):
        # Third part: insert and update triggers for searchable_formdefs
        cur.execute(
            """CREATE TRIGGER searchable_formdefs_fts_trg_ins
         AFTER INSERT ON searchable_formdefs
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
        cur.execute(
            """CREATE TRIGGER searchable_formdefs_fts_trg_upd
         AFTER UPDATE OF fts ON searchable_formdefs
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
def init_search_tokens_data(cur):
    """Seed wcs_search_tokens from the tsvectors already present in the
    source tables (wcs_all_forms and searchable_formdefs).

    Duplicates are ignored thanks to the ON CONFLICT clause, so this is
    safe to run repeatedly.
    """
    if not (_table_exists(cur, 'wcs_search_tokens')):
        # abort table data initialization if tokens table doesn't exist yet
        return

    if _table_exists(cur, 'wcs_all_forms'):
        cur.execute(
            """INSERT INTO wcs_search_tokens
                 SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms
                 ON CONFLICT(token) DO NOTHING;"""
        )
    if _table_exists(cur, 'searchable_formdefs'):
        cur.execute(
            """INSERT INTO wcs_search_tokens
                 SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs
                 ON CONFLICT(token) DO NOTHING;"""
        )
def purge_obsolete_search_tokens(cur=None):
    """Remove tokens that no longer appear in any FTS source table.

    The triggers feeding wcs_search_tokens are append-only; this function is
    run periodically (weekly cron) to drop entries absent from both sources,
    so no personal data is kept uselessly.

    :param cur: optional database cursor; a new one is opened (and closed)
                when not provided.
    """
    own_cur = False
    if cur is None:
        own_cur = True
        _, cur = get_connection_and_cursor()
    # BUGFIX: the second subquery used to check wcs_all_forms twice instead of
    # searchable_formdefs; a token must survive if it is still referenced by
    # EITHER source (cf. init_search_tokens docstring listing both sources).
    cur.execute(
        """DELETE FROM wcs_search_tokens
            WHERE token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms)
              AND token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs);"""
    )
    if own_cur:
        cur.close()
class SqlMixin:
_table_name = None
_numerical_id = True
@ -4809,7 +4992,6 @@ class SearchableFormDef(SqlMixin):
% (cls._table_name, cls._table_name)
)
cls.do_indexes(cur)
cur.close()
from wcs.carddef import CardDef
from wcs.formdef import FormDef
@ -4818,6 +5000,8 @@ class SearchableFormDef(SqlMixin):
CardDef.select(ignore_errors=True), FormDef.select(ignore_errors=True)
):
cls.update(obj=objectdef)
init_search_tokens(cur)
cur.close()
@classmethod
def update(cls, obj=None, removed_obj_type=None, removed_obj_id=None):
@ -4855,7 +5039,7 @@ class SearchableFormDef(SqlMixin):
def search(cls, obj_type, string):
_, cur = get_connection_and_cursor()
cur.execute(
'SELECT object_id FROM searchable_formdefs WHERE fts @@ plainto_tsquery(%s)',
'SELECT object_id FROM searchable_formdefs WHERE fts @@ wcs_tsquery(%s)',
(FtsMatch.get_fts_value(string),),
)
ids = [x[0] for x in cur.fetchall()]
@ -5120,7 +5304,7 @@ def get_period_total(
# latest migration, number + description (description is not used
# programmaticaly but will make sure git conflicts if two migrations are
# separately added with the same number)
SQL_LEVEL = (106, 'add context column to logged_errors table')
SQL_LEVEL = (107, 'new fts mechanism with tokens table')
def migrate_global_views(conn, cur):
@ -5454,6 +5638,10 @@ def migrate():
for formdef in FormDef.select() + CardDef.select():
do_formdef_tables(formdef, rebuild_views=False, rebuild_global_views=False)
if sql_level < 107:
# 107: new fts mechanism with tokens table
init_search_tokens()
if sql_level != SQL_LEVEL[0]:
cur.execute(
'''UPDATE wcs_meta SET value = %s, updated_at=NOW() WHERE key = %s''',

View File

@ -379,6 +379,11 @@ class FtsMatch(Criteria):
return 'fts @@ plainto_tsquery(%%(c%s)s)' % id(self.value)
class WcsFtsMatch(FtsMatch):
    # Variant of FtsMatch whose SQL uses the wcs_tsquery() database function
    # instead of plainto_tsquery(), so query tokens get expanded with fuzzy
    # matches from the wcs_search_tokens table.
    def as_sql(self):
        # id(self.value) mirrors the parameter-naming scheme used by FtsMatch
        return 'fts @@ wcs_tsquery(%%(c%s)s)' % id(self.value)
class ElementEqual(Criteria):
def __init__(self, attribute, key, value, **kwargs):
super().__init__(attribute, value)