Compare commits


2 Commits

Author SHA1 Message Date
Pierre Ducroquet 5cdb00e496 sql: test purge of search tokens (#86527)
2024-03-27 17:21:12 +01:00
Pierre Ducroquet f304da7f09 wcs_search_tokens: new FTS mechanism with fuzzy-match (#86527)
Introduce a new mechanism to implement FTS with fuzzy matching.
This is made possible by adding and maintaining a table of the
FTS tokens, wcs_search_tokens, fed from searchable_formdefs
and wcs_all_forms.
When a query is issued, its tokens are matched against the stored
tokens, falling back to a fuzzy match when no direct match is found,
and the query is then rebuilt.
2024-03-27 17:21:12 +01:00
5 changed files with 248 additions and 3 deletions
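The mechanism in a nutshell: a one-column table keeps every lexeme seen in the source tsvectors, and pg_trgm maps near-miss search words onto known lexemes before the final tsquery is built. A minimal sketch of the lookup idea, with hypothetical data (the % operator and similarity() are standard pg_trgm):

-- Suppose the tokens table holds the lexeme 'tarif' and a user types 'tari':
-- no exact token matches, so close tokens are fetched by trigram similarity.
SELECT token, similarity(token, 'tari') AS sim
  FROM wcs_search_tokens
 WHERE token % 'tari'
 ORDER BY sim DESC;
-- 'tarif' comes back, and the query is rebuilt around it.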

View File

@@ -1183,6 +1183,45 @@ def test_sql_criteria_fts(pub):
    assert data_class.select([st.FtsMatch(formdata1.id_display)])[0].id_display == formdata1.id_display


def test_search_tokens_purge(pub):
    _, cur = sql.get_connection_and_cursor()

    # purge garbage from other tests
    sql.purge_obsolete_search_tokens()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    start = cur.fetchone()[0]

    # define a new table
    test_formdef = FormDef()
    test_formdef.name = 'tableSelectFTStokens'
    test_formdef.fields = [fields.StringField(id='3', label='string')]
    test_formdef.store()
    data_class = test_formdef.data_class(mode='sql')

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 1

    t = data_class()
    t.data = {'3': 'foofortokensofcourse'}
    t.just_created()
    t.store()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 2

    t.data = {'3': 'chaussettefortokensofcourse'}
    t.store()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 3

    sql.purge_obsolete_search_tokens()
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 2


def table_exists(cur, table_name):
    cur.execute(
        '''SELECT COUNT(*) FROM information_schema.tables

View File

@@ -485,6 +485,7 @@ class WcsPublisher(QommonPublisher):
        for _formdef in FormDef.select() + CardDef.select():
            sql.do_formdef_tables(_formdef)
        sql.migrate_global_views(conn, cur)
        sql.init_search_tokens()
        cur.close()

    def record_deprecated_usage(self, *args, **kwargs):

View File

@@ -692,6 +692,11 @@ class QommonPublisher(Publisher):
            for error in self.loggederror_class.select(clause=clauses):
                self.loggederror_class.remove_object(error.id)

    def clean_search_tokens(self, **kwargs):
        from wcs import sql

        sql.purge_obsolete_search_tokens()

    @classmethod
    def register_cronjobs(cls):
        cls.register_cronjob(CronJob(cls.clean_sessions, minutes=[0], name='clean_sessions'))
@@ -704,6 +709,9 @@ class QommonPublisher(Publisher):
        cls.register_cronjob(
            CronJob(cls.clean_loggederrors, hours=[3], minutes=[0], name='clean_loggederrors')
        )
        cls.register_cronjob(
            CronJob(cls.clean_search_tokens, weekdays=[0], hours=[1], minutes=[0], name='clean_search_tokens')
        )

    _initialized = False

View File

@@ -96,6 +96,20 @@ SQL_TYPE_MAPPING = {
}


def _table_exists(cur, table_name):
    cur.execute('SELECT 1 FROM pg_class WHERE relname = %s', (table_name,))
    rows = cur.fetchall()
    return len(rows) > 0


def _trigger_exists(cur, table_name, trigger_name):
    cur.execute(
        'SELECT 1 FROM pg_trigger WHERE tgrelid = %s::regclass AND tgname = %s', (table_name, trigger_name)
    )
    rows = cur.fetchall()
    return len(rows) > 0


class WcsPgConnection(psycopg2.extensions.connection):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -1582,6 +1596,8 @@ def do_global_views(conn, cur):
            % (name, category.id)
        )
    init_search_tokens_triggers(cur)


def clean_global_views(conn, cur):
    # Purge of any dead data
@@ -1674,11 +1690,182 @@ def init_global_table(conn=None, cur=None):
                endpoint_status=endpoint_status_filter,
            )
        )

    init_search_tokens_data(cur)

    if own_conn:
        cur.close()


def init_search_tokens(conn=None, cur=None):
    """Initialize the search_tokens mechanism.

    It is based on three parts:
    - a tokens table
    - triggers feeding this table from the tsvectors used in the database
    - a search function that leverages these tokens to extend the search query

    So far, the sources used are wcs_all_forms and searchable_formdefs.

    Example: say the source texts are "Tarif d'école" and "La cantine".
    This gives the following tsvectors: ('tarif', 'écol') and ('cantin').
    Our tokens table will hold these three words.
    When the search function runs, it splits the search query and replaces
    tokens that have no direct match with close ones, when available.
    The search query 'tari' will be expanded to 'tarif'.
    The search query 'collège' will remain unchanged (and return nothing).
    If several tokens match or are close enough, the query is expanded to
    an OR of them. (See the example queries after this function.)
    """
    own_cur = False
    if cur is None:
        own_cur = True
        conn, cur = get_connection_and_cursor()

    # Create the tokens table
    cur.execute('CREATE TABLE IF NOT EXISTS wcs_search_tokens(token TEXT PRIMARY KEY);')

    # Create the triggers
    init_search_tokens_triggers(cur)

    # Fill the table
    init_search_tokens_data(cur)

    # Create the index at the end, a small performance trick... not that useful, but it's free
    cur.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm;')
    cur.execute(
        'CREATE INDEX IF NOT EXISTS wcs_search_tokens_trgm ON wcs_search_tokens USING gin(token gin_trgm_ops);'
    )

    # And last: functions to use this brand new table.
    # These two aggregates make the search query far simpler to write, allowing an OR/AND of search
    # terms to be expressed directly as an SQL aggregation.
    # They use the tsquery_or and tsquery_and functions that have been included in PostgreSQL since 8.3,
    # but are documented under their operator names, || and &&.
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_or (tsquery) (sfunc=tsquery_or, stype=tsquery);')
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_and (tsquery) (sfunc=tsquery_and, stype=tsquery);')
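    # A hedged illustration (not part of the patch) of what these aggregates do:
    #   SELECT tsquery_agg_or(q) FROM (VALUES ('a'::tsquery), ('b'::tsquery)) t(q);   -- yields 'a' | 'b'
    #   SELECT tsquery_agg_and(q) FROM (VALUES ('a'::tsquery), ('b'::tsquery)) t(q);  -- yields 'a' & 'b'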
    cur.execute(
        r"""CREATE OR REPLACE FUNCTION public.wcs_tsquery(text)
    RETURNS tsquery
    LANGUAGE sql
    STABLE
AS $function$
WITH
  tokenized AS (SELECT unnest(regexp_split_to_array($1, '\s+')) word),
  super_tokenized AS (
    -- perfect: tokens found as-is in the table, thus no OR required
    -- partial: tokens found using distance search on the tokens table (note: numbers are excluded here);
    --          distance search is done using pg_trgm, https://www.postgresql.org/docs/current/pgtrgm.html
    -- otherwise: the token is kept as-is, likely yielding no search result later
    SELECT word,
           coalesce(
               (SELECT plainto_tsquery(perfect.token)
                  FROM wcs_search_tokens AS perfect
                 WHERE perfect.token = plainto_tsquery(word)::text),
               tsquery_agg_or(plainto_tsquery(partial.token)),
               plainto_tsquery(word)) AS tokens
      FROM tokenized
      LEFT JOIN wcs_search_tokens AS partial
             ON partial.token % plainto_tsquery(word)::text
            AND word NOT SIMILAR TO '%[0-9]{2,}%'
     GROUP BY word)
SELECT tsquery_agg_and(tokens) FROM super_tokenized;
$function$;"""
    )

    if own_cur:
        cur.close()
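The docstring example can be replayed against this function. A sketch of the expected behaviour, assuming the tokens table holds 'tarif', 'écol' and 'cantin' and a French text-search configuration (exact lexemes depend on the configuration):

SELECT wcs_tsquery('tari');         -- no exact token, but 'tarif' is close: 'tarif'
SELECT wcs_tsquery('tarif école');  -- both words resolve: roughly 'tarif' & 'écol'
SELECT wcs_tsquery('collège');      -- nothing close enough: kept as-is, no matches later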
def init_search_tokens_triggers(cur):
    # We define appending triggers only, i.e. on INSERT and UPDATE.
    # It would be far heavier to maintain deletions here; keeping extra entries has
    # no or only a marginal effect on search performance, and absolutely no impact
    # on search results.
    # Instead, a weekly cron job deletes obsolete entries, making sure no
    # personal data is kept needlessly.

    # First part: the appending function
    cur.execute(
        """CREATE OR REPLACE FUNCTION wcs_search_tokens_trigger_fn ()
    RETURNS trigger
    LANGUAGE plpgsql
AS $function$
BEGIN
    INSERT INTO wcs_search_tokens SELECT unnest(tsvector_to_array(NEW.fts)) ON CONFLICT(token) DO NOTHING;
    RETURN NEW;
END;
$function$;"""
    )

    if not _table_exists(cur, 'wcs_search_tokens'):
        # abort trigger creation if the tokens table doesn't exist yet
        return

    if _table_exists(cur, 'wcs_all_forms') and not _trigger_exists(
        cur, 'wcs_all_forms', 'wcs_all_forms_fts_trg_upd'
    ):
        # Second part: insert and update triggers for wcs_all_forms
        cur.execute(
            """CREATE TRIGGER wcs_all_forms_fts_trg_ins
         AFTER INSERT ON wcs_all_forms
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
        cur.execute(
            """CREATE TRIGGER wcs_all_forms_fts_trg_upd
         AFTER UPDATE OF fts ON wcs_all_forms
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )

    if _table_exists(cur, 'searchable_formdefs') and not _trigger_exists(
        cur, 'searchable_formdefs', 'searchable_formdefs_fts_trg_upd'
    ):
        # Third part: insert and update triggers for searchable_formdefs
        cur.execute(
            """CREATE TRIGGER searchable_formdefs_fts_trg_ins
         AFTER INSERT ON searchable_formdefs
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
        cur.execute(
            """CREATE TRIGGER searchable_formdefs_fts_trg_upd
         AFTER UPDATE OF fts ON searchable_formdefs
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
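To see the triggers at work, one can update a source row's fts column. A hedged sketch, with a hypothetical row and to_tsvector() standing in for however wcs actually computes fts:

-- Any INSERT, or UPDATE of fts, fires wcs_search_tokens_trigger_fn, which
-- unnests the tsvector and upserts each lexeme into wcs_search_tokens.
UPDATE wcs_all_forms
   SET fts = to_tsvector('tarif d''école')
 WHERE formdef_id = 1;  -- hypothetical row

SELECT token FROM wcs_search_tokens ORDER BY token;
-- now includes 'tarif' and 'écol' (exact lexemes depend on the FTS configuration)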
def init_search_tokens_data(cur):
    if not _table_exists(cur, 'wcs_search_tokens'):
        # abort data initialization if the tokens table doesn't exist yet
        return

    if _table_exists(cur, 'wcs_all_forms'):
        cur.execute(
            """INSERT INTO wcs_search_tokens
                    SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms
               ON CONFLICT(token) DO NOTHING;"""
        )
    if _table_exists(cur, 'searchable_formdefs'):
        cur.execute(
            """INSERT INTO wcs_search_tokens
                    SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs
               ON CONFLICT(token) DO NOTHING;"""
        )


def purge_obsolete_search_tokens(cur=None):
    own_cur = False
    if cur is None:
        own_cur = True
        _, cur = get_connection_and_cursor()
    cur.execute(
        """DELETE FROM wcs_search_tokens
            WHERE token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms)
              AND token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs);"""
    )
    if own_cur:
        cur.close()
class SqlMixin:
    _table_name = None
    _numerical_id = True
@@ -4811,7 +4998,6 @@ class SearchableFormDef(SqlMixin):
            % (cls._table_name, cls._table_name)
        )
        cls.do_indexes(cur)
        cur.close()

        from wcs.carddef import CardDef
        from wcs.formdef import FormDef
@@ -4820,6 +5006,8 @@ class SearchableFormDef(SqlMixin):
            CardDef.select(ignore_errors=True), FormDef.select(ignore_errors=True)
        ):
            cls.update(obj=objectdef)

        init_search_tokens(cur)
        cur.close()

    @classmethod
    def update(cls, obj=None, removed_obj_type=None, removed_obj_id=None):
@@ -4857,7 +5045,7 @@ class SearchableFormDef(SqlMixin):
    def search(cls, obj_type, string):
        _, cur = get_connection_and_cursor()
        cur.execute(
            'SELECT object_id FROM searchable_formdefs WHERE fts @@ plainto_tsquery(%s)',
            'SELECT object_id FROM searchable_formdefs WHERE fts @@ wcs_tsquery(%s)',
            (FtsMatch.get_fts_value(string),),
        )
        ids = [x[0] for x in cur.fetchall()]
@@ -5122,7 +5310,7 @@ def get_period_total(
# latest migration, number + description (the description is not used
# programmatically but will make sure git conflicts if two migrations are
# separately added with the same number)
SQL_LEVEL = (106, 'add context column to logged_errors table')
SQL_LEVEL = (107, 'new fts mechanism with tokens table')


def migrate_global_views(conn, cur):
@@ -5456,6 +5644,10 @@ def migrate():
        for formdef in FormDef.select() + CardDef.select():
            do_formdef_tables(formdef, rebuild_views=False, rebuild_global_views=False)

    if sql_level < 107:
        # 107: new fts mechanism with tokens table
        init_search_tokens()

    if sql_level != SQL_LEVEL[0]:
        cur.execute(
            '''UPDATE wcs_meta SET value = %s, updated_at=NOW() WHERE key = %s''',
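Once the migration has run, the recorded schema level can be checked. A sketch against the wcs_meta table referenced above, assuming its key is named 'sql_level':

SELECT value FROM wcs_meta WHERE key = 'sql_level';  -- expected: 107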

View File

@@ -379,6 +379,11 @@ class FtsMatch(Criteria):
        return 'fts @@ plainto_tsquery(%%(c%s)s)' % id(self.value)


class WcsFtsMatch(FtsMatch):
    def as_sql(self):
        return 'fts @@ wcs_tsquery(%%(c%s)s)' % id(self.value)


class ElementEqual(Criteria):
    def __init__(self, attribute, key, value, **kwargs):
        super().__init__(attribute, value)
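For reference, the only difference between the two criteria is the clause they emit. Against the searchable_formdefs table shown earlier, the generated SQL looks roughly like:

SELECT object_id FROM searchable_formdefs WHERE fts @@ plainto_tsquery('tari');  -- no rows: 'tari' is not a lexeme
SELECT object_id FROM searchable_formdefs WHERE fts @@ wcs_tsquery('tari');      -- matches documents containing 'tarif'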