Compare commits

..

2 Commits

Author SHA1 Message Date
Pierre Ducroquet fcdc2581ca sql: test purge of search tokens (#86527)
gitea/wcs/pipeline/head There was a failure building this commit Details
2024-03-27 17:05:16 +01:00
Pierre Ducroquet 53e3caf83b wcs_search_tokens: new FTS mechanism with fuzzy-match (#86527)
introduce a new mechanism to implement FTS with fuzzy-match.
This is made possible by adding and maintaining a table of the
FTS tokens, wcs_search_tokens, fed with searchable_formdefs
and wcs_all_forms.
When a query is issued, its tokens are matched against the
tokens with a fuzzy match when no direct match is found, and
the query is then rebuilt.
2024-03-27 17:05:16 +01:00
1 changed file with 26 additions and 22 deletions

View File

@ -1738,7 +1738,10 @@ def init_search_tokens(conn=None, cur=None):
)
# And last: functions to use this brand new table
# These two aggregates make the search query far simpler to write
# These two aggregates make the search query far simpler to write, allowing writing an OR/AND of search terms
# directly as an SQL aggregation.
# They use the tsquery_or and tsquery_and functions that are included in PostgreSQL since 8.3, but documented
# under their operator names || and &&.
cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_or (tsquery) (sfunc=tsquery_or, stype=tsquery);')
cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_and (tsquery) (sfunc=tsquery_and, stype=tsquery);')
cur.execute(
@ -1748,18 +1751,19 @@ def init_search_tokens(conn=None, cur=None):
STABLE
AS $function$
WITH
tokenized AS (SELECT unnest(regexp_split_to_array($1, '\s+')) w),
tokenized AS (SELECT unnest(regexp_split_to_array($1, '\s+')) word),
super_tokenized AS (
-- perfect: tokens that are found as is in table, thus no OR required
-- partial: tokens found using distance search on tokens table (note: numbers are excluded here)
-- distance search is done using pg_trgm, https://www.postgresql.org/docs/current/pgtrgm.html
-- otherwise: token as is and likely no search result later
SELECT w,
coalesce((select plainto_tsquery(perfect.token) FROM wcs_search_tokens AS perfect WHERE perfect.token = plainto_tsquery(w)::text),
tsquery_agg_or(plainto_tsquery(partial.token) order by partial.token <-> w desc),
plainto_tsquery(w)) tokens
SELECT word,
coalesce((select plainto_tsquery(perfect.token) FROM wcs_search_tokens AS perfect WHERE perfect.token = plainto_tsquery(word)::text),
tsquery_agg_or(plainto_tsquery(partial.token)),
plainto_tsquery(word)) AS tokens
FROM tokenized
LEFT JOIN wcs_search_tokens AS partial ON partial.token % w AND w not similar to '%[0-9]{2,}%'
GROUP BY w)
LEFT JOIN wcs_search_tokens AS partial ON partial.token % plainto_tsquery(word) AND word not similar to '%[0-9]{2,}%'
GROUP BY word)
SELECT tsquery_agg_and(tokens) FROM super_tokenized;
$function$;"""
)
@ -1798,9 +1802,9 @@ $function$;"""
):
# Second part: insert and update triggers for wcs_all_forms
cur.execute(
"""CREATE TRIGGER wcs_all_forms_fts_trg_ins
AFTER INSERT ON wcs_all_forms
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
"""CREATE TRIGGER wcs_all_forms_fts_trg_ins
AFTER INSERT ON wcs_all_forms
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
)
cur.execute(
@ -1815,15 +1819,15 @@ $function$;"""
):
# Third part: insert and update triggers for searchable_formdefs
cur.execute(
"""CREATE TRIGGER searchable_formdefs_fts_trg_ins
AFTER INSERT ON searchable_formdefs
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
"""CREATE TRIGGER searchable_formdefs_fts_trg_ins
AFTER INSERT ON searchable_formdefs
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
)
cur.execute(
"""CREATE TRIGGER searchable_formdefs_fts_trg_upd
AFTER UPDATE OF fts ON searchable_formdefs
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
"""CREATE TRIGGER searchable_formdefs_fts_trg_upd
AFTER UPDATE OF fts ON searchable_formdefs
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
)
@ -1835,14 +1839,14 @@ def init_search_tokens_data(cur):
if _table_exists(cur, 'wcs_all_forms'):
cur.execute(
"""INSERT INTO wcs_search_tokens
"""INSERT INTO wcs_search_tokens
SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms
ON CONFLICT(token) DO NOTHING;"""
)
if _table_exists(cur, 'searchable_formdefs'):
cur.execute(
"""INSERT INTO wcs_search_tokens
SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs
"""INSERT INTO wcs_search_tokens
SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs
ON CONFLICT(token) DO NOTHING;"""
)
@ -1854,8 +1858,8 @@ def purge_obsolete_search_tokens(cur=None):
_, cur = get_connection_and_cursor()
cur.execute(
"""DELETE FROM wcs_search_tokens
WHERE token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms)
"""DELETE FROM wcs_search_tokens
WHERE token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms)
AND token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs);"""
)
if own_cur: