Compare commits
2 Commits
ac48ebb70d
...
fcdc2581ca
Author | SHA1 | Date |
---|---|---|
Pierre Ducroquet | fcdc2581ca | |
Pierre Ducroquet | 53e3caf83b |
|
@ -1183,6 +1183,45 @@ def test_sql_criteria_fts(pub):
|
||||||
assert data_class.select([st.FtsMatch(formdata1.id_display)])[0].id_display == formdata1.id_display
|
assert data_class.select([st.FtsMatch(formdata1.id_display)])[0].id_display == formdata1.id_display
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_tokens_purge(pub):
    """Check that obsolete entries in wcs_search_tokens are removed by the purge."""
    _, cur = sql.get_connection_and_cursor()

    # purge garbage left over from other tests
    sql.purge_obsolete_search_tokens()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    initial_count = cur.fetchone()[0]

    # define a new table
    formdef = FormDef()
    formdef.name = 'tableSelectFTStokens'
    formdef.fields = [fields.StringField(id='3', label='string')]
    formdef.store()
    data_class = formdef.data_class(mode='sql')

    # storing the formdef adds one token (presumably from its indexed name
    # via the searchable_formdefs trigger — confirm against the triggers)
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == initial_count + 1

    formdata = data_class()
    formdata.data = {'3': 'foofortokensofcourse'}
    formdata.just_created()
    formdata.store()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == initial_count + 2

    # overwrite the field value; the old token is now obsolete but triggers
    # never delete, so the count keeps growing
    formdata.data = {'3': 'chaussettefortokensofcourse'}
    formdata.store()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == initial_count + 3

    # the purge drops the token made obsolete by the update above
    sql.purge_obsolete_search_tokens()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == initial_count + 2
||||||
def table_exists(cur, table_name):
|
def table_exists(cur, table_name):
|
||||||
cur.execute(
|
cur.execute(
|
||||||
'''SELECT COUNT(*) FROM information_schema.tables
|
'''SELECT COUNT(*) FROM information_schema.tables
|
||||||
|
|
|
@ -485,6 +485,7 @@ class WcsPublisher(QommonPublisher):
|
||||||
for _formdef in FormDef.select() + CardDef.select():
|
for _formdef in FormDef.select() + CardDef.select():
|
||||||
sql.do_formdef_tables(_formdef)
|
sql.do_formdef_tables(_formdef)
|
||||||
sql.migrate_global_views(conn, cur)
|
sql.migrate_global_views(conn, cur)
|
||||||
|
sql.init_search_tokens()
|
||||||
cur.close()
|
cur.close()
|
||||||
|
|
||||||
def record_deprecated_usage(self, *args, **kwargs):
|
def record_deprecated_usage(self, *args, **kwargs):
|
||||||
|
|
|
@ -692,6 +692,11 @@ class QommonPublisher(Publisher):
|
||||||
for error in self.loggederror_class.select(clause=clauses):
|
for error in self.loggederror_class.select(clause=clauses):
|
||||||
self.loggederror_class.remove_object(error.id)
|
self.loggederror_class.remove_object(error.id)
|
||||||
|
|
||||||
|
def clean_search_tokens(self, **kwargs):
    """Cron handler: purge search tokens no longer referenced by any fts source."""
    # local import, following the file's deferred-import style
    from wcs import sql

    sql.purge_obsolete_search_tokens()
|
||||||
@classmethod
|
@classmethod
|
||||||
def register_cronjobs(cls):
|
def register_cronjobs(cls):
|
||||||
cls.register_cronjob(CronJob(cls.clean_sessions, minutes=[0], name='clean_sessions'))
|
cls.register_cronjob(CronJob(cls.clean_sessions, minutes=[0], name='clean_sessions'))
|
||||||
|
@ -704,6 +709,9 @@ class QommonPublisher(Publisher):
|
||||||
cls.register_cronjob(
|
cls.register_cronjob(
|
||||||
CronJob(cls.clean_loggederrors, hours=[3], minutes=[0], name='clean_loggederrors')
|
CronJob(cls.clean_loggederrors, hours=[3], minutes=[0], name='clean_loggederrors')
|
||||||
)
|
)
|
||||||
|
cls.register_cronjob(
|
||||||
|
CronJob(cls.clean_search_tokens, weekdays=[0], hours=[1], minutes=[0], name='clean_search_tokens')
|
||||||
|
)
|
||||||
|
|
||||||
_initialized = False
|
_initialized = False
|
||||||
|
|
||||||
|
|
198
wcs/sql.py
198
wcs/sql.py
|
@ -96,6 +96,20 @@ SQL_TYPE_MAPPING = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _table_exists(cur, table_name):
|
||||||
|
cur.execute('SELECT 1 FROM pg_class WHERE relname = %s', (table_name,))
|
||||||
|
rows = cur.fetchall()
|
||||||
|
return len(rows) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def _trigger_exists(cur, table_name, trigger_name):
|
||||||
|
cur.execute(
|
||||||
|
'SELECT 1 FROM pg_trigger WHERE tgrelid = %s::regclass AND tgname = %s', (table_name, trigger_name)
|
||||||
|
)
|
||||||
|
rows = cur.fetchall()
|
||||||
|
return len(rows) > 0
|
||||||
|
|
||||||
|
|
||||||
class WcsPgConnection(psycopg2.extensions.connection):
|
class WcsPgConnection(psycopg2.extensions.connection):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
@ -1582,6 +1596,8 @@ def do_global_views(conn, cur):
|
||||||
% (name, category.id)
|
% (name, category.id)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
init_search_tokens_triggers(cur)
|
||||||
|
|
||||||
|
|
||||||
def clean_global_views(conn, cur):
|
def clean_global_views(conn, cur):
|
||||||
# Purge of any dead data
|
# Purge of any dead data
|
||||||
|
@ -1674,11 +1690,182 @@ def init_global_table(conn=None, cur=None):
|
||||||
endpoint_status=endpoint_status_filter,
|
endpoint_status=endpoint_status_filter,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
init_search_tokens_data(cur)
|
||||||
|
|
||||||
if own_conn:
|
if own_conn:
|
||||||
cur.close()
|
cur.close()
|
||||||
|
|
||||||
|
|
||||||
|
def init_search_tokens(conn=None, cur=None):
    """Initialize the search_tokens mechanism.

    It's based on three parts:
    - a token table
    - triggers to feed this table from the tsvectors used in the database
    - a search function that will leverage these tokens to extend the search query.

    So far, the sources used are wcs_all_forms and searchable_formdefs.

    Example: let's say the source texts are "Tarif d'école" and "La cantine".
    This gives the following tsvectors: ('tarif', 'écol') and ('cantin').
    Our tokens table will hold these three words.
    When the search function is launched, it splits the search query and
    replaces unavailable tokens by close ones, if available.
    The search query 'tari' will be expanded to 'tarif'.
    The search query 'collège' will remain unchanged (and return nothing).
    If several tokens match or are close enough, the query will be expanded
    to an OR.
    """
    cursor_is_local = False
    if cur is None:
        cursor_is_local = True
        conn, cur = get_connection_and_cursor()

    # The tokens table itself.
    cur.execute('CREATE TABLE IF NOT EXISTS wcs_search_tokens(token TEXT PRIMARY KEY);')

    # Triggers feeding the table from the fts source tables.
    init_search_tokens_triggers(cur)

    # Initial fill from the data already present.
    cur.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm;')
    init_search_tokens_data(cur)

    # Index created after the initial fill; a small, mostly free, performance trick.
    cur.execute(
        'CREATE INDEX IF NOT EXISTS wcs_search_tokens_trgm ON wcs_search_tokens USING gin(token gin_trgm_ops);'
    )

    # And last: functions to use this brand new table.
    # These two aggregates make the search query far simpler to write, allowing an
    # OR/AND of search terms to be written directly as an SQL aggregation.
    # They use the tsquery_or and tsquery_and functions that are included in
    # PostgreSQL since 8.3, but documented under their operator names || and &&.
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_or (tsquery) (sfunc=tsquery_or, stype=tsquery);')
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_and (tsquery) (sfunc=tsquery_and, stype=tsquery);')
    cur.execute(
        r"""CREATE OR REPLACE FUNCTION public.wcs_tsquery(text)
 RETURNS tsquery
 LANGUAGE sql
 STABLE
AS $function$
WITH
  tokenized AS (SELECT unnest(regexp_split_to_array($1, '\s+')) word),
  super_tokenized AS (
    -- perfect: tokens that are found as is in table, thus no OR required
    -- partial: tokens found using distance search on tokens table (note: numbers are excluded here)
    --          distance search is done using pg_trgm, https://www.postgresql.org/docs/current/pgtrgm.html
    -- otherwise: token as is and likely no search result later
    SELECT word,
           coalesce((select plainto_tsquery(perfect.token) FROM wcs_search_tokens AS perfect WHERE perfect.token = plainto_tsquery(word)::text),
                    tsquery_agg_or(plainto_tsquery(partial.token)),
                    plainto_tsquery(word)) AS tokens
      FROM tokenized
      LEFT JOIN wcs_search_tokens AS partial ON partial.token % plainto_tsquery(word) AND word not similar to '%[0-9]{2,}%'
     GROUP BY word)
SELECT tsquery_agg_and(tokens) FROM super_tokenized;
$function$;"""
    )

    if cursor_is_local:
        cur.close()
||||||
|
|
||||||
|
def init_search_tokens_triggers(cur):
    """Install the trigger function and the per-table triggers feeding wcs_search_tokens."""
    # Only appending triggers are defined, i.e. on INSERT and UPDATE.
    # Maintaining deletions here would be far heavier, and keeping extra data
    # has no or marginal side effect on search performance, and absolutely no
    # impact on search results.
    # Instead, a weekly cron job deletes obsolete entries, making sure no
    # personal data is kept uselessly.

    # First part: the appending function.
    cur.execute(
        """CREATE OR REPLACE FUNCTION wcs_search_tokens_trigger_fn ()
RETURNS trigger
LANGUAGE plpgsql
AS $function$
BEGIN
    INSERT INTO wcs_search_tokens SELECT unnest(tsvector_to_array(NEW.fts)) ON CONFLICT(token) DO NOTHING;
    RETURN NEW;
END;
$function$;"""
    )

    if not _table_exists(cur, 'wcs_search_tokens'):
        # abort trigger creation if tokens table doesn't exist yet
        return

    # Second and third parts: insert and update triggers on each fts source
    # table, skipped when the table is missing or already wired up.
    for source_table in ('wcs_all_forms', 'searchable_formdefs'):
        if not _table_exists(cur, source_table):
            continue
        if _trigger_exists(cur, source_table, '%s_fts_trg_upd' % source_table):
            continue
        cur.execute(
            """CREATE TRIGGER %(table)s_fts_trg_ins
AFTER INSERT ON %(table)s
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
            % {'table': source_table}
        )
        cur.execute(
            """CREATE TRIGGER %(table)s_fts_trg_upd
AFTER UPDATE OF fts ON %(table)s
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
            % {'table': source_table}
        )
||||||
|
|
||||||
|
def init_search_tokens_data(cur):
    """Seed wcs_search_tokens from the fts columns of the existing source tables."""
    if not _table_exists(cur, 'wcs_search_tokens'):
        # abort data initialization if tokens table doesn't exist yet
        return

    for source_table in ('wcs_all_forms', 'searchable_formdefs'):
        if not _table_exists(cur, source_table):
            continue
        # ON CONFLICT: the two sources may share tokens, keep the table a set
        cur.execute(
            """INSERT INTO wcs_search_tokens
SELECT unnest(tsvector_to_array(fts)) FROM %s
ON CONFLICT(token) DO NOTHING;"""
            % source_table
        )
|
||||||
|
|
||||||
|
def purge_obsolete_search_tokens(cur=None):
    """Delete tokens no longer referenced by any fts source table.

    The triggers only ever append tokens; this purge (run from a weekly cron
    job) removes entries absent from both wcs_all_forms and
    searchable_formdefs, so no personal data is kept uselessly.

    :param cur: optional database cursor; when omitted, one is obtained from
        get_connection_and_cursor() and closed before returning.
    """
    own_cur = False
    if cur is None:
        own_cur = True
        _, cur = get_connection_and_cursor()

    # Fix: the second NOT IN clause used to query wcs_all_forms twice; tokens
    # still referenced only by searchable_formdefs were then wrongly deleted.
    cur.execute(
        """DELETE FROM wcs_search_tokens
WHERE token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms)
  AND token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs);"""
    )
    if own_cur:
        cur.close()
||||||
|
|
||||||
class SqlMixin:
|
class SqlMixin:
|
||||||
_table_name = None
|
_table_name = None
|
||||||
_numerical_id = True
|
_numerical_id = True
|
||||||
|
@ -4811,7 +4998,6 @@ class SearchableFormDef(SqlMixin):
|
||||||
% (cls._table_name, cls._table_name)
|
% (cls._table_name, cls._table_name)
|
||||||
)
|
)
|
||||||
cls.do_indexes(cur)
|
cls.do_indexes(cur)
|
||||||
cur.close()
|
|
||||||
|
|
||||||
from wcs.carddef import CardDef
|
from wcs.carddef import CardDef
|
||||||
from wcs.formdef import FormDef
|
from wcs.formdef import FormDef
|
||||||
|
@ -4820,6 +5006,8 @@ class SearchableFormDef(SqlMixin):
|
||||||
CardDef.select(ignore_errors=True), FormDef.select(ignore_errors=True)
|
CardDef.select(ignore_errors=True), FormDef.select(ignore_errors=True)
|
||||||
):
|
):
|
||||||
cls.update(obj=objectdef)
|
cls.update(obj=objectdef)
|
||||||
|
init_search_tokens(cur)
|
||||||
|
cur.close()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def update(cls, obj=None, removed_obj_type=None, removed_obj_id=None):
|
def update(cls, obj=None, removed_obj_type=None, removed_obj_id=None):
|
||||||
|
@ -4857,7 +5045,7 @@ class SearchableFormDef(SqlMixin):
|
||||||
def search(cls, obj_type, string):
|
def search(cls, obj_type, string):
|
||||||
_, cur = get_connection_and_cursor()
|
_, cur = get_connection_and_cursor()
|
||||||
cur.execute(
|
cur.execute(
|
||||||
'SELECT object_id FROM searchable_formdefs WHERE fts @@ plainto_tsquery(%s)',
|
'SELECT object_id FROM searchable_formdefs WHERE fts @@ wcs_tsquery(%s)',
|
||||||
(FtsMatch.get_fts_value(string),),
|
(FtsMatch.get_fts_value(string),),
|
||||||
)
|
)
|
||||||
ids = [x[0] for x in cur.fetchall()]
|
ids = [x[0] for x in cur.fetchall()]
|
||||||
|
@ -5122,7 +5310,7 @@ def get_period_total(
|
||||||
# latest migration, number + description (description is not used
|
# latest migration, number + description (description is not used
|
||||||
# programmaticaly but will make sure git conflicts if two migrations are
|
# programmaticaly but will make sure git conflicts if two migrations are
|
||||||
# separately added with the same number)
|
# separately added with the same number)
|
||||||
SQL_LEVEL = (106, 'add context column to logged_errors table')
|
SQL_LEVEL = (107, 'new fts mechanism with tokens table')
|
||||||
|
|
||||||
|
|
||||||
def migrate_global_views(conn, cur):
|
def migrate_global_views(conn, cur):
|
||||||
|
@ -5456,6 +5644,10 @@ def migrate():
|
||||||
for formdef in FormDef.select() + CardDef.select():
|
for formdef in FormDef.select() + CardDef.select():
|
||||||
do_formdef_tables(formdef, rebuild_views=False, rebuild_global_views=False)
|
do_formdef_tables(formdef, rebuild_views=False, rebuild_global_views=False)
|
||||||
|
|
||||||
|
if sql_level < 107:
|
||||||
|
# 107: new fts mechanism with tokens table
|
||||||
|
init_search_tokens()
|
||||||
|
|
||||||
if sql_level != SQL_LEVEL[0]:
|
if sql_level != SQL_LEVEL[0]:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
'''UPDATE wcs_meta SET value = %s, updated_at=NOW() WHERE key = %s''',
|
'''UPDATE wcs_meta SET value = %s, updated_at=NOW() WHERE key = %s''',
|
||||||
|
|
|
@ -379,6 +379,11 @@ class FtsMatch(Criteria):
|
||||||
return 'fts @@ plainto_tsquery(%%(c%s)s)' % id(self.value)
|
return 'fts @@ plainto_tsquery(%%(c%s)s)' % id(self.value)
|
||||||
|
|
||||||
|
|
||||||
|
class WcsFtsMatch(FtsMatch):
    """FtsMatch variant whose SQL goes through the wcs_tsquery() function,
    which expands search words using the wcs_search_tokens table."""

    def as_sql(self):
        # same placeholder naming scheme as FtsMatch.as_sql; only the SQL
        # query function differs
        placeholder_id = id(self.value)
        return 'fts @@ wcs_tsquery(%%(c%s)s)' % placeholder_id
||||||
|
|
||||||
class ElementEqual(Criteria):
|
class ElementEqual(Criteria):
|
||||||
def __init__(self, attribute, key, value, **kwargs):
|
def __init__(self, attribute, key, value, **kwargs):
|
||||||
super().__init__(attribute, value)
|
super().__init__(attribute, value)
|
||||||
|
|
Loading…
Reference in New Issue