Compare commits


2 Commits

Author SHA1 Message Date
Pierre Ducroquet 5cdb00e496 sql: test purge of search tokens (#86527)
2024-03-27 17:21:12 +01:00
Pierre Ducroquet f304da7f09 wcs_search_tokens: new FTS mechanism with fuzzy-match (#86527)
Introduce a new mechanism to implement FTS with fuzzy matching.
This is made possible by adding and maintaining a table of the
FTS tokens, wcs_search_tokens, fed from searchable_formdefs
and wcs_all_forms.
When a query is issued, its tokens are matched against the stored
tokens, falling back to a fuzzy match when no direct match is found,
and the query is then rebuilt.
2024-03-27 17:21:12 +01:00
5 changed files with 248 additions and 3 deletions
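The mechanism in a nutshell: a one-column table keeps every lexeme seen in the source tsvectors, and pg_trgm maps near-miss search words onto known lexemes before the final tsquery is built. A minimal sketch of the lookup idea, with hypothetical data (the % operator and similarity() are standard pg_trgm):

-- Suppose the tokens table holds the lexeme 'tarif' and a user types 'tari':
-- no exact token matches, so close tokens are fetched by trigram similarity.
SELECT token, similarity(token, 'tari') AS sim
  FROM wcs_search_tokens
 WHERE token % 'tari'
 ORDER BY sim DESC;
-- 'tarif' comes back, and the query is rebuilt around it.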

View File

@@ -1183,6 +1183,45 @@ def test_sql_criteria_fts(pub):
    assert data_class.select([st.FtsMatch(formdata1.id_display)])[0].id_display == formdata1.id_display


def test_search_tokens_purge(pub):
    _, cur = sql.get_connection_and_cursor()

    # purge garbage from other tests
    sql.purge_obsolete_search_tokens()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    start = cur.fetchone()[0]

    # define a new table
    test_formdef = FormDef()
    test_formdef.name = 'tableSelectFTStokens'
    test_formdef.fields = [fields.StringField(id='3', label='string')]
    test_formdef.store()
    data_class = test_formdef.data_class(mode='sql')

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 1

    t = data_class()
    t.data = {'3': 'foofortokensofcourse'}
    t.just_created()
    t.store()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 2

    t.data = {'3': 'chaussettefortokensofcourse'}
    t.store()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 3

    sql.purge_obsolete_search_tokens()
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 2


def table_exists(cur, table_name):
    cur.execute(
        '''SELECT COUNT(*) FROM information_schema.tables

View File

@@ -485,6 +485,7 @@ class WcsPublisher(QommonPublisher):
        for _formdef in FormDef.select() + CardDef.select():
            sql.do_formdef_tables(_formdef)
        sql.migrate_global_views(conn, cur)
        sql.init_search_tokens()
        cur.close()

    def record_deprecated_usage(self, *args, **kwargs):

View File

@@ -692,6 +692,11 @@ class QommonPublisher(Publisher):
            for error in self.loggederror_class.select(clause=clauses):
                self.loggederror_class.remove_object(error.id)

    def clean_search_tokens(self, **kwargs):
        from wcs import sql

        sql.purge_obsolete_search_tokens()

    @classmethod
    def register_cronjobs(cls):
        cls.register_cronjob(CronJob(cls.clean_sessions, minutes=[0], name='clean_sessions'))
@@ -704,6 +709,9 @@ class QommonPublisher(Publisher):
        cls.register_cronjob(
            CronJob(cls.clean_loggederrors, hours=[3], minutes=[0], name='clean_loggederrors')
        )
        cls.register_cronjob(
            CronJob(cls.clean_search_tokens, weekdays=[0], hours=[1], minutes=[0], name='clean_search_tokens')
        )

    _initialized = False

View File

@@ -96,6 +96,20 @@ SQL_TYPE_MAPPING = {
}


def _table_exists(cur, table_name):
    cur.execute('SELECT 1 FROM pg_class WHERE relname = %s', (table_name,))
    rows = cur.fetchall()
    return len(rows) > 0


def _trigger_exists(cur, table_name, trigger_name):
    cur.execute(
        'SELECT 1 FROM pg_trigger WHERE tgrelid = %s::regclass AND tgname = %s', (table_name, trigger_name)
    )
    rows = cur.fetchall()
    return len(rows) > 0


class WcsPgConnection(psycopg2.extensions.connection):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -1582,6 +1596,8 @@ def do_global_views(conn, cur):
            % (name, category.id)
        )
    init_search_tokens_triggers(cur)


def clean_global_views(conn, cur):
    # Purge of any dead data
@@ -1674,11 +1690,182 @@ def init_global_table(conn=None, cur=None):
                endpoint_status=endpoint_status_filter,
            )
        )

    init_search_tokens_data(cur)

    if own_conn:
        cur.close()


def init_search_tokens(conn=None, cur=None):
    """Initialize the search_tokens mechanism.

    It is based on three parts:
    - a tokens table
    - triggers feeding this table from the tsvectors used in the database
    - a search function that leverages these tokens to extend the search query

    So far, the sources used are wcs_all_forms and searchable_formdefs.

    Example: say the source texts are "Tarif d'école" and "La cantine".
    This gives the following tsvectors: ('tarif', 'écol') and ('cantin').
    Our tokens table will hold these three words.
    When the search function runs, it splits the search query and replaces
    tokens that have no direct match with close ones, when available.
    The search query 'tari' will be expanded to 'tarif'.
    The search query 'collège' will remain unchanged (and return nothing).
    If several tokens match or are close enough, the query is expanded to
    an OR of them. (See the example queries after this function.)
    """
    own_cur = False
    if cur is None:
        own_cur = True
        conn, cur = get_connection_and_cursor()

    # Create the tokens table
    cur.execute('CREATE TABLE IF NOT EXISTS wcs_search_tokens(token TEXT PRIMARY KEY);')

    # Create the triggers
    init_search_tokens_triggers(cur)

    # Fill the table
    init_search_tokens_data(cur)

    # Create the index at the end, a small performance trick... not that useful, but it's free
    cur.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm;')
    cur.execute(
        'CREATE INDEX IF NOT EXISTS wcs_search_tokens_trgm ON wcs_search_tokens USING gin(token gin_trgm_ops);'
    )

    # And last: functions to use this brand new table.
    # These two aggregates make the search query far simpler to write, allowing an OR/AND of search
    # terms to be expressed directly as an SQL aggregation.
    # They use the tsquery_or and tsquery_and functions that have been included in PostgreSQL since 8.3,
    # but are documented under their operator names, || and &&.
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_or (tsquery) (sfunc=tsquery_or, stype=tsquery);')
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_and (tsquery) (sfunc=tsquery_and, stype=tsquery);')
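    # A hedged illustration (not part of the patch) of what these aggregates do:
    #   SELECT tsquery_agg_or(q) FROM (VALUES ('a'::tsquery), ('b'::tsquery)) t(q);   -- yields 'a' | 'b'
    #   SELECT tsquery_agg_and(q) FROM (VALUES ('a'::tsquery), ('b'::tsquery)) t(q);  -- yields 'a' & 'b'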
    cur.execute(
        r"""CREATE OR REPLACE FUNCTION public.wcs_tsquery(text)
    RETURNS tsquery
    LANGUAGE sql
    STABLE
AS $function$
WITH
  tokenized AS (SELECT unnest(regexp_split_to_array($1, '\s+')) word),
  super_tokenized AS (
    -- perfect: tokens found as-is in the table, thus no OR required
    -- partial: tokens found using distance search on the tokens table (note: numbers are excluded here);
    --          distance search is done using pg_trgm, https://www.postgresql.org/docs/current/pgtrgm.html
    -- otherwise: the token is kept as-is, likely yielding no search result later
    SELECT word,
           coalesce(
               (SELECT plainto_tsquery(perfect.token)
                  FROM wcs_search_tokens AS perfect
                 WHERE perfect.token = plainto_tsquery(word)::text),
               tsquery_agg_or(plainto_tsquery(partial.token)),
               plainto_tsquery(word)) AS tokens
      FROM tokenized
      LEFT JOIN wcs_search_tokens AS partial
             ON partial.token % plainto_tsquery(word)::text
            AND word NOT SIMILAR TO '%[0-9]{2,}%'
     GROUP BY word)
SELECT tsquery_agg_and(tokens) FROM super_tokenized;
$function$;"""
    )

    if own_cur:
        cur.close()
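The docstring example can be replayed against this function. A sketch of the expected behaviour, assuming the tokens table holds 'tarif', 'écol' and 'cantin' and a French text-search configuration (exact lexemes depend on the configuration):

SELECT wcs_tsquery('tari');         -- no exact token, but 'tarif' is close: 'tarif'
SELECT wcs_tsquery('tarif école');  -- both words resolve: roughly 'tarif' & 'écol'
SELECT wcs_tsquery('collège');      -- nothing close enough: kept as-is, no matches later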
def init_search_tokens_triggers(cur):
    # We define appending triggers only, i.e. on INSERT and UPDATE.
    # It would be far heavier to maintain deletions here; keeping extra entries has
    # no or only a marginal effect on search performance, and absolutely no impact
    # on search results.
    # Instead, a weekly cron job deletes obsolete entries, making sure no
    # personal data is kept needlessly.

    # First part: the appending function
    cur.execute(
        """CREATE OR REPLACE FUNCTION wcs_search_tokens_trigger_fn ()
    RETURNS trigger
    LANGUAGE plpgsql
AS $function$
BEGIN
    INSERT INTO wcs_search_tokens SELECT unnest(tsvector_to_array(NEW.fts)) ON CONFLICT(token) DO NOTHING;
    RETURN NEW;
END;
$function$;"""
    )

    if not _table_exists(cur, 'wcs_search_tokens'):
        # abort trigger creation if the tokens table doesn't exist yet
        return

    if _table_exists(cur, 'wcs_all_forms') and not _trigger_exists(
        cur, 'wcs_all_forms', 'wcs_all_forms_fts_trg_upd'
    ):
        # Second part: insert and update triggers for wcs_all_forms
        cur.execute(
            """CREATE TRIGGER wcs_all_forms_fts_trg_ins
         AFTER INSERT ON wcs_all_forms
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
        cur.execute(
            """CREATE TRIGGER wcs_all_forms_fts_trg_upd
         AFTER UPDATE OF fts ON wcs_all_forms
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )

    if _table_exists(cur, 'searchable_formdefs') and not _trigger_exists(
        cur, 'searchable_formdefs', 'searchable_formdefs_fts_trg_upd'
    ):
        # Third part: insert and update triggers for searchable_formdefs
        cur.execute(
            """CREATE TRIGGER searchable_formdefs_fts_trg_ins
         AFTER INSERT ON searchable_formdefs
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
        cur.execute(
            """CREATE TRIGGER searchable_formdefs_fts_trg_upd
         AFTER UPDATE OF fts ON searchable_formdefs
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
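To see the triggers at work, one can update a source row's fts column. A hedged sketch, with a hypothetical row and to_tsvector() standing in for however wcs actually computes fts:

-- Any INSERT, or UPDATE of fts, fires wcs_search_tokens_trigger_fn, which
-- unnests the tsvector and upserts each lexeme into wcs_search_tokens.
UPDATE wcs_all_forms
   SET fts = to_tsvector('tarif d''école')
 WHERE formdef_id = 1;  -- hypothetical row

SELECT token FROM wcs_search_tokens ORDER BY token;
-- now includes 'tarif' and 'écol' (exact lexemes depend on the FTS configuration)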
def init_search_tokens_data(cur):
    if not _table_exists(cur, 'wcs_search_tokens'):
        # abort data initialization if the tokens table doesn't exist yet
        return

    if _table_exists(cur, 'wcs_all_forms'):
        cur.execute(
            """INSERT INTO wcs_search_tokens
                    SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms
               ON CONFLICT(token) DO NOTHING;"""
        )
    if _table_exists(cur, 'searchable_formdefs'):
        cur.execute(
            """INSERT INTO wcs_search_tokens
                    SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs
               ON CONFLICT(token) DO NOTHING;"""
        )


def purge_obsolete_search_tokens(cur=None):
    own_cur = False
    if cur is None:
        own_cur = True
        _, cur = get_connection_and_cursor()
    cur.execute(
        """DELETE FROM wcs_search_tokens
            WHERE token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms)
              AND token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs);"""
    )
    if own_cur:
        cur.close()
class SqlMixin:
    _table_name = None
    _numerical_id = True
@@ -4811,7 +4998,6 @@ class SearchableFormDef(SqlMixin):
            % (cls._table_name, cls._table_name)
        )
        cls.do_indexes(cur)
        cur.close()

        from wcs.carddef import CardDef
        from wcs.formdef import FormDef
@@ -4820,6 +5006,8 @@ class SearchableFormDef(SqlMixin):
            CardDef.select(ignore_errors=True), FormDef.select(ignore_errors=True)
        ):
            cls.update(obj=objectdef)

        init_search_tokens(cur)
        cur.close()

    @classmethod
    def update(cls, obj=None, removed_obj_type=None, removed_obj_id=None):
@@ -4857,7 +5045,7 @@ class SearchableFormDef(SqlMixin):
    def search(cls, obj_type, string):
        _, cur = get_connection_and_cursor()
        cur.execute(
            'SELECT object_id FROM searchable_formdefs WHERE fts @@ plainto_tsquery(%s)',
            'SELECT object_id FROM searchable_formdefs WHERE fts @@ wcs_tsquery(%s)',
            (FtsMatch.get_fts_value(string),),
        )
        ids = [x[0] for x in cur.fetchall()]
@@ -5122,7 +5310,7 @@ def get_period_total(
# latest migration, number + description (the description is not used
# programmatically but will make sure git conflicts if two migrations are
# separately added with the same number)
SQL_LEVEL = (106, 'add context column to logged_errors table')
SQL_LEVEL = (107, 'new fts mechanism with tokens table')


def migrate_global_views(conn, cur):
@@ -5456,6 +5644,10 @@ def migrate():
        for formdef in FormDef.select() + CardDef.select():
            do_formdef_tables(formdef, rebuild_views=False, rebuild_global_views=False)

    if sql_level < 107:
        # 107: new fts mechanism with tokens table
        init_search_tokens()

    if sql_level != SQL_LEVEL[0]:
        cur.execute(
            '''UPDATE wcs_meta SET value = %s, updated_at=NOW() WHERE key = %s''',
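Once the migration has run, the recorded schema level can be checked. A sketch against the wcs_meta table referenced above, assuming its key is named 'sql_level':

SELECT value FROM wcs_meta WHERE key = 'sql_level';  -- expected: 107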

View File

@@ -379,6 +379,11 @@ class FtsMatch(Criteria):
        return 'fts @@ plainto_tsquery(%%(c%s)s)' % id(self.value)


class WcsFtsMatch(FtsMatch):
    def as_sql(self):
        return 'fts @@ wcs_tsquery(%%(c%s)s)' % id(self.value)


class ElementEqual(Criteria):
    def __init__(self, attribute, key, value, **kwargs):
        super().__init__(attribute, value)
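For reference, the only difference between the two criteria is the clause they emit. Against the searchable_formdefs table shown earlier, the generated SQL looks roughly like:

SELECT object_id FROM searchable_formdefs WHERE fts @@ plainto_tsquery('tari');  -- no rows: 'tari' is not a lexeme
SELECT object_id FROM searchable_formdefs WHERE fts @@ wcs_tsquery('tari');      -- matches documents containing 'tarif'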