Compare commits

...

6 Commits

Author SHA1 Message Date
Pierre Ducroquet af28be9910 sql: test purge of search tokens (#86527)
gitea/wcs/pipeline/head This commit looks good Details
2024-03-19 18:22:59 +01:00
Pierre Ducroquet 475dffdcb5 wcs_search_tokens: new FTS mechanism with fuzzy-match (#86527)
introduce a new mechanism to implement FTS with fuzzy-match.
This is made possible by adding and maintaining a table of the
FTS tokens, wcs_search_tokens, fed with searchable_formdefs
and wcs_all_forms.
When a query is issued, its tokens are matched against the
tokens with a fuzzy match when no direct match is found, and
the query is then rebuilt.
2024-03-19 18:22:59 +01:00
Pierre Ducroquet 7e887c268c tests: add a test for new FTS on formdefs (#86527) 2024-03-19 18:22:59 +01:00
Frédéric Péters 6de8f10127 tests: adjust users datasources to check against correct id (#88364)
gitea/wcs/pipeline/head This commit looks good Details
2024-03-19 17:36:30 +01:00
Frédéric Péters 0ed6455a65 misc: extend default list of forbidden file types (#88352)
gitea/wcs/pipeline/head There was a failure building this commit Details
2024-03-19 17:20:11 +01:00
Frédéric Péters 76b94d7ee8 data sources: export detailed roles infos (#84889)
gitea/wcs/pipeline/head This commit looks good Details
2024-03-19 15:35:22 +01:00
10 changed files with 398 additions and 63 deletions

View File

@ -429,6 +429,9 @@ def test_backoffice_submission_formdef_list_search(pub, local_user, access, auth
resp = get_url('/api/formdefs/?backoffice-submission=on&q=test')
assert len(resp.json['data']) == 2
resp = get_url('/api/formdefs/?backoffice-submission=on&q=tes')
assert len(resp.json['data']) == 2
resp = get_url('/api/formdefs/?backoffice-submission=on&q=xyz')
assert len(resp.json['data']) == 0

View File

@ -1,3 +1,5 @@
import xml.etree.ElementTree as ET
import pytest
from wcs import data_sources
@ -57,11 +59,11 @@ def test_datasource_users(pub):
assert data_sources.get_items({'type': datasource.slug}) == [
(
'1',
str(users[0].id),
'John Doe 0',
'1',
str(users[0].id),
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -72,11 +74,11 @@ def test_datasource_users(pub):
},
),
(
'2',
str(users[1].id),
'John Doe 1',
'2',
str(users[1].id),
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -89,11 +91,11 @@ def test_datasource_users(pub):
]
assert data_sources.get_items(datasource.extended_data_source) == [
(
'1',
str(users[0].id),
'John Doe 0',
'1',
str(users[0].id),
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -104,11 +106,11 @@ def test_datasource_users(pub):
},
),
(
'2',
str(users[1].id),
'John Doe 1',
'2',
str(users[1].id),
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -121,7 +123,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -131,7 +133,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -143,7 +145,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -153,7 +155,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -169,7 +171,7 @@ def test_datasource_users(pub):
datasource.store()
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -179,7 +181,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -191,7 +193,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -201,7 +203,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -223,7 +225,7 @@ def test_datasource_users(pub):
users[0].store()
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -235,7 +237,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -254,7 +256,7 @@ def test_datasource_users(pub):
datasource.store()
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -266,7 +268,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -291,7 +293,7 @@ def test_datasource_users(pub):
assert not datasource.include_disabled_users
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -303,7 +305,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -319,7 +321,7 @@ def test_datasource_users(pub):
datasource.store()
assert data_sources.get_structured_items({'type': datasource.slug}) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -329,7 +331,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -341,7 +343,7 @@ def test_datasource_users(pub):
]
assert data_sources.get_structured_items(datasource.extended_data_source) == [
{
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -351,7 +353,7 @@ def test_datasource_users(pub):
'user_email': None,
},
{
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -365,7 +367,7 @@ def test_datasource_users(pub):
# by uuid
assert datasource.get_structured_value('abc0') == {
'id': 1,
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -376,7 +378,7 @@ def test_datasource_users(pub):
}
assert datasource.get_display_value('abc0') == 'John Doe 0'
assert datasource.get_structured_value('abc1') == {
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -388,8 +390,8 @@ def test_datasource_users(pub):
assert datasource.get_display_value('abc1') == 'John Doe 1'
# by id
assert datasource.get_structured_value('1') == {
'id': 1,
assert datasource.get_structured_value(str(users[0].id)) == {
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -398,9 +400,9 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 0',
'user_email': None,
}
assert datasource.get_display_value('1') == 'John Doe 0'
assert datasource.get_structured_value('2') == {
'id': 2,
assert datasource.get_display_value(str(users[0].id)) == 'John Doe 0'
assert datasource.get_structured_value(str(users[1].id)) == {
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -409,11 +411,11 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 1',
'user_email': None,
}
assert datasource.get_display_value('2') == 'John Doe 1'
assert datasource.get_display_value(str(users[1].id)) == 'John Doe 1'
# by numeric id
assert datasource.get_structured_value(1) == {
'id': 1,
assert datasource.get_structured_value(users[0].id) == {
'id': users[0].id,
'text': 'John Doe 0',
'user_name_identifier_0': 'abc0',
'user_nameid': 'abc0',
@ -422,9 +424,9 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 0',
'user_email': None,
}
assert datasource.get_display_value(1) == 'John Doe 0'
assert datasource.get_structured_value(2) == {
'id': 2,
assert datasource.get_display_value(users[0].id) == 'John Doe 0'
assert datasource.get_structured_value(users[1].id) == {
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -433,7 +435,7 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 1',
'user_email': None,
}
assert datasource.get_display_value(2) == 'John Doe 1'
assert datasource.get_display_value(users[1].id) == 'John Doe 1'
datasource.users_included_roles = [role1.id]
datasource.users_excluded_roles = [role2.id]
@ -445,7 +447,7 @@ def test_datasource_users(pub):
assert datasource.get_structured_value('abc0') is None
assert datasource.get_display_value('abc0') is None
assert datasource.get_structured_value('abc1') == {
'id': 2,
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -457,10 +459,10 @@ def test_datasource_users(pub):
assert datasource.get_display_value('abc1') == 'John Doe 1'
# by id
assert datasource.get_structured_value('1') is None
assert datasource.get_display_value('1') is None
assert datasource.get_structured_value('2') == {
'id': 2,
assert datasource.get_structured_value(str(users[0].id)) is None
assert datasource.get_display_value(str(users[0].id)) is None
assert datasource.get_structured_value(str(users[1].id)) == {
'id': users[1].id,
'text': 'John Doe 1',
'user_name_identifier_0': 'abc1',
'user_nameid': 'abc1',
@ -469,7 +471,7 @@ def test_datasource_users(pub):
'user_display_name': 'John Doe 1',
'user_email': None,
}
assert datasource.get_display_value('2') == 'John Doe 1'
assert datasource.get_display_value(str(users[1].id)) == 'John Doe 1'
datasource.include_disabled_users = False
datasource.store()
@ -483,10 +485,10 @@ def test_datasource_users(pub):
assert datasource.get_display_value('abc1') is None
# by id
assert datasource.get_structured_value('1') is None
assert datasource.get_display_value('1') is None
assert datasource.get_structured_value('2') is None
assert datasource.get_display_value('2') is None
assert datasource.get_structured_value(str(users[0].id)) is None
assert datasource.get_display_value(str(users[0].id)) is None
assert datasource.get_structured_value(str(users[1].id)) is None
assert datasource.get_display_value(str(users[1].id)) is None
def test_datasource_users_user_formdef(pub):
@ -509,9 +511,9 @@ def test_datasource_users_user_formdef(pub):
assert data_sources.get_items({'type': datasource.slug}) == [
(
'3',
str(user.id),
'John Doe',
'3',
str(user.id),
{
'user_display_name': 'John Doe',
'user_email': None,
@ -520,8 +522,67 @@ def test_datasource_users_user_formdef(pub):
'user_var_plop': 'Bar',
'user_admin_access': False,
'user_backoffice_access': False,
'id': 3,
'id': user.id,
'text': 'John Doe',
},
)
]
def test_legacy_format_import(pub):
    """Legacy serializations keep bare role ids as <item> children; they must import verbatim."""
    serialization = """<datasource id="255">
  <name>Agents de la ville</name>
  <slug>agents_de_la_ville</slug>
  <data_source>
    <type>wcs:users</type>
    <value />
  </data_source><users_included_roles>
    <item>8201764fc2c24b92bd691fd231a4cf76</item>
  </users_included_roles>
</datasource>"""
    imported = NamedDataSource.import_from_xml_tree(ET.fromstring(serialization))
    assert imported.users_included_roles == ['8201764fc2c24b92bd691fd231a4cf76']
def test_new_format_import(pub):
    """Check <role role-id=... role-slug=...> imports resolve by id, slug, then name.

    The same serialized payload is imported against different role_class
    contents to exercise each resolution path in turn.
    """
    data_source_xml = """<datasource id="255">
  <name>Agents de la ville</name>
  <slug>agents_de_la_ville</slug>
  <data_source>
    <type>wcs:users</type>
    <value />
  </data_source><users_included_roles>
    <role role-id="8201764fc2c24b92bd691fd231a4cf76" role-slug="agent">Agents</role>
  </users_included_roles>
</datasource>"""
    ds = NamedDataSource.import_from_xml_tree(ET.fromstring(data_source_xml))
    assert ds.users_included_roles == []  # role doesn't exist

    # import with id match
    pub.role_class.wipe()
    role1 = pub.role_class(name='role')
    role1.id = '8201764fc2c24b92bd691fd231a4cf76'
    role1.store()
    ds = NamedDataSource.import_from_xml_tree(ET.fromstring(data_source_xml), include_id=True)
    assert ds.users_included_roles == [role1.id]

    # import with slug match (include_id=False ignores the role-id attribute)
    pub.role_class.wipe()
    role1 = pub.role_class(name='Agents')
    role1.slug = 'agent'
    role1.store()
    ds = NamedDataSource.import_from_xml_tree(ET.fromstring(data_source_xml), include_id=False)
    assert ds.users_included_roles == [role1.id]

    # import with name match (role-slug attribute stripped from the payload)
    pub.role_class.wipe()
    role1 = pub.role_class(name='Agents')
    role1.slug = 'agent'
    role1.store()
    ds = NamedDataSource.import_from_xml_tree(
        ET.fromstring(data_source_xml.replace('role-slug="agent"', '')), include_id=False
    )
    assert ds.users_included_roles == [role1.id]

View File

@ -1183,6 +1183,45 @@ def test_sql_criteria_fts(pub):
assert data_class.select([st.FtsMatch(formdata1.id_display)])[0].id_display == formdata1.id_display
def test_search_tokens_purge(pub):
    """Check that purge_obsolete_search_tokens() drops tokens no longer referenced.

    The FTS triggers are append-only, so token counts only grow until the
    purge runs; this test tracks the count relative to a starting baseline.
    """
    _, cur = sql.get_connection_and_cursor()

    # purge garbage from other tests
    sql.purge_obsolete_search_tokens()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    start = cur.fetchone()[0]

    # define a new table; storing the formdef feeds its searchable name
    # into the tokens table (hence +1)
    test_formdef = FormDef()
    test_formdef.name = 'tableSelectFTStokens'
    test_formdef.fields = [fields.StringField(id='3', label='string')]
    test_formdef.store()
    data_class = test_formdef.data_class(mode='sql')

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 1

    # storing a formdata adds its field token (+1)
    t = data_class()
    t.data = {'3': 'foofortokensofcourse'}
    t.just_created()
    t.store()
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 2

    # updating the data adds the new token but keeps the stale one,
    # since the triggers never delete (+1 again)
    t.data = {'3': 'chaussettefortokensofcourse'}
    t.store()
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 3

    # the purge removes the token that is no longer referenced anywhere (-1)
    sql.purge_obsolete_search_tokens()
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == start + 2
def table_exists(cur, table_name):
cur.execute(
'''SELECT COUNT(*) FROM information_schema.tables

View File

@ -717,8 +717,8 @@ class NamedDataSource(XmlStorableObject):
('data_source', 'data_source'),
('notify_on_errors', 'bool'),
('record_on_errors', 'bool'),
('users_included_roles', 'str_list'),
('users_excluded_roles', 'str_list'),
('users_included_roles', 'ds_roles'),
('users_excluded_roles', 'ds_roles'),
('include_disabled_users', 'bool'),
]

View File

@ -485,6 +485,7 @@ class WcsPublisher(QommonPublisher):
for _formdef in FormDef.select() + CardDef.select():
sql.do_formdef_tables(_formdef)
sql.migrate_global_views(conn, cur)
sql.init_search_tokens()
cur.close()
def record_deprecated_usage(self, *args, **kwargs):

View File

@ -1080,6 +1080,15 @@ class FileWithPreviewWidget(CompositeWidget):
'.pif',
'.php',
'.js',
'.pht',
'.phtml',
'.shtml',
'.asa',
'.asax',
'.cer',
'.swf',
'.xap',
'.ps1',
'application/x-ms-dos-executable',
'text/x-php',
]

View File

@ -692,6 +692,11 @@ class QommonPublisher(Publisher):
for error in self.loggederror_class.select(clause=clauses):
self.loggederror_class.remove_object(error.id)
def clean_search_tokens(self, **kwargs):
    """Cron handler: drop obsolete entries from the FTS tokens table.

    The search-token triggers are append-only, so a periodic purge is
    required to make sure no personal data is kept uselessly (see
    wcs.sql.purge_obsolete_search_tokens).
    """
    # local import to avoid a circular dependency at module load time
    from wcs import sql

    sql.purge_obsolete_search_tokens()
@classmethod
def register_cronjobs(cls):
cls.register_cronjob(CronJob(cls.clean_sessions, minutes=[0], name='clean_sessions'))
@ -704,6 +709,9 @@ class QommonPublisher(Publisher):
cls.register_cronjob(
CronJob(cls.clean_loggederrors, hours=[3], minutes=[0], name='clean_loggederrors')
)
cls.register_cronjob(
CronJob(cls.clean_search_tokens, weekdays=[0], hours=[1], minutes=[0], name='clean_search_tokens')
)
_initialized = False

View File

@ -20,7 +20,7 @@ import xml.etree.ElementTree as ET
from quixote import get_publisher
from .misc import indent_xml, xml_node_text
from .storage import Equal, Or, StorableObject
from .storage import Contains, Equal, Or, StorableObject
class XmlStorableObject(StorableObject):
@ -141,6 +141,8 @@ class XmlStorableObject(StorableObject):
def import_roles_from_xml(self, element, include_id=False, **kwargs):
criterias = []
for sub in element:
if sub.tag != 'role':
continue
if include_id and 'role-id' in sub.attrib:
criterias.append(Equal('id', sub.attrib['role-id']))
elif 'role-slug' in sub.attrib:
@ -156,3 +158,22 @@ class XmlStorableObject(StorableObject):
return get_publisher().role_class.select([Or(criterias)], order_by='name')
return lazy_roles
def import_ds_roles_from_xml(self, element, include_id=False, **kwargs):
    """Deserialize a data-source role list, returning the matching role ids.

    Delegates <role> element resolution to import_roles_from_xml(), then
    appends ids from legacy <item>{id}</item> children untouched.
    """
    resolved = self.import_roles_from_xml(element, include_id=include_id, **kwargs)
    if callable(resolved):  # import_roles_from_xml may return a lazy accessor
        resolved = resolved()
    ids = [role.id for role in resolved]
    # legacy support for <item>{id}</item>
    ids.extend(xml_node_text(child) for child in element if child.tag == 'item')
    return ids
def export_ds_roles_to_xml(self, element, attribute_name, include_id=False, **kwargs):
    """Serialize the role ids held in *attribute_name* as <role> sub-elements."""
    role_ids = getattr(self, attribute_name, None) or []
    matching_roles = get_publisher().role_class.select([Contains('id', role_ids)])
    for role in matching_roles:
        node = ET.SubElement(element, 'role')
        # the id is always written, regardless of include_id
        node.attrib['role-id'] = role.id
        node.attrib['role-slug'] = role.slug
        node.text = role.name

View File

@ -96,6 +96,20 @@ SQL_TYPE_MAPPING = {
}
def _table_exists(cur, table_name):
cur.execute('SELECT 1 FROM pg_class WHERE relname = %s', (table_name,))
rows = cur.fetchall()
return len(rows) > 0
def _trigger_exists(cur, table_name, trigger_name):
cur.execute(
'SELECT 1 FROM pg_trigger WHERE tgrelid = %s::regclass AND tgname = %s', (table_name, trigger_name)
)
rows = cur.fetchall()
return len(rows) > 0
class WcsPgConnection(psycopg2.extensions.connection):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -1582,6 +1596,8 @@ def do_global_views(conn, cur):
% (name, category.id)
)
init_search_tokens_triggers(cur)
def clean_global_views(conn, cur):
# Purge of any dead data
@ -1674,11 +1690,178 @@ def init_global_table(conn=None, cur=None):
endpoint_status=endpoint_status_filter,
)
)
init_search_tokens_data(cur)
if own_conn:
cur.close()
def init_search_tokens(conn=None, cur=None):
    """Initialize the search_tokens mechanism.

    It's based on three parts:
     - a token table
     - triggers to feed this table from the tsvectors used in the database
     - a search function that will leverage these tokens to extend the search query.

    So far, the sources used are wcs_all_forms and searchable_formdefs.

    Example: let's say the sources texts are "Tarif d'école" and "La cantine".
    This gives the following tsvectors: ('tarif', 'écol') and ('cantin').
    Our tokens table will have these three words.
    When the search function is launched, it splits the search query and will
    replace unavailable tokens by those close, if available.
    The search query 'tari' will be expanded to 'tarif'.
    The search query 'collège' will remain unchanged (and return nothing).
    If several tokens match or are close enough, the query will be expanded to
    an OR.

    :param conn: optional connection; only used when *cur* is not given.
    :param cur: optional cursor; a new one is opened (and closed) when absent.
    """
    own_cur = False
    if cur is None:
        own_cur = True
        conn, cur = get_connection_and_cursor()

    # Create table
    cur.execute('CREATE TABLE IF NOT EXISTS wcs_search_tokens(token TEXT PRIMARY KEY);')

    # Create triggers
    init_search_tokens_triggers(cur)

    # Fill table
    init_search_tokens_data(cur)

    # Index at the end, small performance trick... not that useful, but it's free...
    # (the trigram index is what makes the fuzzy `%` operator usable)
    cur.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm;')
    cur.execute(
        'CREATE INDEX IF NOT EXISTS wcs_search_tokens_trgm ON wcs_search_tokens USING gin(token gin_trgm_ops);'
    )

    # And last: functions to use this brand new table
    # These two aggregates make the search query far simpler to write
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_or (tsquery) (sfunc=tsquery_or, stype=tsquery);')
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_and (tsquery) (sfunc=tsquery_and, stype=tsquery);')
    cur.execute(
        r"""CREATE OR REPLACE FUNCTION public.wcs_tsquery(text)
 RETURNS tsquery
 LANGUAGE sql
 STABLE
AS $function$
WITH
  tokenized AS (SELECT unnest(regexp_split_to_array($1, '\s+')) w),
  super_tokenized AS (
    -- perfect: tokens that are found as is in table, thus no OR required
    -- partial: tokens found using distance search on tokens table (note: numbers are excluded here)
    -- otherwise: token as is and likely no search result later
    SELECT w,
           coalesce((select plainto_tsquery(perfect.token) FROM wcs_search_tokens AS perfect WHERE perfect.token = plainto_tsquery(w)::text),
                    tsquery_agg_or(plainto_tsquery(partial.token) order by partial.token <-> w desc),
                    plainto_tsquery(w)) tokens
    FROM tokenized
    LEFT JOIN wcs_search_tokens AS partial ON partial.token % w AND w not similar to '%[0-9]{2,}%'
    GROUP BY w)
SELECT tsquery_agg_and(tokens) FROM super_tokenized;
$function$;"""
    )
    if own_cur:
        cur.close()
def init_search_tokens_triggers(cur):
    """Install the append-only triggers that feed wcs_search_tokens.

    We define only appending triggers, ie on INSERT and UPDATE.
    It would be far heavier to maintain deletions here, and having extra data has
    no or marginal side effect on search performances, and absolutely no impact
    on search results.
    Instead, a weekly cron job will delete obsolete entries, thus making it sure no
    personal data is kept uselessly.
    """
    # First part: the appending function (shared by all triggers below)
    cur.execute(
        """CREATE OR REPLACE FUNCTION wcs_search_tokens_trigger_fn ()
 RETURNS trigger
 LANGUAGE plpgsql
AS $function$
BEGIN
    INSERT INTO wcs_search_tokens SELECT unnest(tsvector_to_array(NEW.fts)) ON CONFLICT(token) DO NOTHING;
    RETURN NEW;
END;
$function$;"""
    )

    if not (_table_exists(cur, 'wcs_search_tokens')):
        # abort trigger creation if tokens table doesn't exist yet
        return

    # NOTE: only the *_upd trigger is probed; the _ins trigger is assumed to
    # have been created at the same time
    if _table_exists(cur, 'wcs_all_forms') and not _trigger_exists(
        cur, 'wcs_all_forms', 'wcs_all_forms_fts_trg_upd'
    ):
        # Second part: insert and update triggers for wcs_all_forms
        cur.execute(
            """CREATE TRIGGER wcs_all_forms_fts_trg_ins
         AFTER INSERT ON wcs_all_forms
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
        cur.execute(
            """CREATE TRIGGER wcs_all_forms_fts_trg_upd
         AFTER UPDATE OF fts ON wcs_all_forms
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )

    if _table_exists(cur, 'searchable_formdefs') and not _trigger_exists(
        cur, 'searchable_formdefs', 'searchable_formdefs_fts_trg_upd'
    ):
        # Third part: insert and update triggers for searchable_formdefs
        cur.execute(
            """CREATE TRIGGER searchable_formdefs_fts_trg_ins
         AFTER INSERT ON searchable_formdefs
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
        cur.execute(
            """CREATE TRIGGER searchable_formdefs_fts_trg_upd
         AFTER UPDATE OF fts ON searchable_formdefs
         FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
         EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
        )
def init_search_tokens_data(cur):
    """Seed wcs_search_tokens from the tsvectors already present in the
    source tables (wcs_all_forms and searchable_formdefs).

    Duplicates are ignored thanks to the ON CONFLICT clause, so this is
    safe to run repeatedly.
    """
    if not (_table_exists(cur, 'wcs_search_tokens')):
        # abort table data initialization if tokens table doesn't exist yet
        return

    if _table_exists(cur, 'wcs_all_forms'):
        cur.execute(
            """INSERT INTO wcs_search_tokens
                 SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms
                 ON CONFLICT(token) DO NOTHING;"""
        )
    if _table_exists(cur, 'searchable_formdefs'):
        cur.execute(
            """INSERT INTO wcs_search_tokens
                 SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs
                 ON CONFLICT(token) DO NOTHING;"""
        )
def purge_obsolete_search_tokens(cur=None):
    """Remove tokens that no longer appear in any FTS source table.

    The triggers feeding wcs_search_tokens are append-only; this function is
    run periodically (weekly cron) to drop entries absent from both sources,
    so no personal data is kept uselessly.

    :param cur: optional database cursor; a new one is opened (and closed)
                when not provided.
    """
    own_cur = False
    if cur is None:
        own_cur = True
        _, cur = get_connection_and_cursor()
    # BUGFIX: the second subquery used to check wcs_all_forms twice instead of
    # searchable_formdefs; a token must survive if it is still referenced by
    # EITHER source (cf. init_search_tokens docstring listing both sources).
    cur.execute(
        """DELETE FROM wcs_search_tokens
            WHERE token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms)
              AND token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs);"""
    )
    if own_cur:
        cur.close()
class SqlMixin:
_table_name = None
_numerical_id = True
@ -4809,7 +4992,6 @@ class SearchableFormDef(SqlMixin):
% (cls._table_name, cls._table_name)
)
cls.do_indexes(cur)
cur.close()
from wcs.carddef import CardDef
from wcs.formdef import FormDef
@ -4818,6 +5000,8 @@ class SearchableFormDef(SqlMixin):
CardDef.select(ignore_errors=True), FormDef.select(ignore_errors=True)
):
cls.update(obj=objectdef)
init_search_tokens(cur)
cur.close()
@classmethod
def update(cls, obj=None, removed_obj_type=None, removed_obj_id=None):
@ -4855,7 +5039,7 @@ class SearchableFormDef(SqlMixin):
def search(cls, obj_type, string):
_, cur = get_connection_and_cursor()
cur.execute(
'SELECT object_id FROM searchable_formdefs WHERE fts @@ plainto_tsquery(%s)',
'SELECT object_id FROM searchable_formdefs WHERE fts @@ wcs_tsquery(%s)',
(FtsMatch.get_fts_value(string),),
)
ids = [x[0] for x in cur.fetchall()]
@ -5120,7 +5304,7 @@ def get_period_total(
# latest migration, number + description (description is not used
# programmaticaly but will make sure git conflicts if two migrations are
# separately added with the same number)
SQL_LEVEL = (106, 'add context column to logged_errors table')
SQL_LEVEL = (107, 'new fts mechanism with tokens table')
def migrate_global_views(conn, cur):
@ -5454,6 +5638,10 @@ def migrate():
for formdef in FormDef.select() + CardDef.select():
do_formdef_tables(formdef, rebuild_views=False, rebuild_global_views=False)
if sql_level < 107:
# 107: new fts mechanism with tokens table
init_search_tokens()
if sql_level != SQL_LEVEL[0]:
cur.execute(
'''UPDATE wcs_meta SET value = %s, updated_at=NOW() WHERE key = %s''',

View File

@ -379,6 +379,11 @@ class FtsMatch(Criteria):
return 'fts @@ plainto_tsquery(%%(c%s)s)' % id(self.value)
class WcsFtsMatch(FtsMatch):
    # Variant of FtsMatch whose SQL uses the wcs_tsquery() database function
    # instead of plainto_tsquery(), so query tokens get expanded with fuzzy
    # matches from the wcs_search_tokens table.
    def as_sql(self):
        # id(self.value) mirrors the parameter-naming scheme used by FtsMatch
        return 'fts @@ wcs_tsquery(%%(c%s)s)' % id(self.value)
class ElementEqual(Criteria):
def __init__(self, attribute, key, value, **kwargs):
super().__init__(attribute, value)