Compare commits
2 Commits
ac48ebb70d
...
fcdc2581ca
Author | SHA1 | Date |
---|---|---|
Pierre Ducroquet | fcdc2581ca | |
Pierre Ducroquet | 53e3caf83b |
|
@ -1183,6 +1183,45 @@ def test_sql_criteria_fts(pub):
|
||||||
assert data_class.select([st.FtsMatch(formdata1.id_display)])[0].id_display == formdata1.id_display
|
assert data_class.select([st.FtsMatch(formdata1.id_display)])[0].id_display == formdata1.id_display
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_tokens_purge(pub):
    """Check that obsolete entries in wcs_search_tokens are removed by the purge."""
    _, cur = sql.get_connection_and_cursor()

    # purge garbage left over from other tests
    sql.purge_obsolete_search_tokens()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    initial_count = cur.fetchone()[0]

    # define a new table
    formdef = FormDef()
    formdef.name = 'tableSelectFTStokens'
    formdef.fields = [fields.StringField(id='3', label='string')]
    formdef.store()
    data_class = formdef.data_class(mode='sql')

    # storing the formdef adds one token (presumably from its indexed name
    # via the searchable_formdefs trigger — confirm against the triggers)
    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == initial_count + 1

    formdata = data_class()
    formdata.data = {'3': 'foofortokensofcourse'}
    formdata.just_created()
    formdata.store()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == initial_count + 2

    # overwrite the field value; the old token is now obsolete but triggers
    # never delete, so the count keeps growing
    formdata.data = {'3': 'chaussettefortokensofcourse'}
    formdata.store()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == initial_count + 3

    # the purge drops the token made obsolete by the update above
    sql.purge_obsolete_search_tokens()

    cur.execute('SELECT count(*) FROM wcs_search_tokens;')
    assert cur.fetchone()[0] == initial_count + 2
||||||
def table_exists(cur, table_name):
|
def table_exists(cur, table_name):
|
||||||
cur.execute(
|
cur.execute(
|
||||||
'''SELECT COUNT(*) FROM information_schema.tables
|
'''SELECT COUNT(*) FROM information_schema.tables
|
||||||
|
|
|
@ -485,6 +485,7 @@ class WcsPublisher(QommonPublisher):
|
||||||
for _formdef in FormDef.select() + CardDef.select():
|
for _formdef in FormDef.select() + CardDef.select():
|
||||||
sql.do_formdef_tables(_formdef)
|
sql.do_formdef_tables(_formdef)
|
||||||
sql.migrate_global_views(conn, cur)
|
sql.migrate_global_views(conn, cur)
|
||||||
|
sql.init_search_tokens()
|
||||||
cur.close()
|
cur.close()
|
||||||
|
|
||||||
def record_deprecated_usage(self, *args, **kwargs):
|
def record_deprecated_usage(self, *args, **kwargs):
|
||||||
|
|
|
@ -692,6 +692,11 @@ class QommonPublisher(Publisher):
|
||||||
for error in self.loggederror_class.select(clause=clauses):
|
for error in self.loggederror_class.select(clause=clauses):
|
||||||
self.loggederror_class.remove_object(error.id)
|
self.loggederror_class.remove_object(error.id)
|
||||||
|
|
||||||
|
def clean_search_tokens(self, **kwargs):
    """Cron handler: purge search tokens no longer referenced by any fts source."""
    # local import, following the file's deferred-import style
    from wcs import sql

    sql.purge_obsolete_search_tokens()
|
||||||
@classmethod
|
@classmethod
|
||||||
def register_cronjobs(cls):
|
def register_cronjobs(cls):
|
||||||
cls.register_cronjob(CronJob(cls.clean_sessions, minutes=[0], name='clean_sessions'))
|
cls.register_cronjob(CronJob(cls.clean_sessions, minutes=[0], name='clean_sessions'))
|
||||||
|
@ -704,6 +709,9 @@ class QommonPublisher(Publisher):
|
||||||
cls.register_cronjob(
|
cls.register_cronjob(
|
||||||
CronJob(cls.clean_loggederrors, hours=[3], minutes=[0], name='clean_loggederrors')
|
CronJob(cls.clean_loggederrors, hours=[3], minutes=[0], name='clean_loggederrors')
|
||||||
)
|
)
|
||||||
|
cls.register_cronjob(
|
||||||
|
CronJob(cls.clean_search_tokens, weekdays=[0], hours=[1], minutes=[0], name='clean_search_tokens')
|
||||||
|
)
|
||||||
|
|
||||||
_initialized = False
|
_initialized = False
|
||||||
|
|
||||||
|
|
198
wcs/sql.py
198
wcs/sql.py
|
@ -96,6 +96,20 @@ SQL_TYPE_MAPPING = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _table_exists(cur, table_name):
|
||||||
|
cur.execute('SELECT 1 FROM pg_class WHERE relname = %s', (table_name,))
|
||||||
|
rows = cur.fetchall()
|
||||||
|
return len(rows) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def _trigger_exists(cur, table_name, trigger_name):
|
||||||
|
cur.execute(
|
||||||
|
'SELECT 1 FROM pg_trigger WHERE tgrelid = %s::regclass AND tgname = %s', (table_name, trigger_name)
|
||||||
|
)
|
||||||
|
rows = cur.fetchall()
|
||||||
|
return len(rows) > 0
|
||||||
|
|
||||||
|
|
||||||
class WcsPgConnection(psycopg2.extensions.connection):
|
class WcsPgConnection(psycopg2.extensions.connection):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
@ -1582,6 +1596,8 @@ def do_global_views(conn, cur):
|
||||||
% (name, category.id)
|
% (name, category.id)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
init_search_tokens_triggers(cur)
|
||||||
|
|
||||||
|
|
||||||
def clean_global_views(conn, cur):
|
def clean_global_views(conn, cur):
|
||||||
# Purge of any dead data
|
# Purge of any dead data
|
||||||
|
@ -1674,11 +1690,182 @@ def init_global_table(conn=None, cur=None):
|
||||||
endpoint_status=endpoint_status_filter,
|
endpoint_status=endpoint_status_filter,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
init_search_tokens_data(cur)
|
||||||
|
|
||||||
if own_conn:
|
if own_conn:
|
||||||
cur.close()
|
cur.close()
|
||||||
|
|
||||||
|
|
||||||
|
def init_search_tokens(conn=None, cur=None):
    """Initialize the search_tokens mechanism.

    It's based on three parts:
    - a token table
    - triggers to feed this table from the tsvectors used in the database
    - a search function that will leverage these tokens to extend the search query.

    So far, the sources used are wcs_all_forms and searchable_formdefs.

    Example: let's say the source texts are "Tarif d'école" and "La cantine".
    This gives the following tsvectors: ('tarif', 'écol') and ('cantin').
    Our tokens table will hold these three words.
    When the search function is launched, it splits the search query and
    replaces unavailable tokens by close ones, if available.
    The search query 'tari' will be expanded to 'tarif'.
    The search query 'collège' will remain unchanged (and return nothing).
    If several tokens match or are close enough, the query will be expanded
    to an OR.
    """
    cursor_is_local = False
    if cur is None:
        cursor_is_local = True
        conn, cur = get_connection_and_cursor()

    # The tokens table itself.
    cur.execute('CREATE TABLE IF NOT EXISTS wcs_search_tokens(token TEXT PRIMARY KEY);')

    # Triggers feeding the table from the fts source tables.
    init_search_tokens_triggers(cur)

    # Initial fill from the data already present.
    cur.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm;')
    init_search_tokens_data(cur)

    # Index created after the initial fill; a small, mostly free, performance trick.
    cur.execute(
        'CREATE INDEX IF NOT EXISTS wcs_search_tokens_trgm ON wcs_search_tokens USING gin(token gin_trgm_ops);'
    )

    # And last: functions to use this brand new table.
    # These two aggregates make the search query far simpler to write, allowing an
    # OR/AND of search terms to be written directly as an SQL aggregation.
    # They use the tsquery_or and tsquery_and functions that are included in
    # PostgreSQL since 8.3, but documented under their operator names || and &&.
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_or (tsquery) (sfunc=tsquery_or, stype=tsquery);')
    cur.execute('CREATE OR REPLACE AGGREGATE tsquery_agg_and (tsquery) (sfunc=tsquery_and, stype=tsquery);')
    cur.execute(
        r"""CREATE OR REPLACE FUNCTION public.wcs_tsquery(text)
 RETURNS tsquery
 LANGUAGE sql
 STABLE
AS $function$
WITH
  tokenized AS (SELECT unnest(regexp_split_to_array($1, '\s+')) word),
  super_tokenized AS (
    -- perfect: tokens that are found as is in table, thus no OR required
    -- partial: tokens found using distance search on tokens table (note: numbers are excluded here)
    --          distance search is done using pg_trgm, https://www.postgresql.org/docs/current/pgtrgm.html
    -- otherwise: token as is and likely no search result later
    SELECT word,
           coalesce((select plainto_tsquery(perfect.token) FROM wcs_search_tokens AS perfect WHERE perfect.token = plainto_tsquery(word)::text),
                    tsquery_agg_or(plainto_tsquery(partial.token)),
                    plainto_tsquery(word)) AS tokens
      FROM tokenized
      LEFT JOIN wcs_search_tokens AS partial ON partial.token % plainto_tsquery(word) AND word not similar to '%[0-9]{2,}%'
     GROUP BY word)
SELECT tsquery_agg_and(tokens) FROM super_tokenized;
$function$;"""
    )

    if cursor_is_local:
        cur.close()
||||||
|
|
||||||
|
def init_search_tokens_triggers(cur):
    """Install the trigger function and the per-table triggers feeding wcs_search_tokens."""
    # Only appending triggers are defined, i.e. on INSERT and UPDATE.
    # Maintaining deletions here would be far heavier, and keeping extra data
    # has no or marginal side effect on search performance, and absolutely no
    # impact on search results.
    # Instead, a weekly cron job deletes obsolete entries, making sure no
    # personal data is kept uselessly.

    # First part: the appending function.
    cur.execute(
        """CREATE OR REPLACE FUNCTION wcs_search_tokens_trigger_fn ()
RETURNS trigger
LANGUAGE plpgsql
AS $function$
BEGIN
    INSERT INTO wcs_search_tokens SELECT unnest(tsvector_to_array(NEW.fts)) ON CONFLICT(token) DO NOTHING;
    RETURN NEW;
END;
$function$;"""
    )

    if not _table_exists(cur, 'wcs_search_tokens'):
        # abort trigger creation if tokens table doesn't exist yet
        return

    # Second and third parts: insert and update triggers on each fts source
    # table, skipped when the table is missing or already wired up.
    for source_table in ('wcs_all_forms', 'searchable_formdefs'):
        if not _table_exists(cur, source_table):
            continue
        if _trigger_exists(cur, source_table, '%s_fts_trg_upd' % source_table):
            continue
        cur.execute(
            """CREATE TRIGGER %(table)s_fts_trg_ins
AFTER INSERT ON %(table)s
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
            % {'table': source_table}
        )
        cur.execute(
            """CREATE TRIGGER %(table)s_fts_trg_upd
AFTER UPDATE OF fts ON %(table)s
FOR EACH ROW WHEN (NEW.fts IS NOT NULL)
EXECUTE PROCEDURE wcs_search_tokens_trigger_fn();"""
            % {'table': source_table}
        )
||||||
|
|
||||||
|
def init_search_tokens_data(cur):
    """Seed wcs_search_tokens from the fts columns of the existing source tables."""
    if not _table_exists(cur, 'wcs_search_tokens'):
        # abort data initialization if tokens table doesn't exist yet
        return

    for source_table in ('wcs_all_forms', 'searchable_formdefs'):
        if not _table_exists(cur, source_table):
            continue
        # ON CONFLICT: the two sources may share tokens, keep the table a set
        cur.execute(
            """INSERT INTO wcs_search_tokens
SELECT unnest(tsvector_to_array(fts)) FROM %s
ON CONFLICT(token) DO NOTHING;"""
            % source_table
        )
|
||||||
|
|
||||||
|
def purge_obsolete_search_tokens(cur=None):
    """Delete tokens no longer referenced by any fts source table.

    The triggers only ever append tokens; this purge (run from a weekly cron
    job) removes entries absent from both wcs_all_forms and
    searchable_formdefs, so no personal data is kept uselessly.

    :param cur: optional database cursor; when omitted, one is obtained from
        get_connection_and_cursor() and closed before returning.
    """
    own_cur = False
    if cur is None:
        own_cur = True
        _, cur = get_connection_and_cursor()

    # Fix: the second NOT IN clause used to query wcs_all_forms twice; tokens
    # still referenced only by searchable_formdefs were then wrongly deleted.
    cur.execute(
        """DELETE FROM wcs_search_tokens
WHERE token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM wcs_all_forms)
  AND token NOT IN (SELECT unnest(tsvector_to_array(fts)) FROM searchable_formdefs);"""
    )
    if own_cur:
        cur.close()
||||||
|
|
||||||
class SqlMixin:
|
class SqlMixin:
|
||||||
_table_name = None
|
_table_name = None
|
||||||
_numerical_id = True
|
_numerical_id = True
|
||||||
|
@ -4811,7 +4998,6 @@ class SearchableFormDef(SqlMixin):
|
||||||
% (cls._table_name, cls._table_name)
|
% (cls._table_name, cls._table_name)
|
||||||
)
|
)
|
||||||
cls.do_indexes(cur)
|
cls.do_indexes(cur)
|
||||||
cur.close()
|
|
||||||
|
|
||||||
from wcs.carddef import CardDef
|
from wcs.carddef import CardDef
|
||||||
from wcs.formdef import FormDef
|
from wcs.formdef import FormDef
|
||||||
|
@ -4820,6 +5006,8 @@ class SearchableFormDef(SqlMixin):
|
||||||
CardDef.select(ignore_errors=True), FormDef.select(ignore_errors=True)
|
CardDef.select(ignore_errors=True), FormDef.select(ignore_errors=True)
|
||||||
):
|
):
|
||||||
cls.update(obj=objectdef)
|
cls.update(obj=objectdef)
|
||||||
|
init_search_tokens(cur)
|
||||||
|
cur.close()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def update(cls, obj=None, removed_obj_type=None, removed_obj_id=None):
|
def update(cls, obj=None, removed_obj_type=None, removed_obj_id=None):
|
||||||
|
@ -4857,7 +5045,7 @@ class SearchableFormDef(SqlMixin):
|
||||||
def search(cls, obj_type, string):
|
def search(cls, obj_type, string):
|
||||||
_, cur = get_connection_and_cursor()
|
_, cur = get_connection_and_cursor()
|
||||||
cur.execute(
|
cur.execute(
|
||||||
'SELECT object_id FROM searchable_formdefs WHERE fts @@ plainto_tsquery(%s)',
|
'SELECT object_id FROM searchable_formdefs WHERE fts @@ wcs_tsquery(%s)',
|
||||||
(FtsMatch.get_fts_value(string),),
|
(FtsMatch.get_fts_value(string),),
|
||||||
)
|
)
|
||||||
ids = [x[0] for x in cur.fetchall()]
|
ids = [x[0] for x in cur.fetchall()]
|
||||||
|
@ -5122,7 +5310,7 @@ def get_period_total(
|
||||||
# latest migration, number + description (description is not used
|
# latest migration, number + description (description is not used
|
||||||
# programmaticaly but will make sure git conflicts if two migrations are
|
# programmaticaly but will make sure git conflicts if two migrations are
|
||||||
# separately added with the same number)
|
# separately added with the same number)
|
||||||
SQL_LEVEL = (106, 'add context column to logged_errors table')
|
SQL_LEVEL = (107, 'new fts mechanism with tokens table')
|
||||||
|
|
||||||
|
|
||||||
def migrate_global_views(conn, cur):
|
def migrate_global_views(conn, cur):
|
||||||
|
@ -5456,6 +5644,10 @@ def migrate():
|
||||||
for formdef in FormDef.select() + CardDef.select():
|
for formdef in FormDef.select() + CardDef.select():
|
||||||
do_formdef_tables(formdef, rebuild_views=False, rebuild_global_views=False)
|
do_formdef_tables(formdef, rebuild_views=False, rebuild_global_views=False)
|
||||||
|
|
||||||
|
if sql_level < 107:
|
||||||
|
# 107: new fts mechanism with tokens table
|
||||||
|
init_search_tokens()
|
||||||
|
|
||||||
if sql_level != SQL_LEVEL[0]:
|
if sql_level != SQL_LEVEL[0]:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
'''UPDATE wcs_meta SET value = %s, updated_at=NOW() WHERE key = %s''',
|
'''UPDATE wcs_meta SET value = %s, updated_at=NOW() WHERE key = %s''',
|
||||||
|
|
|
@ -379,6 +379,11 @@ class FtsMatch(Criteria):
|
||||||
return 'fts @@ plainto_tsquery(%%(c%s)s)' % id(self.value)
|
return 'fts @@ plainto_tsquery(%%(c%s)s)' % id(self.value)
|
||||||
|
|
||||||
|
|
||||||
|
class WcsFtsMatch(FtsMatch):
    """FtsMatch variant whose SQL goes through the wcs_tsquery() function,
    which expands search words using the wcs_search_tokens table."""

    def as_sql(self):
        # same placeholder naming scheme as FtsMatch.as_sql; only the SQL
        # query function differs
        placeholder_id = id(self.value)
        return 'fts @@ wcs_tsquery(%%(c%s)s)' % placeholder_id
||||||
|
|
||||||
class ElementEqual(Criteria):
|
class ElementEqual(Criteria):
|
||||||
def __init__(self, attribute, key, value, **kwargs):
|
def __init__(self, attribute, key, value, **kwargs):
|
||||||
super().__init__(attribute, value)
|
super().__init__(attribute, value)
|
||||||
|
|
Loading…
Reference in New Issue