custom_user: use strict_word_similarity in find_duplicates (#80940)
gitea/authentic/pipeline/head This commit looks good Details

This commit is contained in:
Benjamin Dauvergne 2023-09-21 17:58:59 +02:00
parent 1f980ae638
commit 7935236510
4 changed files with 70 additions and 4 deletions

View File

@ -16,6 +16,9 @@
from django.apps import AppConfig
from django.db import DEFAULT_DB_ALIAS, router
from django.db.models import CharField, TextField
from .postgres_utils import TrigramStrictWordSimilar
class CustomUserConfig(AppConfig):
@ -27,6 +30,10 @@ class CustomUserConfig(AppConfig):
post_migrate.connect(self.create_first_name_last_name_attributes, sender=self)
# register custom postgres ORM lookup
CharField.register_lookup(TrigramStrictWordSimilar)
TextField.register_lookup(TrigramStrictWordSimilar)
def create_first_name_last_name_attributes(
self, app_config, verbosity=2, interactive=True, using=DEFAULT_DB_ALIAS, **kwargs
):

View File

@ -34,6 +34,8 @@ from authentic2.models import AttributeValue
from authentic2.utils.date import parse_date
from authentic2.utils.lookups import ImmutableConcat, Unaccent
from .postgres_utils import TrigramStrictWordDistance
class UserQuerySet(models.QuerySet):
def free_text_search(self, search):
@ -131,7 +133,9 @@ class UserQuerySet(models.QuerySet):
def find_duplicates(
self, first_name=None, last_name=None, fullname=None, birthdate=None, limit=5, threshold=None, ou=None
):
self.set_trigram_similarity_threshold(threshold=threshold or app_settings.A2_DUPLICATES_THRESHOLD)
self.set_trigram_strict_word_similarity_threshold(
threshold=threshold or app_settings.A2_DUPLICATES_THRESHOLD
)
if fullname is not None:
name = fullname
@ -141,8 +145,8 @@ class UserQuerySet(models.QuerySet):
name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii').lower()
qs = self.annotate(name=Lower(Unaccent(ImmutableConcat('first_name', Value(' '), 'last_name'))))
qs = qs.filter(name__trigram_similar=name)
qs = qs.annotate(dist=TrigramDistance('name', name))
qs = qs.filter(name__trigram_strict_word_similar=name)
qs = qs.annotate(dist=TrigramStrictWordDistance('name', name))
qs = qs.order_by('dist')
if ou:
@ -169,6 +173,13 @@ class UserQuerySet(models.QuerySet):
return qs
def set_trigram_strict_word_similarity_threshold(self, threshold=None):
    """Issue a ``SET pg_trgm.strict_word_similarity_threshold`` statement.

    The statement is executed through a cursor obtained from ``connection``.
    When ``threshold`` is falsy (``None`` or ``0``), the value of
    ``app_settings.A2_FTS_THRESHOLD`` is used instead.
    """
    with connection.cursor() as cursor:
        value = threshold or app_settings.A2_FTS_THRESHOLD
        # %f only accepts numbers, so nothing but a float literal can end
        # up in the SQL text.
        statement = 'SET pg_trgm.strict_word_similarity_threshold = %f' % value
        cursor.execute(statement)
def set_trigram_similarity_threshold(self, threshold=None):
with connection.cursor() as cursor:
cursor.execute(

View File

@ -0,0 +1,14 @@
# authentic2 - (C) Entr'ouvert
from django.contrib.postgres.search import TrigramBase
from django.db.models.lookups import PostgresOperatorLookup
class TrigramStrictWordSimilar(PostgresOperatorLookup):
    """Lookup named ``trigram_strict_word_similar`` using the ``%>>`` operator."""

    # NOTE(review): '%' is presumably doubled so that the database driver's
    # parameter interpolation yields a literal '%' -- confirm.
    postgres_operator = '%%>>'
    lookup_name = 'trigram_strict_word_similar'
class TrigramStrictWordDistance(TrigramBase):
    """Expression whose two arguments are joined by the ``<->>>`` operator."""

    # The operator goes between the arguments instead of a function name
    # going in front of them, hence the empty ``function``.
    arg_joiner = ' <->>> '
    function = ''

View File

@ -163,10 +163,44 @@ def test_fts_trigram(fts):
# dist attribute signals queryset from find_duplicates()
assert hasattr(User.objects.free_text_search('darmettein')[0], 'dist')
assert User.objects.free_text_search('lea darmettein').filter(dist__lte=0.3).count() == 1
assert list(
User.objects.free_text_search('lea darmettein')
.filter(dist=0.0)
.values_list('last_name', 'first_name')
) == [('darmettein', 'Lea')]
assert hasattr(User.objects.free_text_search('darmettein')[0], 'dist')
def test_fts_last_name(db):
    """Searching 'rosset' lists every ROSSET user with a distance of 0.0.

    Eight users share the last name ROSSET; a ninth one (RUSSO, Rossetta)
    only has a look-alike first name and is absent from the expected rows.
    """
    for first_name in (
        'Albert',
        'Michel',
        'Nicole',
        'Sylviane',
        'Jean-Pierre',
        'JEAN PIERRE',
        'Jean-Claude',
        'Jeanine',
    ):
        User.objects.create(last_name='ROSSET', first_name=first_name)
    User.objects.create(last_name='RUSSO', first_name='Rossetta')

    rows = (
        User.objects.free_text_search('rosset')
        .filter(dist__lt=0.2)
        .values_list('last_name', 'first_name', 'dist')
    )
    # Expected first names, in the order the queryset is expected to return them.
    expected_first_names = [
        'Albert',
        'Jean-Claude',
        'Jeanine',
        'Jean-Pierre',
        'JEAN PIERRE',
        'Michel',
        'Nicole',
        'Sylviane',
    ]
    assert list(rows) == [('ROSSET', first_name, 0.0) for first_name in expected_first_names]
def test_fts_legacy(fts):
assert User.objects.free_text_search('rue des peupliers').count() == 3