nanterre: ajoute une méthode de recherche spécifique pour les doublons (fixes #22330)
This commit is contained in:
parent
ef8a4565d6
commit
774610685f
|
@ -1,5 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import datetime
|
||||
import csv
|
||||
|
||||
import pytest
|
||||
import django_webtest
|
||||
|
@ -281,3 +282,31 @@ def app(request):
|
|||
wtm._patch_settings()
|
||||
request.addfinalizer(wtm._unpatch_settings)
|
||||
return django_webtest.DjangoTestApp(extra_environ={'HTTP_HOST': 'localhost'})
|
||||
|
||||
|
||||
@pytest.fixture
def lot_of_names(rsu_schema):
    """Populate the RSU schema with individuals from a private CSV export.

    Each row of tests/export-etat-civil.csv is expected to hold:
    genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage,
    date_de_naissance.  The file contains personal data and is not shipped
    publicly, so the test is skipped when it is absent.
    """
    try:
        # open() is kept separate from the reader so that a missing file
        # skips the test instead of failing it.
        csv_file = open('tests/export-etat-civil.csv')
    except IOError:
        pytest.skip()

    # Fix: the original leaked the file handle; close it deterministically
    # once bulk_create() has fully consumed the generator.
    with csv_file:
        reader = csv.reader(csv_file)

        def generate():
            # All created entities share a single creation transaction.
            tr = Transaction.objects.create()
            for genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage, date_de_naissance in reader:
                yield Entity(
                    schema=rsu_schema['individu'],
                    created=tr,
                    content={
                        'genre': genre,
                        'prenoms': prenoms,
                        'nom_de_naissance': nom_de_naissance,
                        'nom_d_usage': nom_d_usage,
                        'date_de_naissance': date_de_naissance,
                        'telephones': [],
                        'statut_legal': statut_legal,
                        'email': '',
                        'cles_de_federation': {},
                    })

        Entity.objects.bulk_create(generate())
|
||||
|
|
|
@ -2,9 +2,10 @@ import urlparse
|
|||
import copy
|
||||
|
||||
from django.core.urlresolvers import reverse
|
||||
from django.core.management import call_command
|
||||
|
||||
from zoo.zoo_nanterre.models import Duplicate
|
||||
from zoo.zoo_data.models import Log
|
||||
from zoo.zoo_data.models import Log, Entity
|
||||
|
||||
|
||||
def test_list_doublons(nanterre_classic_family, app):
|
||||
|
@ -156,3 +157,11 @@ def test_list_doublons(nanterre_classic_family, app):
|
|||
log = Log.objects.filter(entity_id=third_data['individu_2']['id']).latest('id')
|
||||
assert 'non doublon de' in log.content['text']
|
||||
assert log.content['meta']['form_id'] == 103
|
||||
|
||||
|
||||
def test_doublons_cmd(lot_of_names):
    """Run the rsu-duplicates management command and sanity-check its output.

    Fix: removed the committed debugging residue
    (``import pdb; pdb.set_trace()``), which would hang any CI run by
    dropping into an interactive debugger.
    """
    call_command('rsu-duplicates', 'find')
    # Heuristic upper bound: fewer than one duplicate per five individuals;
    # more than that would mean the matcher is far too eager on real-world
    # name data.
    assert Duplicate.objects.count() < (Entity.objects.filter(schema__slug='individu').count() / 5)
    # 'list' must at least run without raising.
    call_command('rsu-duplicates', 'list')
|
||||
|
|
|
@ -5,8 +5,6 @@ import json
|
|||
|
||||
import pytest
|
||||
|
||||
from zoo.zoo_data.models import Entity, Transaction
|
||||
|
||||
# Ces tests ne sont pas exécutables publiquement car ils nécessitent des
|
||||
# données personnelles pour être efficaces:
|
||||
# Pour générer ces données faire dans psql:
|
||||
|
@ -32,34 +30,6 @@ from zoo.zoo_data.models import Entity, Transaction
|
|||
# En l'absence de ces fichiers les tests sont ignorés.
|
||||
|
||||
|
||||
@pytest.fixture
def lot_of_names(rsu_schema):
    # Populate the RSU schema with individuals taken from a private CSV
    # export.  Each row holds: genre, statut_legal, prenoms,
    # nom_de_naissance, nom_d_usage, date_de_naissance.
    try:
        reader = csv.reader(open('tests/export-etat-civil.csv'))
    except IOError:
        # The export contains personal data and is not distributed with the
        # sources (see the comment block above); skip when it is absent.
        pytest.skip()

    def generate():
        # All created entities share a single creation transaction.
        tr = Transaction.objects.create()
        for genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage, date_de_naissance in reader:
            yield Entity(
                schema=rsu_schema['individu'],
                created=tr,
                content={
                    'genre': genre,
                    'prenoms': prenoms,
                    'nom_de_naissance': nom_de_naissance,
                    'nom_d_usage': nom_d_usage,
                    'date_de_naissance': date_de_naissance,
                    'telephones': [],
                    'statut_legal': statut_legal,
                    'email': '',
                    'cles_de_federation': {},
                })

    # NOTE(review): the file handle opened above is never closed explicitly;
    # it is released only when the reader is garbage-collected.
    Entity.objects.bulk_create(generate())
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def basic_searches():
|
||||
try:
|
||||
|
|
|
@ -57,10 +57,7 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
|
|||
continue
|
||||
# search for duplicate based on the name
|
||||
s = search.copy()
|
||||
if first.content['nom_de_naissance']:
|
||||
s = search.search_name(u'%s %s' % (first.content['prenoms'], first.content['nom_de_naissance']))
|
||||
if first.content['nom_d_usage']:
|
||||
s = s.search_name(u'%s %s' % (first.content['prenoms'], first.content['nom_d_usage']))
|
||||
s = s.search_individu(first)
|
||||
if first.content.get('date_de_naissance'):
|
||||
s = s.search_birthdate(
|
||||
datetime.datetime.strptime(
|
||||
|
|
|
@ -355,6 +355,53 @@ class PersonSearch(object):
|
|||
|
||||
return self.search_name(fullname)
|
||||
|
||||
    def search_individu(self, individu, factor=1.0, first_name_weight=1.0, last_name_weight=1.0):
        """Restrict the search to potential duplicates of *individu*.

        Builds a normalized-name filter and, for each present name field,
        registers trigram-similarity expressions (weighted mix of first-name
        and last-name similarity) used later for scoring.  Returns a new
        PersonSearch; the receiver is not mutated (works on a deepcopy).

        NOTE(review): ``factor`` is accepted but never used in this body —
        confirm whether it should feed into the similarity weighting.
        """
        # Work on a copy so chained search_*() calls stay side-effect free.
        self = copy.deepcopy(self)

        prenoms = individu.content['prenoms']
        nom_de_naissance = individu.content['nom_de_naissance']
        nom_d_usage = individu.content['nom_d_usage']

        # Create the simple filter: similarity = 1 - trigram distance between
        # the normalized stored field and the normalized query value.
        prenoms_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'prenoms')), self.luv(prenoms))
        nn_nn_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'nom_de_naissance')), self.luv(nom_de_naissance))
        nu_nu_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'nom_d_usage')), self.luv(nom_d_usage))
        # Cross comparisons: birth name against usage name and vice versa, so
        # a person recorded under either field can still match.
        nn_nu_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'nom_de_naissance')), self.luv(nom_d_usage))
        nu_nn_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'nom_d_usage')), self.luv(nom_de_naissance))
        q_noms = []
        if nom_de_naissance:
            # Match the birth name against both stored name fields.
            q_noms.append(self.q_normalize('nom_de_naissance', nom_de_naissance))
            self.name_similarities.append(
                (first_name_weight * prenoms_expression
                 + last_name_weight * nn_nn_expression)
                / Value(first_name_weight + last_name_weight))
            self.name_similarities.append(
                (first_name_weight * prenoms_expression
                 + last_name_weight * nu_nn_expression)
                / Value(first_name_weight + last_name_weight))
            q_noms.append(self.q_normalize('nom_d_usage', nom_de_naissance))
        if nom_d_usage:
            # Match the usage name against both stored name fields.
            q_noms.append(self.q_normalize('nom_d_usage', nom_d_usage))
            q_noms.append(self.q_normalize('nom_de_naissance', nom_d_usage))
            self.name_similarities.append(
                (first_name_weight * prenoms_expression
                 + last_name_weight * nu_nu_expression)
                / Value(first_name_weight + last_name_weight))
            self.name_similarities.append(
                (first_name_weight * prenoms_expression
                 + last_name_weight * nn_nu_expression)
                / Value(first_name_weight + last_name_weight))
        # NOTE(review): if both names are empty, q_noms is empty and
        # reduce() raises TypeError — presumably individus always carry at
        # least one name; confirm upstream.
        q = self.q_normalize('prenoms', prenoms) & reduce(operator.__or__, q_noms)
        self.add_filter('name', q)

        # Compute similarity score
        return self
|
||||
|
||||
    def copy(self):
        """Return an independent deep copy of this search."""
        return copy.deepcopy(self)
|
||||
|
||||
|
|
Loading…
Reference in New Issue