nanterre: diminuer la similarité pour les membres d'une même famille (#37038)

2020-01-12 21:57:22 +01:00 · 2020-01-12 21:57:22 +01:00 · f7183bd2ce
parent fd0ca4b6aa
commit f7183bd2ce
2 changed files with 54 additions and 2 deletions
--- a/tests/test_nanterre_doublons.py
+++ b/tests/test_nanterre_doublons.py
@ -163,3 +163,23 @@ def test_doublons_cmd(lot_of_names):
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() < (Entity.objects.filter(schema__slug='individu').count() / 5)
    call_command('rsu-duplicates', 'list')
+
+
+def test_ignore_siblings(nanterre_classic_family, settings):
+    call_command('rsu-duplicates', 'find')
+    assert Duplicate.objects.count() == 0
+
+    # by moving the birthdate of kevin to undefined (less than 1903)
+    # and changing the first name of kevin to JEANNOT, which looks like his
+    # father's first name JEAN, we find kevin and jean are potential duplicates
+    nanterre_classic_family['kevin'].content['prenoms'] = 'JEANNOT'
+    nanterre_classic_family['kevin'].content['date_de_naissance'] = '1901-01-01'
+    nanterre_classic_family['kevin'].save()
+    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 1.0
+    call_command('rsu-duplicates', 'find')
+    assert Duplicate.objects.count() == 1
+
+    # if we lower the sibling factor to 0.9, the duplicate is now ignored
+    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 0.9
+    call_command('rsu-duplicates', 'find')
+    assert Duplicate.objects.count() == 0
--- a/zoo/zoo_nanterre/duplicates.py
+++ b/zoo/zoo_nanterre/duplicates.py
@ -14,6 +14,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

+import collections
 from decimal import Decimal
 import datetime

@ -23,9 +24,9 @@ from django.utils.timezone import now
 from django.conf import settings

 from zoo.utils import strip_accents
-from zoo.zoo_data.models import Entity
+from zoo.zoo_data.models import Entity, Relation
 from .models import Duplicate
-from .utils import pair_sort, PersonSearch
+from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL


@atomic
@ -34,6 +35,8 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
    # Define search space
    limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
    base_limit = base_limit or limit / 2.0
+    sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6)
+
    qs = queryset or Entity.objects.all()
    qs = qs.filter(schema__slug='individu')
    if days:
@ -49,8 +52,28 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
    new = set()
    new_duplicates = []

+    conjoints = set()
+    for rel in Relation.objects.filter(schema__slug=UNION_REL):
+        conjoints.add(frozenset([rel.left_id, rel.right_id]))
+    parents = collections.defaultdict(lambda: set())
+    for rel in Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL):
+        parents[rel.right_id].add(rel.left_id)
+
+    def same_network(first, second):
+        '''Return True if both persons are part of the same family'''
+        if frozenset([first.id, second.id]) in conjoints:
+            return True
+        if first.id in parents and second.id in parents[first.id]:
+            return True
+        if second.id in parents and first.id in parents[second.id]:
+            return True
+        if first.id in parents and second.id in parents and parents[first.id] & parents[second.id]:
+            return True
+        return False
+
    search = PersonSearch(limit=limit, base_limit=base_limit)
    count = qs.count()
+    seen = set()

    for i, first in enumerate(qs):
        if 'naitre' in strip_accents(first.content['prenoms'].lower()):
@ -68,8 +91,15 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
                continue
            if first == second:
                continue
+
            p = pair_sort(first.id, second.id)
            similarity = Decimal(second.similarity)
+            if same_network(first, second):
+                similarity *= Decimal(sibling_factor)
+            if similarity < limit:
+                continue
+
+            seen.add(p)
            if p in known:
                duplicate = known[p]
                if duplicate.score == similarity:
@ -90,4 +120,6 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
            yield len(new_duplicates), i + 1, count

    Duplicate.objects.bulk_create(new_duplicates)
+    # clear old duplicates
+    Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete()
    yield len(new_duplicates), count, count