nanterre: diminuer la similarité pour les membres d'une même famille (#37038)
This commit is contained in:
parent
fd0ca4b6aa
commit
f7183bd2ce
|
@ -163,3 +163,23 @@ def test_doublons_cmd(lot_of_names):
|
||||||
call_command('rsu-duplicates', 'find')
|
call_command('rsu-duplicates', 'find')
|
||||||
assert Duplicate.objects.count() < (Entity.objects.filter(schema__slug='individu').count() / 5)
|
assert Duplicate.objects.count() < (Entity.objects.filter(schema__slug='individu').count() / 5)
|
||||||
call_command('rsu-duplicates', 'list')
|
call_command('rsu-duplicates', 'list')
|
||||||
|
|
||||||
|
|
||||||
|
def test_ignore_siblings(nanterre_classic_family, settings):
|
||||||
|
call_command('rsu-duplicates', 'find')
|
||||||
|
assert Duplicate.objects.count() == 0
|
||||||
|
|
||||||
|
# by moving kevin's birthdate into the range treated as undefined (before 1903)
|
||||||
|
# and changing the first name of kevin to JEANNOT, which resembles his father's
|
||||||
|
# first name JEAN, we find kevin and jean are potential duplicates
|
||||||
|
nanterre_classic_family['kevin'].content['prenoms'] = 'JEANNOT'
|
||||||
|
nanterre_classic_family['kevin'].content['date_de_naissance'] = '1901-01-01'
|
||||||
|
nanterre_classic_family['kevin'].save()
|
||||||
|
settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 1.0
|
||||||
|
call_command('rsu-duplicates', 'find')
|
||||||
|
assert Duplicate.objects.count() == 1
|
||||||
|
|
||||||
|
# if we lower the sibling factor to 0.9, the duplicate is now ignored
|
||||||
|
settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 0.9
|
||||||
|
call_command('rsu-duplicates', 'find')
|
||||||
|
assert Duplicate.objects.count() == 0
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
# You should have received a copy of the GNU Affero General Public License
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import collections
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
|
@ -23,9 +24,9 @@ from django.utils.timezone import now
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from zoo.utils import strip_accents
|
from zoo.utils import strip_accents
|
||||||
from zoo.zoo_data.models import Entity
|
from zoo.zoo_data.models import Entity, Relation
|
||||||
from .models import Duplicate
|
from .models import Duplicate
|
||||||
from .utils import pair_sort, PersonSearch
|
from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL
|
||||||
|
|
||||||
|
|
||||||
@atomic
|
@atomic
|
||||||
|
@ -34,6 +35,8 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
|
||||||
# Define search space
|
# Define search space
|
||||||
limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
|
limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
|
||||||
base_limit = base_limit or limit / 2.0
|
base_limit = base_limit or limit / 2.0
|
||||||
|
sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6)
|
||||||
|
|
||||||
qs = queryset or Entity.objects.all()
|
qs = queryset or Entity.objects.all()
|
||||||
qs = qs.filter(schema__slug='individu')
|
qs = qs.filter(schema__slug='individu')
|
||||||
if days:
|
if days:
|
||||||
|
@ -49,8 +52,28 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
|
||||||
new = set()
|
new = set()
|
||||||
new_duplicates = []
|
new_duplicates = []
|
||||||
|
|
||||||
|
conjoints = set()
|
||||||
|
for rel in Relation.objects.filter(schema__slug=UNION_REL):
|
||||||
|
conjoints.add(frozenset([rel.left_id, rel.right_id]))
|
||||||
|
parents = collections.defaultdict(lambda: set())
|
||||||
|
for rel in Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL):
|
||||||
|
parents[rel.right_id].add(rel.left_id)
|
||||||
|
|
||||||
|
def same_network(first, second):
|
||||||
|
'''Returns true if persons are parts of the same family'''
|
||||||
|
if frozenset([first.id, second.id]) in conjoints:
|
||||||
|
return True
|
||||||
|
if first.id in parents and second.id in parents[first.id]:
|
||||||
|
return True
|
||||||
|
if second.id in parents and first.id in parents[second.id]:
|
||||||
|
return True
|
||||||
|
if first.id in parents and second.id in parents and parents[first.id] & parents[second.id]:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
search = PersonSearch(limit=limit, base_limit=base_limit)
|
search = PersonSearch(limit=limit, base_limit=base_limit)
|
||||||
count = qs.count()
|
count = qs.count()
|
||||||
|
seen = set()
|
||||||
|
|
||||||
for i, first in enumerate(qs):
|
for i, first in enumerate(qs):
|
||||||
if 'naitre' in strip_accents(first.content['prenoms'].lower()):
|
if 'naitre' in strip_accents(first.content['prenoms'].lower()):
|
||||||
|
@ -68,8 +91,15 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
|
||||||
continue
|
continue
|
||||||
if first == second:
|
if first == second:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
p = pair_sort(first.id, second.id)
|
p = pair_sort(first.id, second.id)
|
||||||
similarity = Decimal(second.similarity)
|
similarity = Decimal(second.similarity)
|
||||||
|
if same_network(first, second):
|
||||||
|
similarity *= Decimal(sibling_factor)
|
||||||
|
if similarity < limit:
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen.add(p)
|
||||||
if p in known:
|
if p in known:
|
||||||
duplicate = known[p]
|
duplicate = known[p]
|
||||||
if duplicate.score == similarity:
|
if duplicate.score == similarity:
|
||||||
|
@ -90,4 +120,6 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
|
||||||
yield len(new_duplicates), i + 1, count
|
yield len(new_duplicates), i + 1, count
|
||||||
|
|
||||||
Duplicate.objects.bulk_create(new_duplicates)
|
Duplicate.objects.bulk_create(new_duplicates)
|
||||||
|
# clear old duplicates
|
||||||
|
Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete()
|
||||||
yield len(new_duplicates), count, count
|
yield len(new_duplicates), count, count
|
||||||
|
|
Loading…
Reference in New Issue