# zoo - versatile objects management
|
|
# Copyright (C) 2016 Entr'ouvert
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify it
|
|
# under the terms of the GNU Affero General Public License as published
|
|
# by the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU Affero General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
import collections
|
|
from decimal import Decimal
|
|
import datetime
|
|
|
|
from django.db.transaction import atomic
|
|
from django.db.models.query import Q
|
|
from django.utils.timezone import now
|
|
from django.conf import settings
|
|
|
|
from zoo.utils import strip_accents
|
|
from zoo.zoo_data.models import Entity, Relation
|
|
from .models import Duplicate
|
|
from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL
|
|
|
|
|
|
def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count=None, ids=None,
                    progression=False):
    '''Find probable duplicates among "individu" entities and persist them.

    This is a generator: when ``progression`` is true it yields a progress
    tuple ``(nb_new_duplicates, nb_processed, nb_total)`` after each scanned
    person, and it always yields a final ``(nb_new_duplicates, nb_total,
    nb_total)``.

    :param limit: similarity threshold above which a pair is recorded;
        defaults to settings.ZOO_NANTERRE_DUPLICATES_THRESHOLD or 0.7
    :param base_limit: looser threshold passed to PersonSearch;
        defaults to half of ``limit``
    :param queryset: base Entity queryset to scan (defaults to all entities)
    :param days: restrict the scan to entities created or modified during
        the last ``days`` days
    :param count: restrict the scan to the ``count`` most recent entities
    :param ids: restrict the scan to entities with those primary keys
    :param progression: whether to yield intermediate progress tuples
    '''
    # NOTE: decorating a generator function with @atomic is ineffective: a
    # ContextDecorator wraps the *creation* of the generator object, so the
    # transaction would open and commit before any body code runs.  Using
    # the context-manager form inside the body keeps the whole scan — all
    # reads, saves, bulk_create and delete — in a single transaction.
    with atomic():
        # Define search space
        limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
        base_limit = base_limit or limit / 2.0
        # similarity of pairs belonging to the same family is damped by this factor
        sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6)

        qs = queryset or Entity.objects.all()
        qs = qs.filter(schema__slug='individu')
        if days:
            threshold = now() - datetime.timedelta(days=days)
            qs = qs.filter(
                Q(created__created__gte=threshold) | Q(modified__created__gte=threshold))
        if count:
            qs = qs.order_by('-id')[:count]
        if ids:
            qs = qs.filter(id__in=ids)

        # already recorded duplicates, keyed by their (smaller id, larger id) pair
        known = {(d.first_id, d.second_id): d for d in Duplicate.objects.all()}
        new = set()
        new_duplicates = []

        # family graph: unions (couples) and legal-responsibility links
        conjoints = set()
        for rel in Relation.objects.filter(schema__slug=UNION_REL):
            conjoints.add(frozenset([rel.left_id, rel.right_id]))
        parents = collections.defaultdict(set)
        for rel in Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL):
            parents[rel.right_id].add(rel.left_id)

        def same_network(first, second):
            '''Returns true if persons are parts of the same family'''
            if frozenset([first.id, second.id]) in conjoints:
                return True
            # one is a legal responsible of the other
            if first.id in parents and second.id in parents[first.id]:
                return True
            if second.id in parents and first.id in parents[second.id]:
                return True
            # they share a legal responsible
            if first.id in parents and second.id in parents and parents[first.id] & parents[second.id]:
                return True
            return False

        search = PersonSearch(limit=limit, base_limit=base_limit)
        # distinct name: do not clobber the ``count`` parameter
        total = qs.count()
        seen = set()

        for i, first in enumerate(qs):
            # skip persons whose first names contain 'naitre' — presumably
            # "à naître" (unborn child) placeholders; confirm with callers
            if 'naitre' in strip_accents(first.content['prenoms'].lower()):
                continue
            # search for duplicate based on the name
            s = search.copy()
            s = s.search_individu(first)
            # narrow by birthdate (±30 days window) when it is known
            if first.content.get('date_de_naissance'):
                s = s.search_birthdate(
                    datetime.datetime.strptime(
                        first.content['date_de_naissance'], '%Y-%m-%d').date(),
                    window_days=30)
            for second in s.queryset(prefetch=False):
                if 'naitre' in strip_accents(second.content['prenoms'].lower()):
                    continue
                if first == second:
                    continue

                p = pair_sort(first.id, second.id)
                similarity = Decimal(second.similarity)
                # family members legitimately share names: damp their score
                if same_network(first, second):
                    similarity *= Decimal(sibling_factor)
                if similarity < limit:
                    continue

                seen.add(p)
                if p in known:
                    duplicate = known[p]
                    if duplicate.score == similarity:
                        continue
                    if duplicate.score < similarity:
                        # if new score if greater, ask for new check
                        duplicate.false = False
                        duplicate.score = similarity
                        duplicate.save()
                elif p not in new:
                    new.add(p)
                    new_duplicates.append(
                        Duplicate(
                            first_id=p[0],
                            second_id=p[1],
                            score=similarity))
            if progression:
                yield len(new_duplicates), i + 1, total

        Duplicate.objects.bulk_create(new_duplicates)
        # clear old duplicates: known pairs not seen again during this run.
        # NOTE(review): when the scan is restricted (days/count/ids), pairs
        # outside the scanned subset are deleted too — confirm this
        # full-refresh semantic is intended.
        Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete()
        yield len(new_duplicates), total, total
|