# zoo/zoo/zoo_nanterre/duplicates.py
# zoo - versatile objects management
# Copyright (C) 2016 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import collections
from decimal import Decimal
import datetime
from django.db.transaction import atomic
from django.db.models.query import Q
from django.utils.timezone import now
from django.conf import settings
from zoo.utils import strip_accents
from zoo.zoo_data.models import Entity, Relation
from .models import Duplicate
from .utils import pair_sort, PersonSearch, UNION_REL, RESPONSABILITE_LEGALE_REL
@atomic
def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count=None, ids=None,
                    progression=False):
    '''Scan "individu" entities for probable duplicate pairs and persist them
    as Duplicate rows.

    This is a generator.  It yields progress tuples
    ``(new_duplicates_found, entities_processed, entities_total)``: one per
    scanned entity when *progression* is true, and always a final one after
    the database has been updated.

    :param limit: similarity threshold above which a pair is recorded
        (default: ``settings.ZOO_NANTERRE_DUPLICATES_THRESHOLD`` or 0.7)
    :param base_limit: lower pre-filter threshold passed to the search engine
        (default: half of *limit*)
    :param queryset: restrict the scan to this Entity queryset
    :param days: only scan entities created or modified in the last *days* days
    :param count: only scan the *count* most recently created entities
    :param ids: only scan entities whose primary key is in *ids*
    :param progression: if true, yield a progress tuple after each entity

    NOTE(review): ``@atomic`` on a generator function only wraps the creation
    of the generator object, not its iteration, so the body does not actually
    run inside a transaction — confirm whether callers rely on atomicity.
    '''
    # Define the search space.
    limit = limit or getattr(settings, 'ZOO_NANTERRE_DUPLICATES_THRESHOLD', 0.7)
    base_limit = base_limit or limit / 2.0
    # Damping factor applied to the similarity of persons already related as
    # family, so that relatives sharing a name are not flagged as duplicates.
    sibling_factor = getattr(settings, 'ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR', 0.6)
    qs = queryset or Entity.objects.all()
    qs = qs.filter(schema__slug='individu')
    if days:
        threshold = now() - datetime.timedelta(days=days)
        qs = qs.filter(
            Q(created__created__gte=threshold) | Q(modified__created__gte=threshold))
    if ids:
        qs = qs.filter(id__in=ids)
    # The slice must come after every filter: Django forbids filtering a
    # queryset once a slice has been taken.  (The previous ordering raised
    # when both count and ids were supplied.)
    if count:
        qs = qs.order_by('-id')[:count]
    known = {(d.first_id, d.second_id): d for d in Duplicate.objects.all()}
    new = set()
    new_duplicates = []
    # Preload family relationships so same_network() runs in memory,
    # without per-pair queries.
    conjoints = set()
    for rel in Relation.objects.filter(schema__slug=UNION_REL):
        conjoints.add(frozenset([rel.left_id, rel.right_id]))
    # parents maps a child entity id to the set of its legal guardians' ids.
    parents = collections.defaultdict(lambda: set())
    for rel in Relation.objects.filter(schema__slug=RESPONSABILITE_LEGALE_REL):
        parents[rel.right_id].add(rel.left_id)

    def same_network(first, second):
        '''Return True if the two persons are part of the same family:
        spouses, parent/child, or sharing at least one legal guardian.'''
        if frozenset([first.id, second.id]) in conjoints:
            return True
        if first.id in parents and second.id in parents[first.id]:
            return True
        if second.id in parents and first.id in parents[second.id]:
            return True
        if first.id in parents and second.id in parents and parents[first.id] & parents[second.id]:
            return True
        return False

    search = PersonSearch(limit=limit, base_limit=base_limit)
    total = qs.count()
    seen = set()
    for i, first in enumerate(qs):
        # Skip records whose first names contain "naitre" — presumably
        # placeholder "à naître" (unborn child) entries; confirm.
        if 'naitre' in strip_accents(first.content['prenoms'].lower()):
            continue
        # Search for duplicates based on the name, optionally narrowed to a
        # +/- 30 days window around the birthdate when one is recorded.
        s = search.copy()
        s = s.search_individu(first)
        if first.content.get('date_de_naissance'):
            s = s.search_birthdate(
                datetime.datetime.strptime(
                    first.content['date_de_naissance'], '%Y-%m-%d').date(),
                window_days=30)
        for second in s.queryset(prefetch=False):
            if 'naitre' in strip_accents(second.content['prenoms'].lower()):
                continue
            if first == second:
                continue
            p = pair_sort(first.id, second.id)
            similarity = Decimal(second.similarity)
            # Family members naturally share names: damp their similarity.
            if same_network(first, second):
                similarity *= Decimal(sibling_factor)
            if similarity < limit:
                continue
            seen.add(p)
            if p in known:
                duplicate = known[p]
                if duplicate.score == similarity:
                    continue
                if duplicate.score < similarity:
                    # if the new score is greater, ask for a new check
                    duplicate.false = False
                    duplicate.score = similarity
                    duplicate.save()
            elif p not in new:
                new.add(p)
                new_duplicates.append(
                    Duplicate(
                        first_id=p[0],
                        second_id=p[1],
                        score=similarity))
        if progression:
            yield len(new_duplicates), i + 1, total
    Duplicate.objects.bulk_create(new_duplicates)
    # Clear stale duplicates: known pairs that no longer matched this scan.
    # NOTE(review): when the scan is restricted (days/count/ids/queryset),
    # pairs outside the restriction are never "seen" and get deleted here —
    # confirm this is intended for partial scans.
    Duplicate.objects.filter(id__in=[known[p].id for p in set(known) - set(seen)]).delete()
    yield len(new_duplicates), total, total