186 lines
7.1 KiB
Python
186 lines
7.1 KiB
Python
import copy
|
|
import urllib
|
|
|
|
from django.urls import reverse
|
|
from django.core.management import call_command
|
|
|
|
from zoo.zoo_nanterre.models import Duplicate
|
|
from zoo.zoo_data.models import Log, Entity
|
|
|
|
|
|
def test_list_doublons(nanterre_classic_family, app):
    """Exercise the duplicates listing API end to end.

    Covers: basic listing, pagination ('more' URL + 'cookie'), global
    ordering by decreasing score then increasing id, the score_min /
    false_positive / dedup filters, the single-duplicate detail view,
    and the false-positive and dedup state-change endpoints.
    """
    e = nanterre_classic_family

    url = reverse('rsu-api-doublons')

    d = Duplicate.objects.create(
        first=e['jean'],
        second=e['marie'],
        score=1.0)

    # single duplicate: no pagination expected
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert len(response.json['data']) == 1
    assert response.json['data'][0]['id'] == d.id
    # the API normalizes pair order: individu_1 always carries the lower id
    first_id = min(e['jean'].id, e['marie'].id)
    second_id = max(e['jean'].id, e['marie'].id)
    assert response.json['data'][0]['individu_1']['id'] == first_id
    assert response.json['data'][0]['individu_2']['id'] == second_id
    # scores are stored in [0, 1] but rendered as percentages
    assert response.json['data'][0]['score'] == 100

    # create 90 clones of jean, each a duplicate of marie with a slowly
    # decreasing score, to exercise pagination and ordering
    for i in range(90):
        clone = copy.deepcopy(e['jean'])
        clone.id = None  # force an INSERT on save()
        clone.save()
        Duplicate.objects.create(
            first=clone,
            second=e['marie'],
            score=(100 - i / 2) / 100.0)

    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' in response.json
    assert 'cookie' in response.json
    # the 'more' URL must carry the same pagination cookie as the payload
    assert response.json['cookie'] == urllib.parse.parse_qs(
        urllib.parse.urlparse(
            response.json['more']).query)['cookie'][0]
    assert len(response.json['data']) >= 10
    # highest score (100) still sorts first
    assert response.json['data'][0]['id'] == d.id
    assert response.json['data'][0]['individu_1']['id'] == first_id
    assert response.json['data'][0]['individu_2']['id'] == second_id
    assert response.json['data'][0]['score'] == 100
    first_data = response.json['data'][0]
    second_data = response.json['data'][1]
    third_data = response.json['data'][2]

    # verify pagination respect ordering by decreasing score and increasing id
    next_url = url
    all_datas = []
    while next_url:
        response = app.get(next_url)
        all_datas.extend(response.json['data'])
        next_url = response.json.get('more')
        if next_url:
            # a page that advertises a next page must not be empty
            assert response.json['data']
    assert len(all_datas) == 91, 'some duplicates are missing'
    # was named 'l': renamed, ambiguous single-letter name (PEP 8 / E741)
    ordering_keys = [(-x['score'], x['id']) for x in all_datas]
    assert sorted(ordering_keys) == ordering_keys, 'data is not properly ordered'

    # a large enough limit disables pagination entirely
    url = reverse('rsu-api-doublons') + '?limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 91

    # score_min filters on the percentage scale
    url = reverse('rsu-api-doublons') + '?score_min=90&limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 23  # Duplicate.objects.filter(score__gte=Decimal('0.9')).count()
    for doublon in response.json['data']:
        assert int(doublon['score']) >= 90
    assert response.json['score_min'] == '90'
    assert 'score_max' not in response.json

    # detail view addressed by the space-separated entity-id pair
    doublon_url = reverse('rsu-api-doublon', kwargs={'doublon_id': '%s %s' % (d.first_id,
                                                                              d.second_id)})
    response = app.get(doublon_url)
    assert response.json['err'] == 0
    assert response.json['data']['id'] == first_data['id']

    # marking a pair as false positive changes its state and logs on both entities
    false_positive_url = reverse('rsu-api-doublon-false-positive', kwargs={
        'doublon_id': '%s %s' % (
            second_data['individu_1']['id'],
            second_data['individu_2']['id'],
        )
    })
    response = app.post(false_positive_url)
    assert response.json['err'] == 0
    assert Duplicate.objects.get(id=second_data['id']).state == Duplicate.STATE_FALSE_POSITIVE
    log = Log.objects.filter(entity_id=second_data['individu_1']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    log = Log.objects.filter(entity_id=second_data['individu_2']['id']).latest('id')
    assert 'non doublon de' in log.content['text']

    # false positives are hidden by default, listed with ?false_positive=1
    url = reverse('rsu-api-doublons') + '?false_positive=1'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 1

    # marking the same pair twice is an error
    response = app.post(false_positive_url, status=500)
    assert response.json['err'] == 1

    # deduplicating a pair records the chosen entity and the new state
    dedup_url = reverse('rsu-api-doublon-dedup', kwargs={
        'doublon_id': '%s %s' % (
            first_data['individu_1']['id'],
            first_data['individu_2']['id'],
        )
    })
    response = app.post_json(dedup_url, params={'choice': 1})
    assert response.json['err'] == 0
    d.refresh_from_db()
    assert d.state == Duplicate.STATE_DEDUP
    assert d.content['dedup_choice'] == 1

    # deduplicated pairs are listed with ?dedup=1
    url = reverse('rsu-api-doublons') + '?dedup=1'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 1

    # default listing now excludes the false positive and the dedup: 91 - 2
    url = reverse('rsu-api-doublons') + '?limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 89

    # false-positive with a journal_form_id propagates it into the log metadata
    false_positive_url = reverse('rsu-api-doublon-false-positive', kwargs={
        'doublon_id': '%s %s' % (
            third_data['individu_1']['id'],
            third_data['individu_2']['id'],
        )
    })
    response = app.post_json(false_positive_url, params={'journal_form_id': 103})
    assert response.json['err'] == 0
    assert Duplicate.objects.get(id=third_data['id']).state == Duplicate.STATE_FALSE_POSITIVE
    log = Log.objects.filter(entity_id=third_data['individu_1']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    assert log.content['meta']['form_id'] == 103
    log = Log.objects.filter(entity_id=third_data['individu_2']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    assert log.content['meta']['form_id'] == 103
|
|
|
|
|
|
def test_doublons_cmd(lot_of_names):
    """Smoke-test the rsu-duplicates management command on a large fixture."""
    call_command('rsu-duplicates', 'find')
    # detection must stay selective: fewer than one duplicate per five individus
    individu_count = Entity.objects.filter(schema__slug='individu').count()
    assert Duplicate.objects.count() < individu_count / 5
    # the 'list' sub-command must run without raising
    call_command('rsu-duplicates', 'list')
|
|
|
|
|
|
def test_ignore_siblings(nanterre_classic_family, settings):
    """Duplicate pairs between close relatives are discarded below the sibling factor."""
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 0

    # by moving the birthdate of kevin to undefined (less than 1903)
    # and changing the first name of kevin to JEANNOT looking like its father
    # first name JEAN, we find kevin and jean are potential duplicates
    kevin = nanterre_classic_family['kevin']
    kevin.content['prenoms'] = 'JEANNOT'
    kevin.content['date_de_naissance'] = '1901-01-01'
    kevin.save()
    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 1.0
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 1

    # if we lower the sibling factor to 0.9, the duplicate is now ignored
    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 0.9
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 0
|