zoo/tests/test_nanterre_doublons.py

186 lines
7.1 KiB
Python

import copy
from django.urls import reverse
from django.core.management import call_command
from django.utils.six.moves.urllib import parse as urlparse
from zoo.zoo_nanterre.models import Duplicate
from zoo.zoo_data.models import Log, Entity
def test_list_doublons(nanterre_classic_family, app):
    """Exercise the rsu-api-doublons endpoints end to end.

    Covers: plain listing, pagination ('more' URL + 'cookie' token),
    global ordering across pages, the ``limit`` and ``score_min`` query
    parameters, single-duplicate retrieval, false-positive marking (with
    journal logging and ``journal_form_id`` metadata) and deduplication.
    """
    e = nanterre_classic_family
    url = reverse('rsu-api-doublons')
    # Seed one perfect-score duplicate between the two fixture individus.
    d = Duplicate.objects.create(
        first=e['jean'],
        second=e['marie'],
        score=1.0)
    response = app.get(url)
    assert response.json['err'] == 0
    # A single result fits on one page: no pagination marker expected.
    assert 'more' not in response.json
    assert len(response.json['data']) == 1
    assert response.json['data'][0]['id'] == d.id
    # The API presents the pair ordered by increasing entity id,
    # regardless of which one was 'first' at creation time.
    first_id = min(e['jean'].id, e['marie'].id)
    second_id = max(e['jean'].id, e['marie'].id)
    assert response.json['data'][0]['individu_1']['id'] == first_id
    assert response.json['data'][0]['individu_2']['id'] == second_id
    # Scores are exposed as percentages (1.0 -> 100).
    assert response.json['data'][0]['score'] == 100
    # Create 90 more duplicates with slowly decreasing scores so the
    # listing (91 rows total) must paginate.
    new = []
    for i in range(90):
        new.append(copy.deepcopy(e['jean']))
        new[-1].id = None  # clear the pk to force an INSERT of a fresh entity
        new[-1].save()
        Duplicate.objects.create(
            first=new[-1],
            second=e['marie'],
            score=(100 - i / 2) / 100.0)
    response = app.get(url)
    assert response.json['err'] == 0
    # Paginated response: 'more' is the next-page URL, 'cookie' its token.
    assert 'more' in response.json
    assert 'cookie' in response.json
    # The cookie advertised in the payload must match the 'cookie' query
    # parameter embedded in the 'more' URL.
    assert response.json['cookie'] == urlparse.parse_qs(
        urlparse.urlparse(
            response.json['more']).query)['cookie'][0]
    assert len(response.json['data']) >= 10
    # Best score first: the original perfect duplicate still leads.
    assert response.json['data'][0]['id'] == d.id
    assert response.json['data'][0]['individu_1']['id'] == first_id
    assert response.json['data'][0]['individu_2']['id'] == second_id
    assert response.json['data'][0]['score'] == 100
    # Keep the three top rows for the retrieval / false-positive / dedup
    # scenarios below.
    first_data = response.json['data'][0]
    second_data = response.json['data'][1]
    third_data = response.json['data'][2]
    # verify pagination respects ordering by decreasing score and increasing id
    next_url = url
    all_datas = []
    while next_url:
        response = app.get(next_url)
        all_datas.extend(response.json['data'])
        next_url = response.json.get('more')
        if next_url:
            # A page advertising a continuation must not be empty.
            assert response.json['data']
    assert len(all_datas) == 91, 'some duplicates are missing'
    # (-score, id) tuples must already be in sorted order when pages are
    # concatenated, i.e. the ordering is global, not per page.
    l = [(-x['score'], x['id']) for x in all_datas]
    assert sorted(l) == l, 'data is not properly ordered'
    # A limit larger than the row count disables pagination entirely.
    url = reverse('rsu-api-doublons') + '?limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 91
    # score_min filters on the percentage value and is echoed back.
    url = reverse('rsu-api-doublons') + '?score_min=90&limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 23  # Duplicate.objects.filter(score__gte=Decimal('0.9')).count()
    for doublon in response.json['data']:
        assert int(doublon['score']) >= 90
    assert response.json['score_min'] == '90'
    assert 'score_max' not in response.json
    # Retrieve a single duplicate; the URL id is '<first_id> <second_id>'.
    doublon_url = reverse('rsu-api-doublon', kwargs={'doublon_id': '%s %s' % (d.first_id,
                                                                              d.second_id)})
    response = app.get(doublon_url)
    assert response.json['err'] == 0
    assert response.json['data']['id'] == first_data['id']
    # Flag the second-best duplicate as a false positive.
    false_positive_url = reverse('rsu-api-doublon-false-positive', kwargs={
        'doublon_id': '%s %s' % (
            second_data['individu_1']['id'],
            second_data['individu_2']['id'],
        )
    })
    response = app.post(false_positive_url)
    assert response.json['err'] == 0
    assert Duplicate.objects.get(id=second_data['id']).state == Duplicate.STATE_FALSE_POSITIVE
    # Both individus get a journal entry mentioning the non-duplicate.
    log = Log.objects.filter(entity_id=second_data['individu_1']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    log = Log.objects.filter(entity_id=second_data['individu_2']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    # False positives are hidden by default, listed via false_positive=1.
    url = reverse('rsu-api-doublons') + '?false_positive=1'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 1
    # Flagging the same pair twice is an error (HTTP 500, err == 1).
    response = app.post(false_positive_url, status=500)
    assert response.json['err'] == 1
    # Deduplicate the top pair, keeping individu number 1.
    dedup_url = reverse('rsu-api-doublon-dedup', kwargs={
        'doublon_id': '%s %s' % (
            first_data['individu_1']['id'],
            first_data['individu_2']['id'],
        )
    })
    response = app.post_json(dedup_url, params={'choice': 1})
    assert response.json['err'] == 0
    d.refresh_from_db()
    assert d.state == Duplicate.STATE_DEDUP
    assert d.content['dedup_choice'] == 1
    # Deduplicated pairs are hidden by default, listed via dedup=1.
    url = reverse('rsu-api-doublons') + '?dedup=1'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 1
    # Default listing now shows 91 - 1 false positive - 1 dedup = 89 rows.
    url = reverse('rsu-api-doublons') + '?limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 89
    # Flag a third pair as false positive, this time passing a
    # journal_form_id that must land in the log entries' metadata.
    false_positive_url = reverse('rsu-api-doublon-false-positive', kwargs={
        'doublon_id': '%s %s' % (
            third_data['individu_1']['id'],
            third_data['individu_2']['id'],
        )
    })
    response = app.post_json(false_positive_url, params={'journal_form_id': 103})
    assert response.json['err'] == 0
    assert Duplicate.objects.get(id=third_data['id']).state == Duplicate.STATE_FALSE_POSITIVE
    log = Log.objects.filter(entity_id=third_data['individu_1']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    assert log.content['meta']['form_id'] == 103
    log = Log.objects.filter(entity_id=third_data['individu_2']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    assert log.content['meta']['form_id'] == 103
def test_doublons_cmd(lot_of_names):
    """Smoke-test the rsu-duplicates management command on a big fixture."""
    call_command('rsu-duplicates', 'find')
    # Sanity bound: 'find' must flag far fewer duplicates than there are
    # individus in the fixture (strictly under one fifth).
    individu_count = Entity.objects.filter(schema__slug='individu').count()
    assert Duplicate.objects.count() < individu_count / 5
    # 'list' only has to run without raising.
    call_command('rsu-duplicates', 'list')
def test_ignore_siblings(nanterre_classic_family, settings):
    """Sibling pairs are kept or dropped according to the sibling factor."""
    # Baseline: the classic family fixture contains no duplicates.
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 0
    # Push kevin's birthdate into the "undefined" range (before 1903) and
    # rename him JEANNOT, close to his father's first name JEAN, so that
    # kevin and jean become potential duplicates.
    kevin = nanterre_classic_family['kevin']
    kevin.content['prenoms'] = 'JEANNOT'
    kevin.content['date_de_naissance'] = '1901-01-01'
    kevin.save()
    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 1.0
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 1
    # Lowering the sibling factor to 0.9 discards the pair again.
    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 0.9
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 0