186 lines
7.1 KiB
Python
186 lines
7.1 KiB
Python
import copy
|
|
import urllib
|
|
|
|
from django.urls import reverse
|
|
from django.core.management import call_command
|
|
|
|
from zoo.zoo_nanterre.models import Duplicate
|
|
from zoo.zoo_data.models import Log, Entity
|
|
|
|
|
|
def test_list_doublons(nanterre_classic_family, app):
    """Exercise the duplicates listing API end to end.

    Covers: basic listing, pagination ('more' URL + 'cookie'), global
    ordering by decreasing score then increasing id, the score_min /
    false_positive / dedup filters, the single-duplicate detail view,
    and the false-positive and dedup state-change endpoints.
    """
    e = nanterre_classic_family

    url = reverse('rsu-api-doublons')

    d = Duplicate.objects.create(
        first=e['jean'],
        second=e['marie'],
        score=1.0)

    # single duplicate: no pagination expected
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert len(response.json['data']) == 1
    assert response.json['data'][0]['id'] == d.id
    # the API normalizes pair order: individu_1 always carries the lower id
    first_id = min(e['jean'].id, e['marie'].id)
    second_id = max(e['jean'].id, e['marie'].id)
    assert response.json['data'][0]['individu_1']['id'] == first_id
    assert response.json['data'][0]['individu_2']['id'] == second_id
    # scores are stored in [0, 1] but rendered as percentages
    assert response.json['data'][0]['score'] == 100

    # create 90 clones of jean, each a duplicate of marie with a slowly
    # decreasing score, to exercise pagination and ordering
    for i in range(90):
        clone = copy.deepcopy(e['jean'])
        clone.id = None  # force an INSERT on save()
        clone.save()
        Duplicate.objects.create(
            first=clone,
            second=e['marie'],
            score=(100 - i / 2) / 100.0)

    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' in response.json
    assert 'cookie' in response.json
    # the 'more' URL must carry the same pagination cookie as the payload
    assert response.json['cookie'] == urllib.parse.parse_qs(
        urllib.parse.urlparse(
            response.json['more']).query)['cookie'][0]
    assert len(response.json['data']) >= 10
    # highest score (100) still sorts first
    assert response.json['data'][0]['id'] == d.id
    assert response.json['data'][0]['individu_1']['id'] == first_id
    assert response.json['data'][0]['individu_2']['id'] == second_id
    assert response.json['data'][0]['score'] == 100
    first_data = response.json['data'][0]
    second_data = response.json['data'][1]
    third_data = response.json['data'][2]

    # verify pagination respect ordering by decreasing score and increasing id
    next_url = url
    all_datas = []
    while next_url:
        response = app.get(next_url)
        all_datas.extend(response.json['data'])
        next_url = response.json.get('more')
        if next_url:
            # a page that advertises a next page must not be empty
            assert response.json['data']
    assert len(all_datas) == 91, 'some duplicates are missing'
    # was named 'l': renamed, ambiguous single-letter name (PEP 8 / E741)
    ordering_keys = [(-x['score'], x['id']) for x in all_datas]
    assert sorted(ordering_keys) == ordering_keys, 'data is not properly ordered'

    # a large enough limit disables pagination entirely
    url = reverse('rsu-api-doublons') + '?limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 91

    # score_min filters on the percentage scale
    url = reverse('rsu-api-doublons') + '?score_min=90&limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 23  # Duplicate.objects.filter(score__gte=Decimal('0.9')).count()
    for doublon in response.json['data']:
        assert int(doublon['score']) >= 90
    assert response.json['score_min'] == '90'
    assert 'score_max' not in response.json

    # detail view addressed by the space-separated entity-id pair
    doublon_url = reverse('rsu-api-doublon', kwargs={'doublon_id': '%s %s' % (d.first_id,
                                                                              d.second_id)})
    response = app.get(doublon_url)
    assert response.json['err'] == 0
    assert response.json['data']['id'] == first_data['id']

    # marking a pair as false positive changes its state and logs on both entities
    false_positive_url = reverse('rsu-api-doublon-false-positive', kwargs={
        'doublon_id': '%s %s' % (
            second_data['individu_1']['id'],
            second_data['individu_2']['id'],
        )
    })
    response = app.post(false_positive_url)
    assert response.json['err'] == 0
    assert Duplicate.objects.get(id=second_data['id']).state == Duplicate.STATE_FALSE_POSITIVE
    log = Log.objects.filter(entity_id=second_data['individu_1']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    log = Log.objects.filter(entity_id=second_data['individu_2']['id']).latest('id')
    assert 'non doublon de' in log.content['text']

    # false positives are hidden by default, listed with ?false_positive=1
    url = reverse('rsu-api-doublons') + '?false_positive=1'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 1

    # marking the same pair twice is an error
    response = app.post(false_positive_url, status=500)
    assert response.json['err'] == 1

    # deduplicating a pair records the chosen entity and the new state
    dedup_url = reverse('rsu-api-doublon-dedup', kwargs={
        'doublon_id': '%s %s' % (
            first_data['individu_1']['id'],
            first_data['individu_2']['id'],
        )
    })
    response = app.post_json(dedup_url, params={'choice': 1})
    assert response.json['err'] == 0
    d.refresh_from_db()
    assert d.state == Duplicate.STATE_DEDUP
    assert d.content['dedup_choice'] == 1

    # deduplicated pairs are listed with ?dedup=1
    url = reverse('rsu-api-doublons') + '?dedup=1'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 1

    # default listing now excludes the false positive and the dedup: 91 - 2
    url = reverse('rsu-api-doublons') + '?limit=100'
    response = app.get(url)
    assert response.json['err'] == 0
    assert 'more' not in response.json
    assert 'cookie' not in response.json
    assert len(response.json['data']) == 89

    # false-positive with a journal_form_id propagates it into the log metadata
    false_positive_url = reverse('rsu-api-doublon-false-positive', kwargs={
        'doublon_id': '%s %s' % (
            third_data['individu_1']['id'],
            third_data['individu_2']['id'],
        )
    })
    response = app.post_json(false_positive_url, params={'journal_form_id': 103})
    assert response.json['err'] == 0
    assert Duplicate.objects.get(id=third_data['id']).state == Duplicate.STATE_FALSE_POSITIVE
    log = Log.objects.filter(entity_id=third_data['individu_1']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    assert log.content['meta']['form_id'] == 103
    log = Log.objects.filter(entity_id=third_data['individu_2']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    assert log.content['meta']['form_id'] == 103
|
|
|
|
|
|
def test_doublons_cmd(lot_of_names):
    """Smoke-test the rsu-duplicates management command on a large fixture."""
    call_command('rsu-duplicates', 'find')
    # detection must stay selective: fewer than one duplicate per five individus
    individu_count = Entity.objects.filter(schema__slug='individu').count()
    assert Duplicate.objects.count() < individu_count / 5
    # the 'list' sub-command must run without raising
    call_command('rsu-duplicates', 'list')
|
|
|
|
|
|
def test_ignore_siblings(nanterre_classic_family, settings):
    """Duplicate pairs between close relatives are discarded below the sibling factor."""
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 0

    # by moving the birthdate of kevin to undefined (less than 1903)
    # and changing the first name of kevin to JEANNOT looking like its father
    # first name JEAN, we find kevin and jean are potential duplicates
    kevin = nanterre_classic_family['kevin']
    kevin.content['prenoms'] = 'JEANNOT'
    kevin.content['date_de_naissance'] = '1901-01-01'
    kevin.save()
    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 1.0
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 1

    # if we lower the sibling factor to 0.9, the duplicate is now ignored
    settings.ZOO_NANTERRE_DUPLICATES_SIBLING_FACTOR = 0.9
    call_command('rsu-duplicates', 'find')
    assert Duplicate.objects.count() == 0
|