Merge branch 'master' into nanterre-recette

Thomas NOËL 2018-03-07 15:38:01 +01:00
commit a49d95d2f5
7 changed files with 94 additions and 43 deletions

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import datetime
import csv
import pytest
import django_webtest
@@ -281,3 +282,31 @@ def app(request):
    wtm._patch_settings()
    request.addfinalizer(wtm._unpatch_settings)
    return django_webtest.DjangoTestApp(extra_environ={'HTTP_HOST': 'localhost'})
@pytest.fixture
def lot_of_names(rsu_schema):
    try:
        reader = csv.reader(open('tests/export-etat-civil.csv'))
    except IOError:
        pytest.skip()

    def generate():
        tr = Transaction.objects.create()
        for genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage, date_de_naissance in reader:
            yield Entity(
                schema=rsu_schema['individu'],
                created=tr,
                content={
                    'genre': genre,
                    'prenoms': prenoms,
                    'nom_de_naissance': nom_de_naissance,
                    'nom_d_usage': nom_d_usage,
                    'date_de_naissance': date_de_naissance,
                    'telephones': [],
                    'statut_legal': statut_legal,
                    'email': '',
                    'cles_de_federation': {},
                })

    Entity.objects.bulk_create(generate())

View File

@@ -2,9 +2,10 @@ import urlparse
import copy
from django.core.urlresolvers import reverse
from django.core.management import call_command
from zoo.zoo_nanterre.models import Duplicate
-from zoo.zoo_data.models import Log
+from zoo.zoo_data.models import Log, Entity
def test_list_doublons(nanterre_classic_family, app):
@@ -156,3 +157,11 @@ def test_list_doublons(nanterre_classic_family, app):
    log = Log.objects.filter(entity_id=third_data['individu_2']['id']).latest('id')
    assert 'non doublon de' in log.content['text']
    assert log.content['meta']['form_id'] == 103
def test_doublons_cmd(lot_of_names):
    call_command('rsu-duplicates', 'find')
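    # sanity check: the finder should flag fewer than one duplicate per five individuals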
    assert Duplicate.objects.count() < (Entity.objects.filter(schema__slug='individu').count() / 5)
    call_command('rsu-duplicates', 'list')

View File

@@ -5,8 +5,6 @@ import json
import pytest
from zoo.zoo_data.models import Entity, Transaction
# These tests cannot be run publicly because they require personal
# data to be effective:
# To generate this data, run the following in psql:
@@ -32,34 +30,6 @@ from zoo.zoo_data.models import Entity, Transaction
# If these files are missing, the tests are skipped.
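The psql export itself is elided from this hunk, but the fixture that reads the file (moved to conftest, above) unpacks exactly six columns: genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage, date_de_naissance. A minimal stand-in generator for tests/export-etat-civil.csv when no real export is available — the value vocabularies and the date format below are invented, only the column order comes from the fixture:

# sketch: write a synthetic tests/export-etat-civil.csv compatible with the
# lot_of_names fixture; all sample values are made up, only the column order
# (genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage,
# date_de_naissance) is taken from the fixture
import csv
import random

GENRES = ['masculin', 'feminin']          # assumed labels
PRENOMS = ['Jean', 'Marie', 'Pierre', 'Sophie', 'Luc', 'Anne']
NOMS = ['MARTIN', 'BERNARD', 'DUBOIS', 'THOMAS', 'ROBERT', 'PETIT']

with open('tests/export-etat-civil.csv', 'wb') as f:
    writer = csv.writer(f)
    for _ in range(1000):
        nom = random.choice(NOMS)
        writer.writerow([
            random.choice(GENRES),
            'majeur',                      # statut_legal: assumed value
            random.choice(PRENOMS),
            nom,                           # nom_de_naissance
            random.choice([nom, random.choice(NOMS)]),    # nom_d_usage
            '%04d-%02d-%02d' % (random.randint(1940, 2000),
                                random.randint(1, 12),
                                random.randint(1, 28)),   # assumed ISO date
        ])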
@pytest.fixture
def lot_of_names(rsu_schema):
    try:
        reader = csv.reader(open('tests/export-etat-civil.csv'))
    except IOError:
        pytest.skip()

    def generate():
        tr = Transaction.objects.create()
        for genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage, date_de_naissance in reader:
            yield Entity(
                schema=rsu_schema['individu'],
                created=tr,
                content={
                    'genre': genre,
                    'prenoms': prenoms,
                    'nom_de_naissance': nom_de_naissance,
                    'nom_d_usage': nom_d_usage,
                    'date_de_naissance': date_de_naissance,
                    'telephones': [],
                    'statut_legal': statut_legal,
                    'email': '',
                    'cles_de_federation': {},
                })

    Entity.objects.bulk_create(generate())
@pytest.fixture
def basic_searches():
    try:

View File

@@ -23,17 +23,10 @@ class ZooMetaAppConfig(AppConfig):
    name = 'zoo.zoo_meta'
    verbose_name = _('metadata')

    def post_migrate(self, **kwargs):
        from .models import EntitySchema
        for schema in EntitySchema.objects.all():
            schema.rebuild_indexes()

    def post_save(self, sender, instance, **kwargs):
        instance.rebuild_indexes()
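    # ready() wires both handlers: post_migrate rebuilds every schema's
    # indexes, post_save rebuilds only the schema that was just saved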
    def ready(self):
        from .models import EntitySchema
        post_migrate.connect(self.post_migrate)
        post_save.connect(self.post_save, sender=EntitySchema)

View File

@@ -115,3 +115,9 @@
"immutable_normalize(content->>'prenoms' || ' ' || (content->>'nom_de_naissance'))")
schema.create_trigram_index(
"immutable_normalize(content->>'prenoms' || ' ' || (content->>'nom_d_usage'))")
        schema.create_trigram_index(
            "immutable_normalize(content->>'prenoms')")
        schema.create_trigram_index(
            "immutable_normalize(content->>'nom_d_usage')")
        schema.create_trigram_index(
            "immutable_normalize(content->>'nom_de_naissance')")

View File

@@ -57,10 +57,7 @@ def find_duplicates(limit=None, base_limit=None, queryset=None, days=None, count
            continue
        # search for duplicate based on the name
        s = search.copy()
-        if first.content['nom_de_naissance']:
-            s = search.search_name(u'%s %s' % (first.content['prenoms'], first.content['nom_de_naissance']))
-        if first.content['nom_d_usage']:
-            s = s.search_name(u'%s %s' % (first.content['prenoms'], first.content['nom_d_usage']))
+        s = s.search_individu(first)
        if first.content.get('date_de_naissance'):
            s = s.search_birthdate(
                datetime.datetime.strptime(

View File

@@ -182,7 +182,7 @@ class PersonSearch(object):
            yield application, value['name']
        return list(helper())

-    def __init__(self, limit=0.5, base_limit=0.3):
+    def __init__(self, limit=0.5, base_limit=0.1):
        self.birthdates_filters = []
        self.filters = {}
        self.name_filters = []
@@ -355,6 +355,53 @@ class PersonSearch(object):
        return self.search_name(fullname)
    def search_individu(self, individu, factor=1.0, first_name_weight=1.0, last_name_weight=1.0):
        self = copy.deepcopy(self)
        prenoms = individu.content['prenoms']
        nom_de_naissance = individu.content['nom_de_naissance']
        nom_d_usage = individu.content['nom_d_usage']
        # Create the simple filter
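        # each expression scores 1 - trigram distance between a normalized
        # stored field and the normalized query value; the nn_nu/nu_nn cross
        # pairs let a birth name match a stored usage name and vice versa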
        prenoms_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'prenoms')), self.luv(prenoms))
        nn_nn_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'nom_de_naissance')), self.luv(nom_de_naissance))
        nu_nu_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'nom_d_usage')), self.luv(nom_d_usage))
        nn_nu_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'nom_de_naissance')), self.luv(nom_d_usage))
        nu_nn_expression = Value(1.0) - TrigramDistance(Normalize(
            JSONTextRef(F('content'), 'nom_d_usage')), self.luv(nom_de_naissance))
        q_noms = []
        if nom_de_naissance:
            q_noms.append(self.q_normalize('nom_de_naissance', nom_de_naissance))
            self.name_similarities.append(
                (first_name_weight * prenoms_expression
                 + last_name_weight * nn_nn_expression)
                / Value(first_name_weight + last_name_weight))
            self.name_similarities.append(
                (first_name_weight * prenoms_expression
                 + last_name_weight * nu_nn_expression)
                / Value(first_name_weight + last_name_weight))
            q_noms.append(self.q_normalize('nom_d_usage', nom_de_naissance))
        if nom_d_usage:
            q_noms.append(self.q_normalize('nom_d_usage', nom_d_usage))
            q_noms.append(self.q_normalize('nom_de_naissance', nom_d_usage))
            self.name_similarities.append(
                (first_name_weight * prenoms_expression
                 + last_name_weight * nu_nu_expression)
                / Value(first_name_weight + last_name_weight))
            self.name_similarities.append(
                (first_name_weight * prenoms_expression
                 + last_name_weight * nn_nu_expression)
                / Value(first_name_weight + last_name_weight))
        q = self.q_normalize('prenoms', prenoms) & reduce(operator.__or__, q_noms)
        self.add_filter('name', q)
        # Compute similarity score
        return self
    def copy(self):
        return copy.deepcopy(self)