Merge remote-tracking branch 'origin' into nanterre-recette

Thomas NOËL 2018-02-02 18:04:38 +01:00
commit bf52ff175b
7 changed files with 230 additions and 88 deletions

View File

@@ -21,7 +21,7 @@ from zoo.zoo_nanterre.utils import (PersonSearch, adresse as get_individu_adress
def test_person_search(db, rsu):
search = PersonSearch()
found = list(search.search_name(rsu[0].content['prenoms'], rsu[0].content['nom_de_naissance']))
found = list(search.search_name(rsu[0].content['prenoms'] + ' ' + rsu[0].content['nom_de_naissance']))
assert rsu[0].id == found[0].id
assert found[0].similarity == 1.0
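Editorial note, not part of the diff: the test now feeds search_name() a single free-form string, matching the new signature introduced in the PersonSearch changes further down. A minimal usage sketch, with a hypothetical query string:
from zoo.zoo_nanterre.utils import PersonSearch
search = PersonSearch()
# search_name() returns a new PersonSearch; iterating it runs the query and
# yields entities annotated with a similarity score
results = list(search.search_name(u'benjamin dauvergne'))  # hypothetical name
if results:
    print(results[0].similarity)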

View File

@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
import csv
import json
import pytest
from zoo.zoo_data.models import Entity, Transaction
# These tests cannot be run publicly because they require personal data to be
# meaningful. To generate that data, run in psql:
#
# \copy (select e.content->>'genre',
# e.content->>'statut_legal',
# e.content->>'prenoms',
# e.content->>'nom_de_naissance',
# e.content->>'nom_d_usage',
# e.content->>'date_de_naissance'
# from zoo_data_entity as e, zoo_meta_entityschema as s
# where e.schema_id = s.id and s.slug = 'individu') to '/tmp/export-etat-civil.csv' with csv;
#
# then copy the export-etat-civil.csv file into the tests/ directory, and write
# a basic-searches.csv file in that same directory with the following format:
# * 1st column: the query, e.g. "benjamin dauvergne"
# * 2nd column: a positive integer, n, meaning that the expected answer must
#   appear among the first n results,
# * 3rd column: a JSON dictionary giving the attributes used to spot the entry
#   we are looking for, e.g. (JSON escaped for the CSV format):
#   "{""nom"": ""DAUVERGNE"", ""prenoms"": ""BENJAMIN""}"
#
# When these files are missing, the tests are skipped.
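# For illustration only (hypothetical values, not part of the commit), a full
# basic-searches.csv line could read:
#
#   "benjamin dauvergne",3,"{""nom"": ""DAUVERGNE"", ""prenoms"": ""BENJAMIN""}"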
@pytest.fixture
def lot_of_names(rsu_schema):
try:
reader = csv.reader(open('tests/export-etat-civil.csv'))
except IOError:
pytest.skip()
def generate():
tr = Transaction.objects.create()
for genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage, date_de_naissance in reader:
yield Entity(
schema=rsu_schema['individu'],
created=tr,
content={
'genre': genre,
'prenoms': prenoms,
'nom_de_naissance': nom_de_naissance,
'nom_d_usage': nom_d_usage,
'date_de_naissance': date_de_naissance,
'telephones': [],
'statut_legal': statut_legal,
'email': '',
'cles_de_federation': {},
})
Entity.objects.bulk_create(generate())
@pytest.fixture
def basic_searches():
try:
reader = csv.reader(open('tests/basic-searches.csv'))
except IOError:
pytest.skip()
def generate():
for query, window, checks in reader:
window = int(window)
checks = json.loads(checks)
yield query, window, checks
return list(generate())
def check(record, checks):
for key, value in checks.iteritems():
if record.get(key) != value:
return False
return True
def test_basic(app, basic_searches, lot_of_names):
for query, window, checks in basic_searches:
response = app.get('/rsu/search/', params={'q': query, 'limit': 15})
data = response.json['data'][:window]
texts = [(row['score'], row['text']) for row in response.json['data']]
# check that the answer we are looking for is indeed among the first
# `window` responses.
assert any(record for record in data if check(record, checks)), (
'query %r does not match criteria %r in the first %d records' % (query, checks, window))

View File

@@ -53,3 +53,8 @@ class JSONTextRef(Func):
jsonb = JSONRef(jsonb, *expressions[1:-1])
ref = Value(expressions[-1])
super(JSONTextRef, self).__init__(jsonb, ref, **extra)
class TextCat(Func):
function = ''
arg_joiner = ' || '
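A side note, not part of the diff: TextCat relies on Django's default Func template '%(function)s(%(expressions)s)'; with an empty function name and ' || ' as arg_joiner it compiles to a plain parenthesized SQL concatenation. A minimal sketch of how the zoo_nanterre changes below combine it with JSONTextRef and Normalize (the diff additionally wraps the result in ExpressionWrapper(..., output_field=CharField())):
from django.db.models import F, Value
from zoo.zoo_data.search import JSONTextRef, Normalize, TextCat
# compiles to something like:
#   immutable_normalize((content->>'prenoms') || ' ' || (content->>'nom_de_naissance'))
fullname_naissance = Normalize(TextCat(
    JSONTextRef(F('content'), 'prenoms'),
    Value(' '),
    JSONTextRef(F('content'), 'nom_de_naissance')))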

View File

@@ -15,9 +15,25 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from django.utils.translation import ugettext_lazy as _
from django.db.models.signals import post_migrate, post_save
from django.apps import AppConfig
class ZooMetaAppConfig(AppConfig):
name = 'zoo.zoo_meta'
verbose_name = _('metadatas')
def post_migrate(self, **kwargs):
from .models import EntitySchema
for schema in EntitySchema.objects.all():
schema.rebuild_indexes()
def post_save(self, sender, instance, **kwargs):
instance.rebuild_indexes()
def ready(self):
from .models import EntitySchema
post_migrate.connect(self.post_migrate)
post_save.connect(self.post_save, sender=EntitySchema)

View File

@@ -16,6 +16,7 @@
from hashlib import md5
from django.apps import apps
from django.db import models, connection
from django.utils.translation import ugettext_lazy as _
from django.contrib.postgres.fields import JSONField
@@ -87,6 +88,25 @@ class CommonSchema(models.Model):
'WHERE schema_id = %s' % (key, self.id, table, expr, self.id))
cursor.execute(sql)
def create_trigram_index(self, expr):
from zoo.zoo_data.models import Entity, Relation
if isinstance(self, EntitySchema):
table = Entity._meta.db_table
elif isinstance(self, RelationSchema):
table = Relation._meta.db_table
else:
raise NotImplementedError(self)
key = md5(expr).hexdigest()[:8]
gin_sql = ('CREATE INDEX zoo_entity_%s_gin_%s_dynamic_idx ON %s USING gin ((%s) '
'gin_trgm_ops) WHERE schema_id = %s' % (key, self.id, table, expr, self.id))
gist_sql = ('CREATE INDEX zoo_entity_%s_gist_%s_dynamic_idx ON %s USING gist ((%s)'
' gist_trgm_ops) WHERE schema_id = %s' % (key, self.id, table, expr, self.id))
with connection.cursor() as cursor:
cursor.execute(gin_sql)
cursor.execute(gist_sql)
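# Illustration, not part of the commit: with hypothetical values
# key='0a1b2c3d', self.id=42, table='zoo_data_entity' and
# expr="immutable_normalize(content->>'prenoms')", gin_sql reads:
#
#   CREATE INDEX zoo_entity_0a1b2c3d_gin_42_dynamic_idx ON zoo_data_entity
#   USING gin ((immutable_normalize(content->>'prenoms')) gin_trgm_ops)
#   WHERE schema_id = 42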
def rebuild_indexes(self):
from zoo.zoo_data.models import Entity
@@ -114,11 +134,12 @@ class CommonSchema(models.Model):
if m is not None:
m(cursor, table, path)
objects = GetBySlugManager()
# delegate index building to custom applications
for app in apps.get_app_configs():
if hasattr(app, 'zoo_rebuild_indexes'):
app.zoo_rebuild_indexes(self)
def save(self, *args, **kwargs):
super(CommonSchema, self).save(*args, **kwargs)
self.rebuild_indexes()
objects = GetBySlugManager()
def make_caption(self, value):
if self.caption_template:

View File

@@ -104,3 +104,14 @@ class ZooNanterreConfig(AppConfig):
def ready(self):
post_migrate.connect(self.post_migrate)
def zoo_rebuild_indexes(self, schema):
from .utils import INDIVIDU_ENT
if schema.slug != INDIVIDU_ENT:
return
schema.create_trigram_index(
"immutable_normalize(content->>'prenoms' || ' ' || (content->>'nom_de_naissance'))")
schema.create_trigram_index(
"immutable_normalize(content->>'prenoms' || ' ' || (content->>'nom_d_usage'))")

View File

@@ -36,7 +36,7 @@ import psycopg2
from django.conf import settings
from django.contrib.postgres.search import TrigramDistance
from django.db import connection
from django.db.models import Q, F, Value
from django.db.models import Q, F, Value, ExpressionWrapper, CharField
from django.db.models.functions import Least, Greatest, Coalesce, Concat
from django.db import transaction
from django.utils.timezone import now, make_aware
@@ -44,7 +44,7 @@ from django.contrib.auth.hashers import make_password
from zoo.zoo_meta.models import EntitySchema, RelationSchema
from zoo.zoo_data.models import Entity, Relation, Transaction, Log
from zoo.zoo_data.search import JSONTextRef, Normalize
from zoo.zoo_data.search import JSONTextRef, Normalize, TextCat
today = datetime.date.today
@@ -184,18 +184,20 @@ class PersonSearch(object):
def __init__(self, limit=0.5, base_limit=0.3):
self.birthdates_filters = []
self.filters = {}
self.name_filters = []
self.name_similarities = []
self.email_similarities = []
self.key_filters = []
self.email_filters = []
self.statut_legal_filter = None
self.schema = EntitySchema.objects.get(slug=INDIVIDU_ENT)
self.limit = limit
self.base_limit = base_limit
self.annotations = []
def add_filter(self, name, filter_expression):
self.filters.setdefault(name, []).append(filter_expression)
def search_statut_legal(self, statut_legal):
self.statut_legal_filter = statut_legal
self.add_filter('statut_legal', Q(content__statut_legal=statut_legal))
def search_query(self, query):
'''Take a one line query and try to build a search filter from it'''
@@ -230,8 +232,7 @@ class PersonSearch(object):
def search_email(self, email):
self = copy.deepcopy(self)
f = self.q_normalize('email', email)
self.email_filters.append(f)
self.add_filter('email', self.q_normalize('email', email))
self.email_similarities.append(Value(1.0) - self.distance('email', email))
return self
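Editorial note, not part of the diff: add_filter() groups Q objects under a logical name; when the queryset is built (see the queryset-building hunk further down), filters sharing a name are OR'ed together by or_filters() and the resulting groups are AND'ed by successive .filter() calls. A small sketch with hypothetical values:
import datetime
from zoo.zoo_nanterre.utils import PersonSearch
search = PersonSearch()
search = search.search_email('jane@example.com')             # hypothetical address
search = search.search_birthdate(datetime.date(1980, 5, 1))  # hypothetical date
# -> (email filter) AND (birthdate filter), each group OR'ed internally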
@@ -254,48 +255,48 @@ class PersonSearch(object):
for key, name in self.applications():
filters.append(Q(**{'content__cles_de_federation__%s' % key: identifier}))
q = functools.reduce(Q.__or__, filters)
self.key_filters.append(q)
self.add_filter('key', q)
return self
def search_birthdate(self, birthdate, window_days=0):
self = copy.deepcopy(self)
if hasattr(birthdate, 'keys'):
# case of dict
pass
elif hasattr(birthdate, 'year'):
if hasattr(birthdate, 'year'):
if hasattr(birthdate, 'date'):
birthdate = birthdate.date()
# fast path for date / datetime
if window_days:
before = birthdate - datetime.timedelta(days=window_days)
after = birthdate + datetime.timedelta(days=window_days)
self.birthdates_filters.append(
Q(content__date_de_naissance__timestamp__gte=before)
& Q(content__date_de_naissance__timestamp__lte=after)
)
else:
self.birthdates_filters.append(
Q(content__date_de_naissance__timestamp=birthdate)
)
return self
before = birthdate
after = birthdate
else:
# case of strings
birthdate = self.match_birthdate(birthdate).groupdict()
# case of string
if not hasattr(birthdate, 'keys'):
birthdate = self.match_birthdate(birthdate).groupdict()
this_year = datetime.date.today().year % 100
year = int(birthdate['year'])
if year < 100:
if year > this_year:
year += 1900
this_year = datetime.date.today().year % 100
year = int(birthdate['year'])
if year < 100:
if year > this_year:
year += 1900
else:
year += 2000
if len(birthdate) == 3:
before = after = datetime.date(
int(birthdate['year']), int(birthdate['month']), int(birthdate['day']))
elif len(birthdate) == 2:
after = datetime.date(int(birthdate['year']), int(birthdate['month']), 1)
before = ((after + datetime.timedelta(days=31)).replace(day=1)
- datetime.timedelta(days=1))
else:
year += 2000
q = Q(content__date_de_naissance__timestamp__year=year)
if birthdate['month']:
q &= Q(content__date_de_naissance__timestamp__month=birthdate['month'])
if birthdate['day']:
q &= Q(content__date_de_naissance__timestamp__day=birthdate['day'])
self.birthdates_filters.append(q)
after = datetime.date(int(birthdate['year']), 1, 1)
before = datetime.date(int(birthdate['year']), 12, 31)
if window_days:
after -= datetime.timedelta(days=window_days)
before += datetime.timedelta(days=window_days)
query = Q(content__date_de_naissance__timestamp__gte=after)
query &= Q(content__date_de_naissance__timestamp__lte=before)
self.add_filter('birthdate', query)
return self
@classmethod
@@ -306,48 +307,45 @@ class PersonSearch(object):
def q_normalize(self, field, value):
return Q(**{'content__%s__normalize__trigram_similar' % field: self.luv(value)})
def search_name(self, first_name=None, last_name=None, factor=1.0, first_name_weight=1.0,
last_name_weight=1.0):
def search_name(self, fullname, factor=1.0, first_name_weight=1.0, last_name_weight=1.0):
self = copy.deepcopy(self)
q = Q()
if not first_name or not last_name:
factor *= 0.8
if last_name:
q &= (self.q_normalize('nom_d_usage', last_name)
| self.q_normalize('nom_de_naissance', last_name))
if first_name:
q &= self.q_normalize('prenoms', first_name)
self.name_filters.append(q)
fullname_naissance = ExpressionWrapper(
Normalize(TextCat(
JSONTextRef(F('content'), 'prenoms'),
Value(' '),
JSONTextRef(F('content'), 'nom_de_naissance'))),
output_field=CharField())
fullname_usage = ExpressionWrapper(
Normalize(TextCat(
JSONTextRef(F('content'), 'prenoms'),
Value(' '),
JSONTextRef(F('content'), 'nom_d_usage'))),
output_field=CharField())
fname_d = self.distance('prenoms', first_name)
name_of_use_d = self.distance('nom_d_usage', last_name)
name_of_birth_d = self.distance('nom_de_naissance', last_name)
self.annotations.append(('fullname_naissance', fullname_naissance))
self.annotations.append(('fullname_usage', fullname_usage))
if first_name and last_name:
similarity = Value(first_name_weight) * fname_d
similarity += Value(last_name_weight) * Least(name_of_use_d, name_of_birth_d)
similarity /= Value(first_name_weight + last_name_weight)
elif first_name:
similarity = fname_d
else:
similarity = Least(name_of_use_d, name_of_birth_d)
similarity = (Value(1.0) - similarity) * Value(factor)
self.name_similarities.append(similarity)
# Create the simple filter
q = (Q(fullname_naissance__trigram_similar=self.luv(fullname))
| Q(fullname_usage__trigram_similar=self.luv(fullname)))
self.add_filter('name', q)
# Compute similarity score
for expression in (fullname_naissance, fullname_usage):
self.name_similarities.append(
Value(1.0) - TrigramDistance(expression, self.luv(fullname)))
return self
def search_names(self, names):
if not names:
return self
fullname = u' '.join(names)
self = copy.deepcopy(self)
for i in range(0, len(names) + 1):
first_name, last_name = ' '.join(names[:i]), ' '.join(names[i:])
self = self.search_name(first_name, last_name)
if len(names) > 1:
self = self.search_name(last_name, first_name, factor=0.8)
return self
return self.search_name(fullname)
def copy(self):
return copy.deepcopy(self)
@@ -432,20 +430,17 @@ class PersonSearch(object):
qs = Entity.objects.filter(schema=self.schema)
qs = qs.filter(
self.or_filters(
self.birthdates_filters))
qs = qs.filter(
self.or_filters(self.key_filters))
qs = qs.filter(
self.or_filters(self.email_filters))
qs = qs.filter(
self.or_filters(self.name_filters))
if self.statut_legal_filter:
qs = qs.filter(content__statut_legal=self.statut_legal_filter)
for key, annotation in self.annotations:
qs = qs.annotate(**{key: annotation})
# search filter upon name, id, key, email, birthdate and statut legal
for key in self.filters:
qs = qs.filter(
self.or_filters(
self.filters[key]))
qs = qs.annotate(
fullname=Concat(
fullname2=Concat(
Coalesce(
JSONTextRef(F('content'), 'nom_d_usage'),
JSONTextRef(F('content'), 'nom_de_naissance'),
@@ -472,9 +467,9 @@ class PersonSearch(object):
qs = qs.annotate(similarity=functools.reduce(operator.__add__, similarities) /
Value(len(similarities)))
qs = qs.filter(similarity__gte=self.limit)
qs = qs.order_by('-similarity', 'fullname')
qs = qs.order_by('-similarity', 'fullname2')
else:
qs = qs.order_by('fullname')
qs = qs.order_by('fullname2')
if prefetch:
qs = qs.prefetch_related(