Merge remote-tracking branch 'origin' into nanterre-recette
This commit is contained in:
commit
bf52ff175b
|
@ -21,7 +21,7 @@ from zoo.zoo_nanterre.utils import (PersonSearch, adresse as get_individu_adress
|
|||
def test_person_search(db, rsu):
|
||||
search = PersonSearch()
|
||||
|
||||
found = list(search.search_name(rsu[0].content['prenoms'], rsu[0].content['nom_de_naissance']))
|
||||
found = list(search.search_name(rsu[0].content['prenoms'] + ' ' + rsu[0].content['nom_de_naissance']))
|
||||
assert rsu[0].id == found[0].id
|
||||
assert found[0].similarity == 1.0
|
||||
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import csv
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from zoo.zoo_data.models import Entity, Transaction
|
||||
|
||||
# Ces tests ne sont pas exécutables publiquement car ils nécessitent des
|
||||
# données personnelles pour être efficaces:
|
||||
# Pour générer ces données faire dans psql:
|
||||
#
|
||||
# \copy (select e.content->>'genre',
|
||||
# e.content->>'statut_legal',
|
||||
# e.content->>'prenoms',
|
||||
# e.content->>'nom_de_naissance',
|
||||
# e.content->>'nom_d_usage',
|
||||
# e.content->>'date_de_naissance'
|
||||
# from zoo_data_entity as e, zoo_meta_entityschema as s
|
||||
# where e.schema_id = s.id and s.slug = 'individu') to '/tmp/export-etat-civil.csv' with csv;
|
||||
#
|
||||
# et copier le fichier export-etat-civil.csv dans le répertoire tests/, ensuite
|
||||
# écrire un fichier basic-searches.csv dans ce même répertoire avec le format
|
||||
# suivant:
|
||||
# * 1ère colonne: la requête, ex.: "benjamin dauvergne"
|
||||
# * 2ème colonne: un entier positif, n, indiquant que la réponse doit se trouver dans les n premiers résultats,
|
||||
# * 3ème colonne: un dictionnaire au format JSON indiquant les attributs pour
|
||||
# repérer l'entrée que l'on recherche, ex.: (JSON adapté pour le format CSV)
|
||||
# "{""nom"": ""DAUVERGNE"", ""prenoms"": ""BENJAMIN""}"
|
||||
#
|
||||
# En l'absence de ces fichiers les tests sont ignorés.
|
||||
|
||||
|
||||
@pytest.fixture
def lot_of_names(rsu_schema):
    """Load the private civil-status export into the database.

    Reads ``tests/export-etat-civil.csv`` (six columns per row: genre,
    statut_legal, prenoms, nom_de_naissance, nom_d_usage,
    date_de_naissance) and bulk-creates one ``individu`` entity per row, all
    attached to a single new transaction.  The requesting test is skipped
    when the file cannot be opened.
    """
    try:
        csv_file = open('tests/export-etat-civil.csv')
    except IOError:
        pytest.skip('tests/export-etat-civil.csv is not available')

    def generate(reader):
        # one transaction shared by every imported entity
        tr = Transaction.objects.create()
        for genre, statut_legal, prenoms, nom_de_naissance, nom_d_usage, date_de_naissance in reader:
            yield Entity(
                schema=rsu_schema['individu'],
                created=tr,
                content={
                    'genre': genre,
                    'prenoms': prenoms,
                    'nom_de_naissance': nom_de_naissance,
                    'nom_d_usage': nom_d_usage,
                    'date_de_naissance': date_de_naissance,
                    'telephones': [],
                    'statut_legal': statut_legal,
                    'email': '',
                    'cles_de_federation': {},
                })

    # The rows are read lazily by the generator, so the file must stay open
    # until bulk_create() returns; the `with` block then closes it.
    with csv_file:
        Entity.objects.bulk_create(generate(csv.reader(csv_file)))
|
||||
|
||||
|
||||
@pytest.fixture
def basic_searches():
    """Return the search scenarios listed in ``tests/basic-searches.csv``.

    Each CSV row has three columns: the query string, an integer window and
    a JSON object of expected attributes.  The fixture returns a list of
    ``(query, window, checks)`` tuples with the window converted to ``int``
    and the JSON decoded.  The requesting test is skipped when the file
    cannot be opened.
    """
    try:
        csv_file = open('tests/basic-searches.csv')
    except IOError:
        pytest.skip('tests/basic-searches.csv is not available')

    # Parse every row while the file is open, then let `with` close it.
    with csv_file:
        return [
            (query, int(window), json.loads(checks))
            for query, window, checks in csv.reader(csv_file)
        ]
|
||||
|
||||
|
||||
def check(record, checks):
    """Tell whether `record` carries every key/value pair of `checks`.

    `record` only needs a ``get()`` method; a key missing from `record` is
    compared as ``None``.  An empty `checks` mapping always matches.
    """
    # items() instead of iteritems() so the helper runs on Python 2 and 3.
    return not any(record.get(key) != value for key, value in checks.items())
|
||||
|
||||
|
||||
def test_basic(app, basic_searches, lot_of_names):
    """Run every scenario of `basic_searches` against ``/rsu/search/``.

    For each ``(query, window, checks)`` scenario the expected record, as
    recognised by ``check()``, must appear among the first `window` results.
    """
    for query, window, checks in basic_searches:
        response = app.get('/rsu/search/', params={'q': query, 'limit': 15})
        data = response.json['data'][:window]
        # Test the predicate itself rather than the truthiness of the
        # matching record: the expected answer must be within the first
        # `window` results.
        assert any(check(record, checks) for record in data), (
            'query %r does not match criteria %r in first %d records' % (query, checks, window))
|
|
@ -53,3 +53,8 @@ class JSONTextRef(Func):
|
|||
jsonb = JSONRef(jsonb, *expressions[1:-1])
|
||||
ref = Value(expressions[-1])
|
||||
super(JSONTextRef, self).__init__(jsonb, ref, **extra)
|
||||
|
||||
|
||||
class TextCat(Func):
    """SQL expression joining its arguments with the ``||`` operator.

    No function name is emitted; the arguments are only separated by
    ``' || '``.
    """
    arg_joiner = ' || '
    function = ''
|
||||
|
|
|
@ -15,9 +15,25 @@
|
|||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.db.models.signals import post_migrate, post_save
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class ZooMetaAppConfig(AppConfig):
    """Application configuration of ``zoo.zoo_meta``.

    Connects signal handlers that rebuild schema indexes after migrations
    and whenever an ``EntitySchema`` is saved.
    """
    name = 'zoo.zoo_meta'
    verbose_name = _('metadatas')

    def ready(self):
        """Connect the ``post_migrate`` and ``post_save`` handlers."""
        # models are imported here, not at module level
        from .models import EntitySchema

        post_migrate.connect(self.post_migrate)
        post_save.connect(self.post_save, sender=EntitySchema)

    def post_save(self, sender, instance, **kwargs):
        """Rebuild the indexes of the schema that was just saved."""
        instance.rebuild_indexes()

    def post_migrate(self, **kwargs):
        """Rebuild the indexes of every existing ``EntitySchema``."""
        from .models import EntitySchema

        entity_schemas = EntitySchema.objects.all()
        for entity_schema in entity_schemas:
            entity_schema.rebuild_indexes()
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
from hashlib import md5
|
||||
|
||||
from django.apps import apps
|
||||
from django.db import models, connection
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.contrib.postgres.fields import JSONField
|
||||
|
@ -87,6 +88,25 @@ class CommonSchema(models.Model):
|
|||
'WHERE schema_id = %s' % (key, self.id, table, expr, self.id))
|
||||
cursor.execute(sql)
|
||||
|
||||
def create_trigram_index(self, expr):
    """Create GIN and GiST trigram indexes on `expr` for this schema's rows.

    The target table is the ``Entity`` table for an ``EntitySchema`` and the
    ``Relation`` table for a ``RelationSchema``; any other type raises
    ``NotImplementedError``.  Both indexes are partial, restricted to rows
    whose ``schema_id`` is this schema's id.

    `expr` is interpolated verbatim into the SQL statements.
    """
    from zoo.zoo_data.models import Entity, Relation

    if isinstance(self, EntitySchema):
        model = Entity
    elif isinstance(self, RelationSchema):
        model = Relation
    else:
        raise NotImplementedError(self)
    table = model._meta.db_table

    # short digest of the expression, part of the index names
    key = md5(expr).hexdigest()[:8]
    params = (key, self.id, table, expr, self.id)
    statements = (
        'CREATE INDEX zoo_entity_%s_gin_%s_dynamic_idx ON %s USING gin ((%s) '
        'gin_trgm_ops) WHERE schema_id = %s' % params,
        'CREATE INDEX zoo_entity_%s_gist_%s_dynamic_idx ON %s USING gist ((%s)'
        ' gist_trgm_ops) WHERE schema_id = %s' % params,
    )
    with connection.cursor() as cursor:
        for statement in statements:
            cursor.execute(statement)
|
||||
|
||||
def rebuild_indexes(self):
|
||||
from zoo.zoo_data.models import Entity
|
||||
|
||||
|
@ -114,11 +134,12 @@ class CommonSchema(models.Model):
|
|||
if m is not None:
|
||||
m(cursor, table, path)
|
||||
|
||||
objects = GetBySlugManager()
|
||||
# delegate index building to custom applications
|
||||
for app in apps.get_app_configs():
|
||||
if hasattr(app, 'zoo_rebuild_indexes'):
|
||||
app.zoo_rebuild_indexes(self)
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
super(CommonSchema, self).save(*args, **kwargs)
|
||||
self.rebuild_indexes()
|
||||
objects = GetBySlugManager()
|
||||
|
||||
def make_caption(self, value):
|
||||
if self.caption_template:
|
||||
|
|
|
@ -104,3 +104,14 @@ class ZooNanterreConfig(AppConfig):
|
|||
|
||||
def ready(self):
|
||||
post_migrate.connect(self.post_migrate)
|
||||
|
||||
def zoo_rebuild_indexes(self, schema):
    """Create the full-name trigram indexes for the ``individu`` schema.

    Does nothing for any schema whose slug differs from ``INDIVIDU_ENT``.
    Otherwise one trigram index is requested for "first names + birth name"
    and one for "first names + name of use".
    """
    from .utils import INDIVIDU_ENT

    if schema.slug == INDIVIDU_ENT:
        expressions = (
            "immutable_normalize(content->>'prenoms' || ' ' || (content->>'nom_de_naissance'))",
            "immutable_normalize(content->>'prenoms' || ' ' || (content->>'nom_d_usage'))",
        )
        for expression in expressions:
            schema.create_trigram_index(expression)
|
||||
|
|
|
@ -36,7 +36,7 @@ import psycopg2
|
|||
from django.conf import settings
|
||||
from django.contrib.postgres.search import TrigramDistance
|
||||
from django.db import connection
|
||||
from django.db.models import Q, F, Value
|
||||
from django.db.models import Q, F, Value, ExpressionWrapper, CharField
|
||||
from django.db.models.functions import Least, Greatest, Coalesce, Concat
|
||||
from django.db import transaction
|
||||
from django.utils.timezone import now, make_aware
|
||||
|
@ -44,7 +44,7 @@ from django.contrib.auth.hashers import make_password
|
|||
|
||||
from zoo.zoo_meta.models import EntitySchema, RelationSchema
|
||||
from zoo.zoo_data.models import Entity, Relation, Transaction, Log
|
||||
from zoo.zoo_data.search import JSONTextRef, Normalize
|
||||
from zoo.zoo_data.search import JSONTextRef, Normalize, TextCat
|
||||
|
||||
today = datetime.date.today
|
||||
|
||||
|
@ -184,18 +184,20 @@ class PersonSearch(object):
|
|||
|
||||
def __init__(self, limit=0.5, base_limit=0.3):
|
||||
self.birthdates_filters = []
|
||||
self.filters = {}
|
||||
self.name_filters = []
|
||||
self.name_similarities = []
|
||||
self.email_similarities = []
|
||||
self.key_filters = []
|
||||
self.email_filters = []
|
||||
self.statut_legal_filter = None
|
||||
self.schema = EntitySchema.objects.get(slug=INDIVIDU_ENT)
|
||||
self.limit = limit
|
||||
self.base_limit = base_limit
|
||||
self.annotations = []
|
||||
|
||||
def add_filter(self, name, filter_expression):
    """Append `filter_expression` to the filters registered under `name`.

    The list for `name` is created in ``self.filters`` on first use.
    """
    bucket = self.filters.setdefault(name, [])
    bucket.append(filter_expression)
|
||||
|
||||
def search_statut_legal(self, statut_legal):
|
||||
self.statut_legal_filter = statut_legal
|
||||
self.add_filter('statut_legal', Q(content__statut_legal=statut_legal))
|
||||
|
||||
def search_query(self, query):
|
||||
'''Take a one line query and try to build a search filter from it'''
|
||||
|
@ -230,8 +232,7 @@ class PersonSearch(object):
|
|||
def search_email(self, email):
|
||||
self = copy.deepcopy(self)
|
||||
|
||||
f = self.q_normalize('email', email)
|
||||
self.email_filters.append(f)
|
||||
self.add_filter('email', self.q_normalize('email', email))
|
||||
self.email_similarities.append(Value(1.0) - self.distance('email', email))
|
||||
return self
|
||||
|
||||
|
@ -254,48 +255,48 @@ class PersonSearch(object):
|
|||
for key, name in self.applications():
|
||||
filters.append(Q(**{'content__cles_de_federation__%s' % key: identifier}))
|
||||
q = functools.reduce(Q.__or__, filters)
|
||||
self.key_filters.append(q)
|
||||
self.add_filter('key', q)
|
||||
return self
|
||||
|
||||
def search_birthdate(self, birthdate, window_days=0):
|
||||
self = copy.deepcopy(self)
|
||||
|
||||
if hasattr(birthdate, 'keys'):
|
||||
# case of dict
|
||||
pass
|
||||
elif hasattr(birthdate, 'year'):
|
||||
if hasattr(birthdate, 'year'):
|
||||
if hasattr(birthdate, 'date'):
|
||||
birthdate = birthdate.date()
|
||||
# fast path for date / datetime
|
||||
if window_days:
|
||||
before = birthdate - datetime.timedelta(days=window_days)
|
||||
after = birthdate + datetime.timedelta(days=window_days)
|
||||
self.birthdates_filters.append(
|
||||
Q(content__date_de_naissance__timestamp__gte=before)
|
||||
& Q(content__date_de_naissance__timestamp__lte=after)
|
||||
)
|
||||
else:
|
||||
self.birthdates_filters.append(
|
||||
Q(content__date_de_naissance__timestamp=birthdate)
|
||||
)
|
||||
return self
|
||||
before = birthdate
|
||||
after = birthdate
|
||||
else:
|
||||
# case of strings
|
||||
birthdate = self.match_birthdate(birthdate).groupdict()
|
||||
# case of string
|
||||
if not hasattr(birthdate, 'keys'):
|
||||
birthdate = self.match_birthdate(birthdate).groupdict()
|
||||
|
||||
this_year = datetime.date.today().year % 100
|
||||
year = int(birthdate['year'])
|
||||
if year < 100:
|
||||
if year > this_year:
|
||||
year += 1900
|
||||
this_year = datetime.date.today().year % 100
|
||||
year = int(birthdate['year'])
|
||||
if year < 100:
|
||||
if year > this_year:
|
||||
year += 1900
|
||||
else:
|
||||
year += 2000
|
||||
if len(birthdate) == 3:
|
||||
before = after = datetime.date(
|
||||
int(birthdate['year']), int(birthdate['month']), int(birthdate['day']))
|
||||
elif len(birthdate) == 2:
|
||||
after = datetime.date(int(birthdate['year']), int(birthdate['month']), 1)
|
||||
before = ((after + datetime.timedelta(days=31)).replace(day=1)
|
||||
- datetime.timedelta(days=1))
|
||||
else:
|
||||
year += 2000
|
||||
q = Q(content__date_de_naissance__timestamp__year=year)
|
||||
if birthdate['month']:
|
||||
q &= Q(content__date_de_naissance__timestamp__month=birthdate['month'])
|
||||
if birthdate['day']:
|
||||
q &= Q(content__date_de_naissance__timestamp__day=birthdate['day'])
|
||||
self.birthdates_filters.append(q)
|
||||
after = datetime.date(int(birthdate['year']), 1, 1)
|
||||
before = datetime.date(int(birthdate['year']), 12, 31)
|
||||
|
||||
if window_days:
|
||||
after -= datetime.timedelta(days=window_days)
|
||||
before += datetime.timedelta(days=window_days)
|
||||
|
||||
query = Q(content__date_de_naissance__timestamp__gte=after)
|
||||
query &= Q(content__date_de_naissance__timestamp__lte=before)
|
||||
self.add_filter('birthdate', query)
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
|
@ -306,48 +307,45 @@ class PersonSearch(object):
|
|||
def q_normalize(self, field, value):
    """Build a ``Q`` filter on the normalized trigram lookup of a field.

    The lookup targets ``content__<field>__normalize__trigram_similar`` and
    compares it with ``self.luv(value)``.
    """
    lookup = 'content__%s__normalize__trigram_similar' % field
    return Q(**{lookup: self.luv(value)})
|
||||
|
||||
def search_name(self, first_name=None, last_name=None, factor=1.0, first_name_weight=1.0,
|
||||
last_name_weight=1.0):
|
||||
def search_name(self, fullname, factor=1.0, first_name_weight=1.0, last_name_weight=1.0):
|
||||
self = copy.deepcopy(self)
|
||||
|
||||
q = Q()
|
||||
if not first_name or not last_name:
|
||||
factor *= 0.8
|
||||
if last_name:
|
||||
q &= (self.q_normalize('nom_d_usage', last_name)
|
||||
| self.q_normalize('nom_de_naissance', last_name))
|
||||
if first_name:
|
||||
q &= self.q_normalize('prenoms', first_name)
|
||||
self.name_filters.append(q)
|
||||
fullname_naissance = ExpressionWrapper(
|
||||
Normalize(TextCat(
|
||||
JSONTextRef(F('content'), 'prenoms'),
|
||||
Value(' '),
|
||||
JSONTextRef(F('content'), 'nom_de_naissance'))),
|
||||
output_field=CharField())
|
||||
fullname_usage = ExpressionWrapper(
|
||||
Normalize(TextCat(
|
||||
JSONTextRef(F('content'), 'prenoms'),
|
||||
Value(' '),
|
||||
JSONTextRef(F('content'), 'nom_d_usage'))),
|
||||
output_field=CharField())
|
||||
|
||||
fname_d = self.distance('prenoms', first_name)
|
||||
name_of_use_d = self.distance('nom_d_usage', last_name)
|
||||
name_of_birth_d = self.distance('nom_de_naissance', last_name)
|
||||
self.annotations.append(('fullname_naissance', fullname_naissance))
|
||||
self.annotations.append(('fullname_usage', fullname_usage))
|
||||
|
||||
if first_name and last_name:
|
||||
similarity = Value(first_name_weight) * fname_d
|
||||
similarity += Value(last_name_weight) * Least(name_of_use_d, name_of_birth_d)
|
||||
similarity /= Value(first_name_weight + last_name_weight)
|
||||
elif first_name:
|
||||
similarity = fname_d
|
||||
else:
|
||||
similarity = Least(name_of_use_d, name_of_birth_d)
|
||||
similarity = (Value(1.0) - similarity) * Value(factor)
|
||||
self.name_similarities.append(similarity)
|
||||
# Create the simple filter
|
||||
q = (Q(fullname_naissance__trigram_similar=self.luv(fullname))
|
||||
| Q(fullname_usage__trigram_similar=self.luv(fullname)))
|
||||
self.add_filter('name', q)
|
||||
|
||||
# Compute similarity score
|
||||
for expression in (fullname_naissance, fullname_usage):
|
||||
self.name_similarities.append(
|
||||
Value(1.0) - TrigramDistance(expression, self.luv(fullname)))
|
||||
return self
|
||||
|
||||
def search_names(self, names):
|
||||
if not names:
|
||||
return self
|
||||
|
||||
fullname = u' '.join(names)
|
||||
|
||||
self = copy.deepcopy(self)
|
||||
|
||||
for i in range(0, len(names) + 1):
|
||||
first_name, last_name = ' '.join(names[:i]), ' '.join(names[i:])
|
||||
self = self.search_name(first_name, last_name)
|
||||
if len(names) > 1:
|
||||
self = self.search_name(last_name, first_name, factor=0.8)
|
||||
return self
|
||||
return self.search_name(fullname)
|
||||
|
||||
def copy(self):
    """Return a deep copy of this search object."""
    clone = copy.deepcopy(self)
    return clone
|
||||
|
@ -432,20 +430,17 @@ class PersonSearch(object):
|
|||
|
||||
qs = Entity.objects.filter(schema=self.schema)
|
||||
|
||||
qs = qs.filter(
|
||||
self.or_filters(
|
||||
self.birthdates_filters))
|
||||
qs = qs.filter(
|
||||
self.or_filters(self.key_filters))
|
||||
qs = qs.filter(
|
||||
self.or_filters(self.email_filters))
|
||||
qs = qs.filter(
|
||||
self.or_filters(self.name_filters))
|
||||
if self.statut_legal_filter:
|
||||
qs = qs.filter(content__statut_legal=self.statut_legal_filter)
|
||||
for key, annotation in self.annotations:
|
||||
qs = qs.annotate(**{key: annotation})
|
||||
|
||||
# search filter upon name, id, key, email, birthdate and statut legal
|
||||
for key in self.filters:
|
||||
qs = qs.filter(
|
||||
self.or_filters(
|
||||
self.filters[key]))
|
||||
|
||||
qs = qs.annotate(
|
||||
fullname=Concat(
|
||||
fullname2=Concat(
|
||||
Coalesce(
|
||||
JSONTextRef(F('content'), 'nom_d_usage'),
|
||||
JSONTextRef(F('content'), 'nom_de_naissance'),
|
||||
|
@ -472,9 +467,9 @@ class PersonSearch(object):
|
|||
qs = qs.annotate(similarity=functools.reduce(operator.__add__, similarities) /
|
||||
Value(len(similarities)))
|
||||
qs = qs.filter(similarity__gte=self.limit)
|
||||
qs = qs.order_by('-similarity', 'fullname')
|
||||
qs = qs.order_by('-similarity', 'fullname2')
|
||||
else:
|
||||
qs = qs.order_by('fullname')
|
||||
qs = qs.order_by('fullname2')
|
||||
|
||||
if prefetch:
|
||||
qs = qs.prefetch_related(
|
||||
|
|
Loading…
Reference in New Issue