172 lines
7.1 KiB
Python
172 lines
7.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
# zoo - versatile objects management
|
|
#
|
|
# Copyright (C) 2016 Entr'ouvert
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify it
|
|
# under the terms of the GNU Affero General Public License as published
|
|
# by the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU Affero General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
from decimal import Decimal
|
|
import datetime
|
|
|
|
import django
|
|
from django.core.management.base import BaseCommand, CommandParser
|
|
from django.utils.timezone import now
|
|
|
|
from zoo.zoo_nanterre.utils import individu_caption
|
|
from zoo.zoo_nanterre.duplicates import find_duplicates
|
|
from zoo.zoo_nanterre.models import Duplicate
|
|
|
|
|
|
class Table(object):
    """Accumulate rows of values and render them as a pipe-delimited text table.

    Every column is right-aligned and padded to the widest of its header
    name and its cell values.
    """

    def __init__(self, names):
        """Create an empty table.

        :param names: sequence of column header names; its length fixes the
            number of columns.
        """
        self.size = len(names)
        self.names = names
        self.rows = []
        self.widths = [0] * self.size

    def add_row(self, *args):
        """Append one row, given as one positional argument per column."""
        self.rows.append(args)

    def add_rows(self, iterator):
        """Append every row of ``iterator``; each row is an iterable of cells."""
        for row in iterator:
            # Unpack the row so that each cell lands in its own column
            # instead of the whole row being stored as a single cell.
            self.add_row(*row)

    def computesize(self):
        """Recompute ``self.widths`` from the header names and all the rows."""
        widths = [len('%s' % (name,)) for name in self.names]
        for row in self.rows:
            # Cells beyond the declared number of columns are ignored, as
            # they are when the row is rendered.
            for i, col in enumerate(row[:self.size]):
                widths[i] = max(widths[i], len('%s' % (col,)))
        self.widths = widths

    def _format_line(self, cells):
        """Return one rendered line (newline included) for ``cells``."""
        return '|' + ''.join(
            ' %*s |' % (width, cell) for width, cell in zip(self.widths, cells)) + '\n'

    def __str__(self):
        """Return the header line followed by one line per row."""
        self.computesize()
        lines = [self._format_line(self.names)]
        lines.extend(self._format_line(row) for row in self.rows)
        return ''.join(lines)
|
|
|
|
|
|
class Command(BaseCommand):
    """Management command to find, delete and list ``Duplicate`` records.

    Sub-commands:

    * ``find``: run ``find_duplicates`` and report its progression;
    * ``delete``: delete ``Duplicate`` rows, optionally only those whose
      score is below ``--limit``;
    * ``list``: print the ``Duplicate`` rows of one state (new by default,
      false positives with ``--false-positive``, deduplicated with
      ``--dedup``).
    """

    def add_arguments(self, parser):
        """Declare the ``find``, ``delete`` and ``list`` sub-commands on ``parser``."""
        cmd = self

        if django.VERSION < (2, 1):
            # https://stackoverflow.com/questions/36706220/is-it-possible-to-create-subparsers-in-a-django-management-command
            # This sub-parser class passes the command as first constructor
            # argument, which argparse does not do by itself.
            class SubParser(CommandParser):
                def __init__(self, **kwargs):
                    super(SubParser, self).__init__(cmd, **kwargs)

            subparser = parser.add_subparsers(dest='command', help='commands',
                                              parser_class=SubParser)
        else:
            subparser = parser.add_subparsers(title='subcommands', dest='command', required=True)

        find_parser = subparser.add_parser('find', help='find duplicates')
        find_parser.add_argument('--count', type=int, help='search last count persons created',
                                 default=None)
        find_parser.add_argument('--limit', type=float, help='similarity level', default=None)
        find_parser.add_argument('--days', type=int,
                                 help='limit search to tcreated or update in the last days',
                                 default=None)
        find_parser.add_argument('--ids', type=int,
                                 help='limit search to theses RSU ids',
                                 action='append',
                                 default=[])

        delete_parser = subparser.add_parser('delete', help='delete non false-positive duplicates')
        delete_parser.add_argument('--limit', type=float, help='similarity level')

        list_parser = subparser.add_parser('list', help='list duplicates')
        # The value is used as a slice bound on the queryset, so it must be an
        # integer (it was declared as a float "similarity level").
        list_parser.add_argument('--count', type=int,
                                 help='limit listing to the last count duplicates',
                                 default=None)
        list_parser.add_argument('--days', type=float,
                                 help='limit search to duplicate created in the last days',
                                 default=None)
        list_parser.add_argument('--false-positive', action='store_true', default=False,
                                 help='show false positive')
        list_parser.add_argument('--dedup', action='store_true', default=False,
                                 help='show deduplicated')

    def handle(self, verbosity, command=None, ids=None, count=None, limit=None, days=None,
               false=False, dedup=False, *args, **options):
        """Dispatch to the selected sub-command.

        :param verbosity: verbosity level; progression is shown above 1 and
            the summary above 0.
        :param command: ``'find'``, ``'delete'`` or ``'list'``; any other
            value does nothing.
        :param ids: ids forwarded to ``find_duplicates`` (``find`` only).
        :param count: forwarded to ``find_duplicates`` for ``find``; maximum
            number of rows shown for ``list``.
        :param limit: forwarded to ``find_duplicates`` for ``find``; score
            upper bound (exclusive) for ``delete``.
        :param days: forwarded to ``find_duplicates`` for ``find``; age limit
            in days on the creation date for ``list``.
        :param false: list false positives; also set by ``--false-positive``.
        :param dedup: list deduplicated rows (ignored when ``false`` is set).
        """
        if ids is None:
            ids = []
        # argparse stores --false-positive under the "false_positive" key, so
        # it arrives in **options and not in the ``false`` parameter.
        false = false or options.get('false_positive', False)

        if command == 'find':
            self._find(verbosity, ids=ids, count=count, limit=limit, days=days)
        elif command == 'delete':
            self._delete(limit=limit)
        elif command == 'list':
            self._list(count=count, days=days, false=false, dedup=dedup)

    def _find(self, verbosity, ids, count, limit, days):
        """Run ``find_duplicates`` and report progression and the final count."""
        # Stays None when find_duplicates() yields nothing, so the summary
        # below does not fail on an unbound name.
        t = None
        for t in find_duplicates(count=count, limit=limit, days=days, ids=ids,
                                 progression=True):
            if verbosity > 1:
                # ending='' keeps the cursor on the same line so that the
                # trailing carriage return makes the next write overwrite it.
                self.stdout.write(
                    'New duplicates / persons scanned / persons total :'
                    ' %05d / %05d / %05d\r' % t, ending='')
                self.stdout.flush()
        if verbosity > 1:
            # Terminate the progression line (a bare ``print`` is a no-op
            # expression on Python 3).
            self.stdout.write('')
        if verbosity > 0 and t is not None and t[0]:
            self.stdout.write('Found %d new duplicates.' % t[0])

    def _delete(self, limit):
        """Delete ``Duplicate`` rows, only those scoring below ``limit`` if given."""
        # NOTE(review): the sub-command help says "delete non false-positive
        # duplicates" but no filter on the state is applied here - confirm
        # which one is intended.
        qs = Duplicate.objects.all()
        # Compare to None so that "--limit 0" is not read as "no limit",
        # which would delete every row.
        if limit is not None:
            qs = qs.filter(score__lt=limit)
        qs.delete()

    def _list(self, count, days, false, dedup):
        """Print the ``Duplicate`` rows of the selected state, newest first."""
        qs = Duplicate.objects.order_by('-created', '-id')
        if days:
            since = now() - datetime.timedelta(days=days)
            self.stdout.write('Duplicates created after %s' % since)
            qs = qs.filter(created__gte=since)

        if false:
            state = Duplicate.STATE_FALSE_POSITIVE
        elif dedup:
            state = Duplicate.STATE_DEDUP
        else:
            state = Duplicate.STATE_NEW
        qs = qs.filter(state=state)

        # Slice last: Django refuses to filter a queryset once it is sliced.
        if count:
            qs = qs[:count]

        # Evaluate the queryset and the captions once; they are needed both
        # for the column width and for the rows.
        rows = [(d.first_id, individu_caption(d.first),
                 d.second_id, individu_caption(d.second), d.score) for d in qs]
        column_size = max([0] + [len(caption) for row in rows for caption in (row[1], row[3])])

        self.stdout.write('%d duplicates\n' % len(rows))

        # NOTE(review): the previous code also filled a Table ('Declared
        # false' / 'Deduplicated', ids, names, 'Choice', 'Score') for the
        # false-positive and dedup states but never wrote it out; that unused
        # work was dropped and every state uses the listing below.
        self.stdout.write('| %6s | %*s | %6s | %*s | %5s |' % (
            'ID',
            column_size,
            u'État civil',
            'ID',
            column_size,
            u'État civil',
            'Score',
        ))
        for first_id, first_caption, second_id, second_caption, score in rows:
            self.stdout.write('| %6d | %*s | %6d | %*s | %3d %% |' % (
                first_id,
                column_size,
                first_caption,
                second_id,
                column_size,
                second_caption,
                score * Decimal(100),
            ))
|