zoo/zoo/zoo_nanterre/management/commands/rsu-duplicates.py

172 lines
7.1 KiB
Python

# -*- coding: utf-8 -*-
# zoo - versatile objects management
#
# Copyright (C) 2016 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
from decimal import Decimal
import datetime
import django
from django.core.management.base import BaseCommand, CommandParser
from django.utils.timezone import now
from zoo.zoo_nanterre.utils import individu_caption
from zoo.zoo_nanterre.duplicates import find_duplicates
from zoo.zoo_nanterre.models import Duplicate
class Table(object):
    """Minimal plain-text table with right-aligned, pipe-separated columns.

    ``names`` are the column headers. Rows are accumulated with ``add_row``
    or ``add_rows`` and the table is rendered by ``str(table)``.
    """

    def __init__(self, names):
        """Create an empty table whose columns are titled by ``names``."""
        self.size = len(names)
        self.names = names
        self.rows = []
        self.widths = [0] * self.size

    def add_row(self, *args):
        """Append one row; each positional argument is one cell."""
        self.rows.append(args)

    def add_rows(self, iterator):
        """Append every row of ``iterator``; each row is an iterable of cells."""
        for row in iterator:
            # Unpack the row so that it is stored as a tuple of cells and not
            # as a 1-tuple wrapping the whole row.
            self.add_row(*row)

    def computesize(self):
        """Recompute ``self.widths`` from the headers and the current rows."""
        # Start from the header widths so that a header wider than every cell
        # of its column still lines up; recomputing from scratch keeps the
        # method idempotent.
        self.widths = [len(u'%s' % (name,)) for name in self.names]
        for row in self.rows:
            # Cells beyond the declared number of columns are ignored, as
            # they are never rendered.
            for i, col in enumerate(row[:self.size]):
                self.widths[i] = max(self.widths[i], len(u'%s' % (col,)))

    def __str__(self):
        """Render the header line then one line per row, each ending in a newline."""
        self.computesize()
        lines = []
        for cells in [self.names] + self.rows:
            line = u'|'
            for width, cell in zip(self.widths, cells):
                line += u' %*s |' % (width, cell)
            lines.append(line + u'\n')
        return u''.join(lines)
class Command(BaseCommand):
    """Find, delete and list RSU duplicates.

    Sub-commands registered by ``add_arguments``:

    * ``find``: run ``find_duplicates`` and report its progression;
    * ``delete``: delete ``Duplicate`` objects, optionally below a score;
    * ``list``: print ``Duplicate`` objects as a text table.
    """

    def add_arguments(self, parser):
        """Declare the ``find``, ``delete`` and ``list`` sub-commands."""
        cmd = self
        if django.VERSION < (2, 1):
            # https://stackoverflow.com/questions/36706220/is-it-possible-to-create-subparsers-in-a-django-management-command
            # argparse instantiates sub-parsers with keyword arguments only,
            # so the command instance is injected through this subclass.
            class SubParser(CommandParser):
                def __init__(self, **kwargs):
                    super(SubParser, self).__init__(cmd, **kwargs)
            subparser = parser.add_subparsers(
                dest='command', help='commands', parser_class=SubParser)
        else:
            subparser = parser.add_subparsers(
                title='subcommands', dest='command', required=True)

        find_parser = subparser.add_parser('find', help='find duplicates')
        find_parser.add_argument('--count', type=int, help='search last count persons created',
                                 default=None)
        find_parser.add_argument('--limit', type=float, help='similarity level', default=None)
        find_parser.add_argument('--days', type=int,
                                 help='limit search to tcreated or update in the last days',
                                 default=None)
        find_parser.add_argument('--ids', type=int,
                                 help='limit search to theses RSU ids',
                                 action='append',
                                 default=[])

        delete_parser = subparser.add_parser('delete', help='delete non false-positive duplicates')
        delete_parser.add_argument('--limit', type=float, help='similarity level')

        list_parser = subparser.add_parser('list', help='list duplicates')
        # The value is used as a slice bound on the queryset, so it must be
        # an integer (it was declared as a float "similarity level").
        list_parser.add_argument('--count', type=int,
                                 help='show only the last count duplicates')
        list_parser.add_argument('--days', type=float,
                                 help='limit search to duplicate created in the last days',
                                 default=None)
        list_parser.add_argument('--false-positive', action='store_true', default=False,
                                 help='show false positive')
        list_parser.add_argument('--dedup', action='store_true', default=False,
                                 help='show deduplicated')

    def handle(self, verbosity, command=None, ids=None, count=None, limit=None, days=None,
               false=False, dedup=False, *args, **options):
        """Dispatch to the selected sub-command.

        ``false`` is kept for callers passing it explicitly; the command line
        flag ``--false-positive`` is stored by argparse as ``false_positive``
        and therefore arrives through ``options``.
        """
        ids = [] if ids is None else ids
        false = false or options.get('false_positive', False)

        if command == 'find':
            self._find(verbosity, count=count, limit=limit, days=days, ids=ids)
        elif command == 'delete':
            self._delete(limit=limit)
        elif command == 'list':
            self._list(count=count, days=days, false=false, dedup=dedup)

    def _find(self, verbosity, count, limit, days, ids):
        """Run the duplicate search and report progression on stdout."""
        # Stays None when find_duplicates() yields nothing.
        progression = None
        for progression in find_duplicates(count=count, limit=limit, days=days, ids=ids,
                                           progression=True):
            if verbosity > 1:
                # ending='' prevents the output wrapper from appending a
                # newline, so that '\r' rewrites the same terminal line.
                self.stdout.write(
                    'New duplicates / persons scanned / persons total :'
                    ' %05d / %05d / %05d\r' % progression,
                    ending='')
                self.stdout.flush()
        if verbosity > 1:
            # Terminate the progression line.
            self.stdout.write('')
        if verbosity > 0 and progression and progression[0]:
            self.stdout.write('Found %d new duplicates.' % progression[0])

    def _delete(self, limit):
        """Delete duplicates, only those scoring below ``limit`` when given."""
        # NOTE(review): the sub-command help says "delete non false-positive
        # duplicates" but no filter on state is applied here, so false
        # positives are deleted too -- confirm which behaviour is intended.
        qs = Duplicate.objects.all()
        if limit:
            qs = qs.filter(score__lt=limit)
        qs.delete()

    def _list(self, count, days, false, dedup):
        """Print the most recent duplicates in the selected state."""
        qs = Duplicate.objects.order_by('-created', '-id')
        if days:
            since = now() - datetime.timedelta(days=days)
            self.stdout.write('Duplicates created after %s' % since)
            qs = qs.filter(created__gte=since)
        if false:
            qs = qs.filter(state=Duplicate.STATE_FALSE_POSITIVE)
        elif dedup:
            qs = qs.filter(state=Duplicate.STATE_DEDUP)
        else:
            qs = qs.filter(state=Duplicate.STATE_NEW)
        if count:
            # Slice last: a sliced queryset cannot be filtered any further.
            qs = qs[:count]

        # Evaluate the queryset and compute the captions only once.
        rows = [
            (d.first_id, individu_caption(d.first),
             d.second_id, individu_caption(d.second), d.score)
            for d in qs
        ]
        column_size = max([0] + [max(len(row[1]), len(row[3])) for row in rows])

        self.stdout.write('%d duplicates\n' % len(rows))
        # TODO: the false-positive and dedup listings used to build Table
        # objects (with the modification date and the dedup choice) that were
        # never written out; those extra columns are still not displayed.
        self.stdout.write('| %6s | %*s | %6s | %*s | %5s |' % (
            'ID',
            column_size,
            u'État civil',
            'ID',
            column_size,
            u'État civil',
            'Score',
        ))
        for first_id, first_caption, second_id, second_caption, score in rows:
            self.stdout.write('| %6d | %*s | %6d | %*s | %3d %% |' % (
                first_id,
                column_size,
                first_caption,
                second_id,
                column_size,
                second_caption,
                score * Decimal(100),
            ))