zoo/zoo/zoo_nanterre/management/commands/rsu-duplicates.py

# -*- coding: utf-8 -*-
# zoo - versatile objects management
#
# Copyright (C) 2016  Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import unicode_literals

from decimal import Decimal
import datetime

import django
from django.core.management.base import BaseCommand, CommandParser
from django.utils.timezone import now

from zoo.zoo_nanterre.utils import individu_caption
from zoo.zoo_nanterre.duplicates import find_duplicates
from zoo.zoo_nanterre.models import Duplicate


class Table(object):
    """Minimal plain-text table with right-aligned, pipe-separated columns.

    Columns are sized to the widest cell (header included) each time the
    table is rendered with ``str()``.
    """

    def __init__(self, names):
        """Create an empty table whose header cells are *names*."""
        self.size = len(names)
        self.names = names
        self.rows = []
        self.widths = [0] * self.size

    def add_row(self, *args):
        """Append one row; each positional argument is one cell."""
        self.rows.append(args)

    def add_rows(self, iterator):
        """Append every row of *iterator*; each row is an iterable of cells."""
        for row in iterator:
            # Unpack so that the row is stored as a tuple of cells and not as
            # a 1-tuple wrapping the original row.
            self.add_row(*row)

    def computesize(self):
        """Recompute ``self.widths`` from the header and the current rows."""
        self.widths = [len(u'%s' % name) for name in self.names]
        for row in self.rows:
            # Cells beyond the number of header columns are never rendered,
            # so they must not take part in sizing either.
            for i, col in enumerate(row[:self.size]):
                self.widths[i] = max(self.widths[i], len(u'%s' % col))

    def __str__(self):
        """Render the header line followed by one line per row."""
        self.computesize()
        lines = []
        for cells in [self.names] + self.rows:
            rendered = u''.join(
                u' %*s |' % (width, cell) for width, cell in zip(self.widths, cells))
            lines.append(u'|' + rendered + u'\n')
        return u''.join(lines)


class Command(BaseCommand):
    """Management command to find, delete and list ``Duplicate`` records.

    Sub-commands:

    * ``find``: run ``find_duplicates()`` and report its progression;
    * ``delete``: delete ``Duplicate`` rows, optionally only those whose
      score is below ``--limit``;
    * ``list``: print the duplicates in a given state as a text table.
    """

    def add_arguments(self, parser):
        """Declare the ``find``, ``delete`` and ``list`` sub-commands."""
        cmd = self

        if django.VERSION < (2, 1):
            # https://stackoverflow.com/questions/36706220/is-it-possible-to-create-subparsers-in-a-django-management-command
            class SubParser(CommandParser):
                def __init__(self, **kwargs):
                    super(SubParser, self).__init__(cmd, **kwargs)

            subparser = parser.add_subparsers(dest='command', help='commands', parser_class=SubParser)
        else:
            subparser = parser.add_subparsers(title='subcommands', dest='command', required=True)
        find_parser = subparser.add_parser('find', help='find duplicates')
        find_parser.add_argument('--count', type=int, help='search last count persons created',
                                 default=None)
        find_parser.add_argument('--limit', type=float, help='similarity level', default=None)
        find_parser.add_argument('--days', type=int,
                                 help='limit search to tcreated or update in the last days',
                                 default=None)
        find_parser.add_argument('--ids', type=int,
                                 help='limit search to theses RSU ids',
                                 action='append',
                                 default=[])

        delete_parser = subparser.add_parser('delete', help='delete non false-positive duplicates')
        delete_parser.add_argument('--limit', type=float, help='similarity level')

        list_parser = subparser.add_parser('list', help='list duplicates')
        # --count is used as a slice bound on the queryset, so it must be an
        # integer (it used to be declared as a float "similarity level").
        list_parser.add_argument('--count', type=int,
                                 help='show at most count duplicates, most recent first',
                                 default=None)
        list_parser.add_argument('--days', type=float,
                                 help='limit search to duplicate created in the last days',
                                 default=None)
        list_parser.add_argument('--false-positive', action='store_true', default=False,
                                 help='show false positive')
        list_parser.add_argument('--dedup', action='store_true', default=False,
                                 help='show deduplicated')

    def handle(self, verbosity, command=None, ids=None, count=None, limit=None, days=None,
               false=False, dedup=False, *args, **options):
        """Dispatch to the selected sub-command.

        ``false`` is kept for callers passing it explicitly; from the command
        line the ``--false-positive`` flag arrives as ``false_positive`` in
        *options* (argparse derives the destination from the long option
        name), so both are honoured.
        """
        false = false or options.get('false_positive', False)
        if command == 'find':
            self._find(verbosity, count=count, limit=limit, days=days,
                       ids=[] if ids is None else ids)
        elif command == 'delete':
            self._delete(limit=limit)
        elif command == 'list':
            self._list(count=count, days=days, false=false, dedup=dedup)

    def _find(self, verbosity, count, limit, days, ids):
        """Run the duplicate search, echoing its progression when verbose."""
        # Stays None when find_duplicates() yields nothing at all.
        progress = None
        for progress in find_duplicates(count=count, limit=limit, days=days, ids=ids,
                                        progression=True):
            if verbosity > 1:
                # ending='' prevents stdout.write() from appending a newline,
                # so that the trailing '\r' rewrites the same terminal line.
                self.stdout.write(
                    'New duplicates / persons scanned / persons total :'
                    ' %05d / %05d / %05d\r' % progress, ending='')
                self.stdout.flush()
        if progress is None:
            return
        if verbosity > 1:
            # Terminate the progression line.
            self.stdout.write('')
        if verbosity > 0 and progress[0]:
            self.stdout.write('Found %d new duplicates.' % progress[0])

    def _delete(self, limit):
        """Delete duplicates, restricted to score < *limit* when it is given."""
        # NOTE(review): the sub-command help says "non false-positive" but no
        # filter on the state is applied here - confirm the intended behaviour.
        qs = Duplicate.objects.all()
        # Compare to None: "--limit 0" must not be read as "no limit", which
        # would delete every row.
        if limit is not None:
            qs = qs.filter(score__lt=limit)
        qs.delete()

    def _list(self, count, days, false, dedup):
        """Print the duplicates in the selected state, most recent first."""
        qs = Duplicate.objects.order_by('-created', '-id')
        if days:
            since = now() - datetime.timedelta(days=days)
            self.stdout.write('Duplicates created after %s' % since)
            qs = qs.filter(created__gte=since)
        if false:
            qs = qs.filter(state=Duplicate.STATE_FALSE_POSITIVE)
        elif dedup:
            qs = qs.filter(state=Duplicate.STATE_DEDUP)
        else:
            qs = qs.filter(state=Duplicate.STATE_NEW)
        # The slice must come last: a queryset cannot be filtered any more
        # once it has been sliced.
        if count:
            qs = qs[:count]

        # Evaluate the queryset once and compute each caption a single time.
        rows = [(d.first_id, individu_caption(d.first),
                 d.second_id, individu_caption(d.second), d.score) for d in qs]

        # TODO: the false-positive and dedup listings used to build a Table
        # with extra columns (modification date, dedup choice) that was never
        # printed; render it here if those columns are wanted.

        caption_header = u'État civil'
        # Never narrower than the header so that the columns stay aligned.
        column_size = len(caption_header)
        for _, first_caption, _, second_caption, _ in rows:
            column_size = max(column_size, len(first_caption), len(second_caption))

        self.stdout.write('%d duplicates\n' % len(rows))

        self.stdout.write('| %6s | %*s | %6s | %*s | %5s |' % (
            'ID',
            column_size,
            caption_header,
            'ID',
            column_size,
            caption_header,
            'Score',
        ))
        for first_id, first_caption, second_id, second_caption, score in rows:
            self.stdout.write('| %6d | %*s | %6d | %*s | %3d %% |' % (
                first_id,
                column_size,
                first_caption,
                second_id,
                column_size,
                second_caption,
                score * Decimal(100),
            ))