From 5bed45419bfe8981d000ecfa2100ab98a4089396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20P=C3=A9ters?= Date: Thu, 12 May 2016 15:09:47 +0200 Subject: [PATCH] general: add management command to import from tabellio database --- pfwb_thesaurus/settings.py | 2 + .../thesaurus/management/__init__.py | 0 .../thesaurus/management/commands/__init__.py | 0 .../management/commands/sync-tabellio.py | 114 ++++++++++++++++++ 4 files changed, 116 insertions(+) create mode 100644 pfwb_thesaurus/thesaurus/management/__init__.py create mode 100644 pfwb_thesaurus/thesaurus/management/commands/__init__.py create mode 100644 pfwb_thesaurus/thesaurus/management/commands/sync-tabellio.py diff --git a/pfwb_thesaurus/settings.py b/pfwb_thesaurus/settings.py index 4bb9c11..e4b9b59 100644 --- a/pfwb_thesaurus/settings.py +++ b/pfwb_thesaurus/settings.py @@ -96,6 +96,8 @@ STATICFILES_DIRS = ( ) +TABELLIO_DB_SETTINGS = {} + local_settings_file = os.environ.get('PFWB_THESAURUS_SETTINGS_FILE', os.path.join(os.path.dirname(__file__), 'local_settings.py')) if os.path.exists(local_settings_file): diff --git a/pfwb_thesaurus/thesaurus/management/__init__.py b/pfwb_thesaurus/thesaurus/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pfwb_thesaurus/thesaurus/management/commands/__init__.py b/pfwb_thesaurus/thesaurus/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pfwb_thesaurus/thesaurus/management/commands/sync-tabellio.py b/pfwb_thesaurus/thesaurus/management/commands/sync-tabellio.py new file mode 100644 index 0000000..5f8fc46 --- /dev/null +++ b/pfwb_thesaurus/thesaurus/management/commands/sync-tabellio.py @@ -0,0 +1,114 @@ +# pfwb_thesaurus - thesaurus system +# Copyright (C) 2016 Entr'ouvert +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import psycopg2 + +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError + +from ...models import Term, AlternativeTerm + + +class Command(BaseCommand): + def handle(self, *args, **kwargs): + pgconn = psycopg2.connect(**settings.TABELLIO_DB_SETTINGS) + cur = pgconn.cursor() + + cur.execute('''SELECT t_thesaurus_term.id, + t_thesaurus_term.term, + ARRAY( + SELECT term + FROM t_thesaurus_term AS t4, + t_thesaurus_rel AS t5 + WHERE t4.id = t5.primtermid + AND t5.rel = 'EQ' + AND t5.sectermid = t_thesaurus_term.id) AS eq, + (SELECT t2.term FROM t_thesaurus_term AS t2, + t_thesaurus_rel + WHERE t_thesaurus_rel.primtermid = t_thesaurus_term.id + AND t_thesaurus_rel.rel = 'HN' + AND t2.id = t_thesaurus_rel.sectermid) AS hn, + (SELECT t3.term FROM t_thesaurus_term AS t3, + t_thesaurus_rel + WHERE t_thesaurus_rel.primtermid = t_thesaurus_term.id + AND t_thesaurus_rel.rel = 'SN' + AND t3.id = t_thesaurus_rel.sectermid + LIMIT 1) AS sn + FROM t_thesaurus_term + WHERE EXISTS ( + SELECT id + FROM t_thesaurus_rel + WHERE (sectermid = t_thesaurus_term.id + OR primtermid = t_thesaurus_term.id) + AND rel in ('BT', 'RT')) + ''') + while True: + row = cur.fetchone() + if row is None: + break + term, created = Term.objects.get_or_create(tabellio_id=row[0]) + term.term = row[1] + term.historical_note = row[3] or '' + term.scope_note = row[4] or '' + current_equivs = set([x.term for x in term.alternativeterm_set.all()]) + new_equivs = set(row[2]) + term.save() + term.alternativeterm_set.filter(term__in=list(current_equivs - new_equivs)).delete() + for alt_term in new_equivs - current_equivs: + alt_term_object = AlternativeTerm(term=alt_term, primary_term=term) + alt_term_object.save() + + bt_rels = {} + rt_rels = {} + cur.execute('''SELECT primtermid, sectermid, rel + FROM t_thesaurus_rel + WHERE rel IN ('BT', 'RT') + ''') + while True: + row = cur.fetchone() + if row is None: + break + primtermid, sectermid, rel = row + if rel == 'BT': + if not bt_rels.get(primtermid): + bt_rels[primtermid] = [] + bt_rels[primtermid].append(sectermid) + elif rel == 'RT': + if not rt_rels.get(primtermid): + rt_rels[primtermid] = [] + rt_rels[primtermid].append(sectermid) + + for term, broader_term_ids in bt_rels.items(): + term = Term.objects.get(tabellio_id=term) + broader_terms = Term.objects.filter(tabellio_id__in=broader_term_ids) + current_set = set([x.id for x in term.broader.all()]) + new_set = set([x.id for x in broader_terms]) + for removed_term in current_set - new_set: + term.broader.remove(removed_term) + for added_term in new_set - current_set: + term.broader.add(added_term) + + for term, related_term_ids in rt_rels.items(): + term = Term.objects.get(tabellio_id=term) + related_terms = Term.objects.filter(tabellio_id__in=related_term_ids) + current_set = set([x.id for x in term.related.all()]) + new_set = set([x.id for x in related_terms]) + for removed_term in current_set - new_set: + term.related.remove(removed_term) + for added_term in new_set - current_set: + term.related.add(added_term) + + pgconn.close()