barbacompta/eo_gestion/chorus/annuaire.py

177 lines
6.9 KiB
Python

# barbacompta - accounting for dummies
# Copyright (C) 2010-2019 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import itertools
import xml.etree.ElementTree as ET
import zipfile
from xml.dom import pulldom
import requests
from django.core.files.storage import default_storage
from . import chorus
def grouper(iterable, n, fillvalue=None):
    """Split *iterable* into consecutive tuples of *n* items.

    The final tuple is padded with *fillvalue* when the input is exhausted
    mid-chunk, e.g. ``grouper('ABCDEFG', 3, 'x')`` gives ``ABC``, ``DEF``,
    ``Gxx``.
    """
    # One single iterator handed to zip_longest n times: each output tuple
    # therefore consumes n successive items from the input.
    shared = iter(iterable)
    return itertools.zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)
class AnnuaireManager:
    """Download the Chorus directory archive and mirror it into ``models.Structure``."""

    # Tag of the XML element describing one structure in the directory files.
    STRUCTURE_UNITAIRE_TAG_NAME = 'CPPStructurePartenaireUnitaire'
    # Names are cut to this many characters before being stored.
    # NOTE(review): presumably the max_length of the model fields - confirm.
    _NAME_MAX_LENGTH = 80

    def _update_annuaire(self):
        """Synchronise the ``Structure`` table with the downloaded directory.

        Structures are processed in batches of 1000. After each batch the
        generator yields ``(count, insert_count, update_count, 0)`` with the
        running totals; once every batch is done, rows whose
        ``full_identifier`` was not seen are deleted and a final
        ``(count, insert_count, update_count, deleted)`` tuple is yielded.
        """
        from . import models

        count = 0
        insert_count = 0
        update_count = 0
        known = set()
        for structures in grouper(self.download_and_parse_annuaire(), 1000):
            # grouper pads the last batch with None
            structures = [struct for struct in structures if struct]
            inserts = []
            updates = []
            identifiers = {structure.full_identifier for structure in structures}
            known.update(identifiers)
            known_structures = {
                struct.full_identifier: struct
                for struct in models.Structure.objects.filter(full_identifier__in=identifiers)
            }
            for structure in structures:
                known_structure = known_structures.get(structure.full_identifier)
                if known_structure:
                    if known_structure.email != structure.email or str(known_structure) != str(structure):
                        # reuse the primary key so that save() overwrites the existing row
                        structure.id = known_structure.id
                        updates.append(structure)
                else:
                    inserts.append(structure)
            models.Structure.objects.bulk_create(inserts)
            for update in updates:
                update.save()
            count += len(structures)
            insert_count += len(inserts)
            update_count += len(updates)
            yield count, insert_count, update_count, 0
        obsolete = set(models.Structure.objects.values_list('full_identifier', flat=True)) - known
        models.Structure.objects.filter(full_identifier__in=obsolete).delete()
        yield count, insert_count, update_count, len(obsolete)

    def update_annuaire(self):
        """Run the whole synchronisation, discarding the progress tuples."""
        for _ in self._update_annuaire():
            pass

    def download_and_parse_annuaire(self):
        """Fetch the directory archive (if it changed) and yield its structures.

        The archive and its ETag are kept in the default storage as
        ``annuaire.zip`` and ``annuaire.etag``. When both exist, the stored
        ETag is sent as ``If-None-Match``. A 200 response replaces the stored
        archive; any other response falls back to the stored archive, or
        raises ``requests.HTTPError`` for an error status when there is no
        stored archive to fall back to.

        Yields the objects produced by :meth:`parse_annuaire` for every member
        of the zip archive.
        """
        etag = None
        if default_storage.exists('annuaire.etag') and default_storage.exists('annuaire.zip'):
            with default_storage.open('annuaire.etag') as fd:
                etag = fd.read()
        headers = {}
        if etag:
            headers['If-None-Match'] = etag
        else:
            print('No etag')
        with requests.get(chorus.get_annuaire_url(), stream=True, headers=headers) as response:
            if response.status_code == 200:
                self._store_download(response)
            else:
                if not default_storage.exists('annuaire.zip'):
                    # nothing to fall back to: report the HTTP error itself
                    # rather than a missing-file error further down
                    response.raise_for_status()
                print('Using already downloaded file')
        with default_storage.open('annuaire.zip') as zip_fd:
            with zipfile.ZipFile(zip_fd) as zipf:
                for name in zipf.namelist():
                    with zipf.open(name) as fd:
                        yield from self.parse_annuaire(fd)

    def _store_download(self, response):
        """Write the body of *response* to ``annuaire.zip`` and record its ETag."""
        # Forget the previous ETag first: if the download is interrupted, the
        # next run must not validate a partially written archive against it.
        if default_storage.exists('annuaire.etag'):
            default_storage.delete('annuaire.etag')
        with open(default_storage.path('annuaire.zip'), 'wb') as fd:
            # the request is streamed, so copy by chunks instead of loading
            # the whole archive in memory
            for chunk in response.iter_content(chunk_size=64 * 1024):
                fd.write(chunk)
        etag = response.headers.get('ETag')
        if etag:
            with open(default_storage.path('annuaire.etag'), 'w') as fd:
                fd.write(etag)

    def parse_annuaire(self, fd):
        """Yield unsaved ``models.Structure`` objects read from the XML file *fd*.

        The document is read incrementally; only the elements named
        ``STRUCTURE_UNITAIRE_TAG_NAME`` are expanded and converted.
        """
        from . import models

        doc = pulldom.parse(fd)
        for event, node in doc:
            if event != pulldom.START_ELEMENT or node.tagName != self.STRUCTURE_UNITAIRE_TAG_NAME:
                continue
            doc.expandNode(node)
            structure = self.parse_structure(ET.fromstring(node.toxml()))
            for kwargs in self._structure_kwargs(structure):
                yield models.Structure(**kwargs)

    def _structure_kwargs(self, structure):
        """Yield the ``Structure`` constructor arguments for one parsed structure.

        Nothing is yielded for a structure whose ``StructureActive`` is
        ``'false'``. When ``GestionService`` is ``'true'``, one dict is yielded
        per service whose ``ServiceActif`` is not ``'false'``; otherwise a
        single dict without service fields is yielded.
        """
        if structure['StructureActive'] == 'false':
            return
        limit = self._NAME_MAX_LENGTH
        if structure['GestionService'] == 'true':
            for service in self._services_of(structure):
                if service['ServiceActif'] == 'false':
                    continue
                yield {
                    'name': structure['RaisonSociale'][:limit],
                    'siret': structure['Identifiant'],
                    'service_code': service['Code'],
                    'service_name': service['Nom'][:limit],
                    'email': structure['AdressePostale'].get('Courriel'),
                    'engagement_obligatoire': (
                        service['GestionEGMT'] == 'true' or structure['GestionEngagement'] == 'true'
                    ),
                }
        else:
            yield {
                # truncated like in the per-service branch above
                'name': structure['RaisonSociale'][:limit],
                'siret': structure['Identifiant'],
                'email': structure['AdressePostale'].get('Courriel'),
                'engagement_obligatoire': structure['GestionEngagement'] == 'true',
            }

    @staticmethod
    def _services_of(structure):
        """Return the list of service dicts of a parsed structure (maybe empty)."""
        services = structure.get('Services')
        if not isinstance(services, dict):
            # no Services element, or an empty one (parsed as its text)
            return []
        entries = services.get('Service', [])
        # a single Service child is parsed as a dict, several as a list
        return entries if isinstance(entries, list) else [entries]

    def parse_structure(self, structure):
        """Convert the children of an XML element into a dict keyed by tag.

        A child with sub-elements becomes a nested dict, a leaf child becomes
        its text (``None`` when empty). When a tag occurs several times, its
        values are collected in a list, in document order.
        """
        d = {}
        for node in structure:
            value = self.parse_structure(node) if len(node) else node.text
            if node.tag not in d:
                d[node.tag] = value
            elif isinstance(d[node.tag], list):
                d[node.tag].append(value)
            else:
                # second occurrence of the tag: switch to a list
                d[node.tag] = [d[node.tag], value]
        return d
if __name__ == '__main__':
    # Command-line entry point: run the synchronisation and show its progress.
    import django

    # Initialise Django before the manager is used.
    django.setup()

    # The trailing carriage return makes every report overwrite the previous one.
    progress_format = 'Analyzed %10d - Inserted %10d - Updated %10d - Deleted %10d\r'
    for stats in AnnuaireManager()._update_annuaire():
        # stats is the (count, insert_count, update_count, delete_count) tuple
        print(progress_format % stats, end='')
    # finish the progress line
    print()