#! /usr/bin/env python
"""Rebuild ``base-theme.html`` from the live hautes-alpes.fr services page.

Steps, in order:

1. download URL into FILENAME with ``wget --convert-links``;
2. parse it with lxml and drop Google-related scripts, the page title, the
   breadcrumb (``nav#ariane``) and the cookie banner (``div#cookie_cnil``);
3. empty the ``main#corps`` container;
4. serialise the tree and splice ``{% block ... %}`` template placeholders
   into it;
5. write the result to ``./base-theme.html``.

NOTE(review): in the revision this script was restored from, the HTML
fragments inside the string literals were missing (search strings were
``''``).  The markers below are therefore reconstructions or left unset;
each one is flagged and must be confirmed against the live page.
"""

import os
import re
import subprocess
import sys

from lxml import etree
from lxml.html import html_parser

FILENAME = os.path.join('tmp.html')
URL = 'https://www.hautes-alpes.fr/5448-services-en-ligne.htm'
OUTPUT = os.path.join('.', 'base-theme.html')

# Pages shorter than this are treated as a failed download.
MIN_CONTENT_LENGTH = 500

# --- markers searched for in the serialised page ---------------------------
# NOTE(review): regex matching the page's own copy of jquery.  The original
# pattern was lost; while this is None the removal step is skipped.
JQUERY_SCRIPT_PATTERN = None
# NOTE(review): presumably '<head>' (the title block has to live in <head>,
# and the original <title> is removed from the tree) -- confirm.
HEAD_OPEN = '<head>'
# NOTE(review): presumably '</head>' for the "head" block -- confirm.
HEAD_CLOSE = '</head>'
# NOTE(review): presumably '</body>' for "local-body-bottom" -- confirm.
BODY_CLOSE = '</body>'
# NOTE(review): serialisation of the container emptied below, assuming lxml
# emits it as an empty element with only its id attribute -- confirm.
EMPTY_MAIN = '<main id="corps"></main>'
# NOTE(review): markup that receives the "user-info" block.  Nothing in the
# script indicates which element it was; while this is None the step is
# skipped.  When restoring it, USER_INFO_REPLACEMENT must also re-emit the
# marker's own markup around the block.
USER_INFO_MARKER = None

# --- replacements -----------------------------------------------------------
TITLE_REPLACEMENT = HEAD_OPEN + '<title>{% block global_title %}{% endblock %}</title>\n'
HEAD_REPLACEMENT = '{% block head %}{% endblock %}' + HEAD_CLOSE
MAIN_REPLACEMENT = (
    '<main id="corps">\n'
    '{% block nav %}{% endblock %} '
    '{% block messages %}{% endblock %} '
    '{% block local-content-wrapper %} {% endblock %}'
    '</main>'
)
BODY_REPLACEMENT = '{% block local-body-bottom %}{% endblock %}' + BODY_CLOSE
USER_INFO_REPLACEMENT = '\n{% block user-info %}{% endblock %}'


def _replace_marker(content, marker, replacement, label):
    """Return *content* with every *marker* replaced by *replacement*.

    An unset or empty marker is skipped with a warning on stderr:
    ``str.replace('', x)`` would insert *x* between every character of the
    page.  A marker that does not occur in *content* is also reported, and
    *content* is returned unchanged.  *label* only names the step in the
    warning.
    """
    if not marker:
        sys.stderr.write('warning: no marker configured for %s, step skipped\n' % label)
        return content
    if marker not in content:
        sys.stderr.write('warning: marker for %s not found in page\n' % label)
        return content
    return content.replace(marker, replacement)


# Start from a clean slate so a stale download is never reused.
if os.path.exists(FILENAME):
    os.unlink(FILENAME)

# The exit status is not checked: a failed download yields a missing/short
# page, which the length test below turns into a quiet exit.
subprocess.call(['wget', '--quiet', '-O', FILENAME, '--convert-links', URL])

# Read as bytes so the page reaches the HTML parser undecoded instead of
# going through a locale-dependent text decoding first.
with open(FILENAME, 'rb') as downloaded:
    content = downloaded.read().strip()

if len(content) < MIN_CONTENT_LENGTH:
    # Exit with status 0 before OUTPUT is written, leaving any existing
    # base-theme.html untouched.
    sys.exit(0)

root = etree.fromstring(content, parser=html_parser)

# remove all google related tags (inline scripts mentioning google, or
# external scripts whose src mentions it)
for script in root.xpath('//script'):
    inline_google = script.text and 'google' in script.text
    external_google = 'google' in script.attrib.get('src', '')
    if inline_google or external_google:
        script.getparent().remove(script)

# remove title, breadcrumb and cookie banner
for element in ('title', 'nav[@id="ariane"]', 'div[@id="cookie_cnil"]'):
    for useless in root.xpath('//%s' % element):
        useless.getparent().remove(useless)

# clear "corps" container to put content into it
for main in root.xpath('//main[@id="corps"]'):
    main.clear()
    # clear() also drops the attributes, so the id has to be restored
    main.attrib['id'] = 'corps'

# tostring() returns bytes; with no explicit encoding lxml serialises as
# ASCII, so decoding as ASCII gives text that the str-based replacements
# below can work on under both Python 2 and Python 3.
content = etree.tostring(root, method='html', pretty_print=True).decode('ascii')

# remove all references to downloaded temporary file
content = content.replace(FILENAME, '')

# remove copy of jquery
if JQUERY_SCRIPT_PATTERN:
    content = re.sub(JQUERY_SCRIPT_PATTERN, '', content)
else:
    sys.stderr.write('warning: no jquery pattern configured, step skipped\n')

content = _replace_marker(content, HEAD_OPEN, TITLE_REPLACEMENT, 'global_title')
content = _replace_marker(content, HEAD_CLOSE, HEAD_REPLACEMENT, 'head')
content = _replace_marker(content, EMPTY_MAIN, MAIN_REPLACEMENT, 'main content')
content = _replace_marker(content, BODY_CLOSE, BODY_REPLACEMENT, 'local-body-bottom')
content = content.replace("SERVER_ROOT = '/'", 'SERVER_ROOT="//www.hautes-alpes.fr/"')
content = _replace_marker(content, USER_INFO_MARKER, USER_INFO_REPLACEMENT, 'user-info')

with open(OUTPUT, 'w') as output:
    output.write(content)