#! /usr/bin/env python
"""Regenerate ``base-theme.html`` from a downloaded copy of the lametro.fr page.

The script downloads URL with ``wget`` into a temporary file, parses it with
lxml, removes google-related scripts, the title and the breadcrumb, empties
and renames the content and mobile-menu containers, then substitutes template
blocks into the serialised HTML and writes the result to ``base-theme.html``.
"""

import os
import re
import subprocess
import sys

from lxml import etree
from lxml.html import html_parser

FILENAME = 'tmp.html'
URL = 'https://www.lametro.fr/501-plateforme-de-services.htm'


def _detach(node):
    """Remove *node* from its parent element."""
    node.getparent().remove(node)


# Start from a clean slate so a stale download is never reused.
if os.path.exists(FILENAME):
    os.unlink(FILENAME)

subprocess.call(['wget', '--quiet', '-O', FILENAME, '--convert-links', URL])

with open(FILENAME) as downloaded:
    content = downloaded.read().strip()

# A page shorter than 500 characters is not processed (presumably a failed or
# truncated download); exit successfully without writing base-theme.html.
if len(content) < 500:
    sys.exit(0)

root = etree.fromstring(content, parser=html_parser)

# remove all google related tags
for script in root.xpath('//script'):
    inline_code = script.text
    source_url = script.attrib.get('src', '')
    if (inline_code and 'google' in inline_code) or 'google' in source_url:
        _detach(script)

# remove title and breadcrumb
for element in ('title', 'div[@id="arianeUserTools"]'):
    for useless in root.xpath('//%s' % element):
        _detach(useless)

# rename "contenu" container to put content into it
# (clear() also drops the attributes, so the id is set afterwards)
for container in root.xpath('//div[@id="contenu"]'):
    container.clear()
    container.attrib['id'] = 'content'

for menu_button in root.xpath('//a[@id="eosm-btn"]'):
    menu_button.clear()
    menu_button.attrib['id'] = 'publik-mobile-menu'

# tostring() returns ASCII-encoded bytes by default; decode them so that the
# text substitutions below operate on text on Python 3 as well.
content = etree.tostring(root, method='html', pretty_print=True).decode('ascii')

# remove all references to downloaded temporary file
content = content.replace(FILENAME, '')

# NOTE(review): the regex pattern and several search strings below are empty
# (or a bare newline) in the reviewed copy of this script; they look like HTML
# markup that was lost in transit. They are reproduced as seen -- confirm them
# against the canonical script before relying on the substitutions.

# remove copy of jquery
content = re.sub('', '', content)

content = content.replace('', '{% block global_title %}{% endblock %}\n')
content = content.replace('', '{% block head %}{% endblock %}')
content = content.replace(
    '\n',
    ''' {% if include_top_links != False %}{% endif %} {% block nav %}{% endblock %} {% block grenoble-content %} {% block messages %} {% endblock %} {% endblock %}''',
)
content = content.replace(
    '',
    ''' {% block tracking %} {% endblock %} {% block body-bottom %} {% endblock %} ''',
)

with open('base-theme.html', 'w') as output:
    output.write(content)