summaryrefslogtreecommitdiffstats
path: root/doc-publik/update-publik-doc-from-mallard.py
blob: e0990cc1a233f748bcc8dc6e446a68ba9a5ca13a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import urlparse
from bs4 import BeautifulSoup
import html5lib
import requests

from combo.data.models import TextCell

# Refresh every text cell whose slug is "mallard-<page>" with the HTML of the
# matching page fetched from doc.entrouvert.org.  Each documentation module is
# tried in turn; a module that answers 200 overwrites the cell text, so when
# several modules serve the same page name the last one listed wins.
for cell in TextCell.objects.filter(slug__startswith='mallard-'):
    # The queryset filter guarantees the "mallard-" prefix, so the split
    # always yields a second element: the page name.
    mallard_page = cell.slug.split('-', 1)[1]
    for module in ('wcs', 'publik-base-theme'):
        resp = requests.get('https://doc.entrouvert.org/%s/dev/%s.html' % (module, mallard_page))
        if resp.status_code != 200:
            continue
        document = BeautifulSoup(resp.content, 'html5lib')
        content = document.find('div', 'body')
        if content is None:
            # No <div class="body"> in the page: skip it rather than saving
            # the literal string "None" as the cell text.
            continue
        more_info = document.find('div', 'sect sect-links')

        # Rewrite links that point to sibling documentation pages (no host,
        # no directory component) so they target the page holding the
        # corresponding "mallard-<page>" cell.
        for a in document.find_all('a'):
            href = a.attrs.get('href')
            if href is None:
                # <a> without href (e.g. a named anchor): nothing to rewrite.
                continue
            parsed = urlparse.urlparse(href)
            if parsed.netloc:
                continue
            if '/' in parsed.path:
                continue
            try:
                target_cell = TextCell.objects.get(slug='mallard-%s' % parsed.path.replace('.html', ''),
                        page__snapshot__isnull=True)
            except TextCell.DoesNotExist:
                continue
            a.attrs['href'] = target_cell.page.get_online_url()

        # Serialize after the links have been rewritten, then drop the
        # "sect sect-links" section when the page has one.  Without the guard
        # a missing section would strip every occurrence of the word "None"
        # from the content.
        new_content = unicode(content)
        if more_info is not None:
            new_content = new_content.replace(unicode(more_info), '')
        cell.text = new_content
        cell.save()