# misc-fred/doc-publik/update-publik-doc-from-mall...
#
# 42 lines
# 1.6 KiB
# Python

from django.utils.six.moves.urllib import parse as urlparse
from bs4 import BeautifulSoup
import html5lib
import requests
from combo.data.models import TextCell
# Refresh every "mallard-<page>" text cell from the documentation published on
# doc.entrouvert.org: download the page, point its internal links at the combo
# pages holding the matching cells, make image URLs absolute, and store the
# resulting HTML in the cell.
#
# Everything stays at module level on purpose (no helper functions, no
# __main__ guard) so the script keeps working however it is executed.

# Documentation modules searched for each page.  There is no early exit after
# a hit, so when several modules publish the same page the last one wins.
DOC_MODULES = ('wcs', 'publik-base-theme')
DOC_BASE_URL = 'https://doc.entrouvert.org/%s/dev/'
# Seconds to wait for the documentation server; without a timeout a stalled
# connection would block the whole run forever.
REQUEST_TIMEOUT = 30

for cell in TextCell.objects.filter(slug__startswith='mallard-', page__snapshot__isnull=True):
    # Safeguard kept from the original: the ORM `startswith` lookup is
    # case-insensitive on some database backends (SQLite in particular), so
    # the prefix is re-checked in Python.
    if not cell.slug.startswith('mallard-'):
        continue
    mallard_page = cell.slug.split('-', 1)[1]

    for module in DOC_MODULES:
        base_url = DOC_BASE_URL % module
        resp = requests.get('%s%s.html' % (base_url, mallard_page), timeout=REQUEST_TIMEOUT)
        if resp.status_code != 200:
            # This module does not publish the page; try the next one.
            continue

        document = BeautifulSoup(resp.content, 'html5lib')
        content = document.find('div', 'body')
        if content is None:
            # Unexpected page layout: nothing to copy, leave the cell as is.
            continue
        more_info = document.find('div', 'sect sect-links')

        # Rewrite links to sibling documentation pages so they target the
        # combo page of the corresponding cell.  Anchors without an href
        # (named anchors) are skipped by the `href=True` filter.
        for a in document.find_all('a', href=True):
            parsed = urlparse.urlparse(a['href'])
            if parsed.scheme or parsed.netloc:
                # External or non-HTTP link (mailto:, ...): keep untouched.
                continue
            if not parsed.path or '/' in parsed.path:
                # Fragment-only link, or a link outside the flat page set.
                continue
            try:
                target_cell = TextCell.objects.get(
                    slug='mallard-%s' % parsed.path.replace('.html', ''),
                    page__snapshot__isnull=True,
                )
            except TextCell.DoesNotExist:
                # No cell for that page: leave the original link in place.
                continue
            a['href'] = target_cell.page.get_online_url()

        # Images are served by the documentation site; resolve their sources
        # against the module base URL.  urljoin leaves sources that are
        # already absolute (or data: URIs) unchanged instead of prefixing them.
        for img in document.find_all('img', src=True):
            img['src'] = urlparse.urljoin(base_url, img['src'])

        new_content = content.decode()
        if more_info is not None:
            # Drop the "more information" links section from the stored HTML.
            new_content = new_content.replace(more_info.decode(), '')
        cell.text = new_content
        cell.save()