40 lines
1.5 KiB
Python
40 lines
1.5 KiB
Python
from django.utils.six.moves.urllib import parse as urlparse
|
|
|
|
from bs4 import BeautifulSoup
|
|
import html5lib
|
|
import requests
|
|
|
|
from combo.data.models import TextCell
|
|
|
|
# Refresh the text of every "mallard-<page>" cell (on pages without a snapshot)
# from the online documentation: fetch the matching page from each
# documentation module, rewrite its internal links and image sources so they
# keep working once embedded, and store the page body as the cell text.
for cell in TextCell.objects.filter(slug__startswith='mallard-', page__snapshot__isnull=True):
    # Kept in addition to the queryset filter: depending on the database
    # backend, the `startswith` lookup may match case-insensitively.
    if not cell.slug.startswith('mallard-'):
        continue
    mallard_page = cell.slug.split('-', 1)[1]
    for module in ('wcs', 'publik-base-theme'):
        base_url = 'https://doc.entrouvert.org/%s/dev/' % module
        # A timeout prevents the script from hanging forever on a stalled
        # connection.
        resp = requests.get(
            'https://doc.entrouvert.org/%s/dev/%s.html' % (module, mallard_page),
            timeout=30)
        if resp.status_code != 200:
            # The page does not exist in this module; try the next one.
            continue
        document = BeautifulSoup(resp.content, 'html5lib')
        content = document.find('div', 'body')
        if content is None:
            # Unexpected page layout: nothing to import, leave the cell as is.
            continue
        more_info = document.find('div', 'sect sect-links')

        # Point links to sibling documentation pages at the page holding the
        # corresponding cell. Anchors without an href are skipped.
        for a in document.find_all('a', href=True):
            parsed = urlparse.urlparse(a.attrs['href'])
            # Only bare relative links ("foo.html") can target a sibling page.
            if parsed.netloc or '/' in parsed.path:
                continue
            try:
                target_cell = TextCell.objects.get(
                    slug='mallard-%s' % parsed.path.replace('.html', ''),
                    page__snapshot__isnull=True)
            except TextCell.DoesNotExist:
                continue
            a.attrs['href'] = target_cell.page.get_online_url()

        # Make image sources absolute. urljoin() leaves already absolute
        # (or data:) sources untouched instead of prefixing them blindly.
        for img in document.find_all('img', src=True):
            img.attrs['src'] = urlparse.urljoin(base_url, img.attrs['src'])

        # Always start from the current page body so that a page without a
        # "links" section never reuses content left over from a previous
        # iteration (or fails on an undefined name).
        new_content = content.decode()
        if more_info is not None:
            new_content = new_content.replace(more_info.decode(), '')
        cell.text = new_content
        cell.save()
|