"""Refresh ``mallard-*`` text cells from the pages on doc.entrouvert.org.

For every ``TextCell`` whose slug is ``mallard-<page>`` and whose page is not
a snapshot, ``<page>.html`` is requested from each documentation module.  For
each module that answers 200, the ``<div class="body">`` of the page is
stored in the cell after:

* relative links to sibling pages are pointed at the page of the matching
  ``mallard-*`` cell, when one exists;
* image sources are made absolute;
* the ``sect sect-links`` block, if any, is removed.

The work runs as top-level statements (no ``__main__`` guard) so the script
behaves the same however it is executed.
"""

from django.utils.six.moves.urllib import parse as urlparse

from bs4 import BeautifulSoup
import html5lib
import requests

from combo.data.models import TextCell

SLUG_PREFIX = 'mallard-'
DOC_MODULES = ('wcs', 'publik-base-theme')
DOC_BASE_URL = 'https://doc.entrouvert.org/%s/dev/'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server blocks the whole run


def _rewrite_links(document):
    """Point relative links to sibling doc pages at the matching cell's page.

    Links with a host, links containing a ``/`` in their path, and links for
    which no ``mallard-*`` cell exists are left untouched.
    """
    for link in document.find_all('a'):
        href = link.get('href')
        if href is None:
            # Named anchors (<a name="...">) carry no href at all.
            continue
        parsed = urlparse.urlparse(href)
        if parsed.netloc or '/' in parsed.path:
            continue
        try:
            target_cell = TextCell.objects.get(
                slug=SLUG_PREFIX + parsed.path.replace('.html', ''),
                page__snapshot__isnull=True,
            )
        except TextCell.DoesNotExist:
            continue
        link['href'] = target_cell.page.get_online_url()


def _absolutize_images(document, base_url):
    """Resolve every image source against *base_url*.

    ``urljoin`` yields ``base_url + src`` for plain relative paths and leaves
    sources that are already absolute unchanged.
    """
    for img in document.find_all('img'):
        src = img.get('src')
        if src is None:
            continue
        img['src'] = urlparse.urljoin(base_url, src)


cells = TextCell.objects.filter(
    slug__startswith=SLUG_PREFIX,
    page__snapshot__isnull=True,
)
for cell in cells:
    if not cell.slug.startswith(SLUG_PREFIX):
        # Re-checked in Python: some database backends (e.g. SQLite) match
        # prefixes case-insensitively.
        continue
    mallard_page = cell.slug.split('-', 1)[1]
    for module in DOC_MODULES:
        base_url = DOC_BASE_URL % module
        resp = requests.get(
            '%s%s.html' % (base_url, mallard_page),
            timeout=REQUEST_TIMEOUT,
        )
        if resp.status_code != 200:
            continue
        document = BeautifulSoup(resp.content, 'html5lib')
        content = document.find('div', 'body')
        if content is None:
            # Nothing usable in this page; keep the cell's current text.
            continue

        _rewrite_links(document)
        _absolutize_images(document, base_url)

        # Serialise after the rewrites so they are part of the stored HTML.
        # new_content is always rebuilt from this page, so a page without a
        # links section can never inherit the text of a previous one.
        new_content = content.decode()
        more_info = document.find('div', 'sect sect-links')
        if more_info is not None:
            new_content = new_content.replace(more_info.decode(), '')
        cell.text = new_content
        cell.save()