"""Refresh ``mallard-*`` text cells from the published Mallard documentation.

For every ``TextCell`` whose slug starts with ``mallard-``, the matching HTML
page is fetched from doc.entrouvert.org for each known documentation module.
Links between documentation pages are rewritten to point at the page holding
the corresponding cell, the "links" section is removed, and the resulting
``div.body`` markup is stored as the cell text.
"""

try:
    import urlparse  # Python 2
except ImportError:
    # Python 3: keep the name ``urlparse`` bound to a module exposing urlparse().
    import urllib.parse as urlparse

from bs4 import BeautifulSoup
# Not referenced by name below; the 'html5lib' parser is selected by string.
# NOTE(review): presumably imported so a missing parser fails early -- confirm.
import html5lib
import requests

from combo.data.models import TextCell

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3

# Seconds to wait for the documentation server before giving up; without a
# timeout a stalled connection would block the script indefinitely.
_REQUEST_TIMEOUT = 30


def _rewrite_internal_links(document):
    """Point links to sibling documentation pages at the matching cell's page.

    Only links without a host and without a directory component are
    considered; links whose target has no ``mallard-<name>`` cell are left
    untouched.  The document is modified in place.
    """
    # href=True skips anchors that carry no href attribute (e.g. named
    # anchors), which would otherwise raise KeyError.
    for a in document.find_all('a', href=True):
        parsed = urlparse.urlparse(a.attrs['href'])
        if parsed.netloc:
            # Absolute link to another site.
            continue
        if '/' in parsed.path:
            # Not a page in the same directory.
            continue
        try:
            target_cell = TextCell.objects.get(
                slug='mallard-%s' % parsed.path.replace('.html', ''))
        except TextCell.DoesNotExist:
            continue
        a.attrs['href'] = target_cell.page.get_online_url()


def _render_body(document):
    """Return the markup of ``div.body`` without the links section.

    Returns ``None`` when the document has no ``div.body``, so the caller can
    avoid overwriting a cell with the text "None".
    """
    content = document.find('div', 'body')
    if content is None:
        return None
    markup = _text_type(content)
    more_info = document.find('div', 'sect sect-links')
    if more_info is not None:
        # Guarded: stringifying a missing section would yield "None" and
        # strip every occurrence of that word from the page.
        markup = markup.replace(_text_type(more_info), '')
    return markup


for cell in TextCell.objects.filter(slug__startswith='mallard-'):
    # Kept from the original: re-check the prefix in Python in addition to the
    # database filter.
    if not cell.slug.startswith('mallard-'):
        continue
    mallard_page = cell.slug.split('-', 1)[1]
    # Every module is tried; if several serve the page, the last one wins.
    for module in ('wcs', 'publik-base-theme'):
        resp = requests.get(
            'https://doc.entrouvert.org/%s/dev/%s.html' % (module, mallard_page),
            timeout=_REQUEST_TIMEOUT)
        if resp.status_code != 200:
            continue
        document = BeautifulSoup(resp.content, 'html5lib')
        # Links must be rewritten before the body is serialized.
        _rewrite_internal_links(document)
        new_content = _render_body(document)
        if new_content is None:
            continue
        cell.text = new_content
        cell.save()