"""Source for remote files""" from abc import ABC, abstractmethod from contextlib import asynccontextmanager from re import Pattern from re import compile as re_compile from typing import AsyncGenerator, AsyncIterable, Optional, cast from aiohttp import ClientSession from bs4 import BeautifulSoup from playwright.async_api import ( BrowserContext, Page, Route, ViewportSize, async_playwright, ) from frontools.cache import Cache from frontools.utils import ErrorSummary class Browser: def __init__(self, source: "Source", browser_context: BrowserContext) -> None: """Wraps a browser instance, with helpers methods to load pages.""" self._source = source self._browser_context = browser_context @asynccontextmanager async def load_page(self, url: str) -> AsyncGenerator[Page, None]: page = await self._browser_context.new_page() await page.route("*", self._source.route) await page.goto(url) await page.wait_for_load_state("networkidle") yield page await page.close() class Source(ABC): """Base class for sources""" def __init__(self, error_summary: ErrorSummary) -> None: self._error_summary = error_summary @abstractmethod async def get_url(self, url: str) -> bytes: """Retrieve the given url content""" @asynccontextmanager async def get_browser( self, width: Optional[int] = None, height: Optional[int] = None ) -> AsyncGenerator[Browser, None]: """Return a Playwright browser that will eventually get files from local cache""" viewport: ViewportSize = cast( ViewportSize, None ) # playwright typings are broken if width is not None: assert height is not None viewport = dict(width=width, height=height) async with async_playwright() as pwright: browser = await pwright.firefox.launch(headless=True) context = await browser.new_context( viewport=viewport, ignore_https_errors=True ) yield Browser(self, context) await browser.close() async def route(self, route: Route) -> None: content = await self.get_url(route.request.url) await route.fulfill(body=content, status=200) class CachedSource(Source): """Source loading urls from the internet.""" def __init__(self, error_summary: ErrorSummary, cache: Cache[bytes]) -> None: super().__init__(error_summary) self._cache = cache async def get_url(self, url: str) -> bytes: """Get a page content from the local or remote cache.""" return await self._cache.get(url, self._load_url) @staticmethod async def _load_url(url: str) -> bytes: async with ClientSession() as session: async with session.get(url) as response: return await response.content.read() class OverrideSource(Source): """Source overriding paths matching patterns with local files""" def __init__( self, error_summary: ErrorSummary, mappings: list[tuple[str, str]], next_source: Source, ): super().__init__(error_summary) self._mappings: list[tuple[Pattern[str], str]] = [] self._next_source = next_source for pattern, replace in mappings: self._mappings.append((re_compile(pattern), replace)) async def get_url(self, url: str) -> bytes: """Return local stylesheet""" for pattern, replace in self._mappings: if pattern.match(url): mapped_path = pattern.sub(replace, url) with open(mapped_path, "rb") as mapped_file: return mapped_file.read() return await self._next_source.get_url(url) async def get_page_stylesheets(source: Source, url: str) -> AsyncIterable[str]: """Return styleheets urls for a given page.""" page_content = await source.get_url(url) page_html = BeautifulSoup(page_content, features="html5lib") links = page_html.find_all("link") for link in links: if "stylesheet" not in link.get("rel", []): continue yield link["href"]