misc-csechet/frontools/sources.py

131 lines
4.1 KiB
Python

"""Source for remote files"""
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from re import Pattern
from re import compile as re_compile
from typing import AsyncGenerator, AsyncIterable, Optional, cast
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from playwright.async_api import (
BrowserContext,
Page,
Route,
ViewportSize,
async_playwright,
)
from frontools.cache import Cache
from frontools.utils import ErrorSummary
class Browser:
    """Wraps a browser context, with helper methods to load pages."""

    def __init__(self, source: "Source", browser_context: BrowserContext) -> None:
        """Keep the source used to answer requests and the browser context.

        Args:
            source: Source whose ``route`` handler is installed on each page.
            browser_context: Playwright context used to open new pages.
        """
        self._source = source
        self._browser_context = browser_context

    @asynccontextmanager
    async def load_page(self, url: str) -> AsyncGenerator[Page, None]:
        """Open a page on *url* and yield it once the network is idle.

        The page is closed when the context exits, including when loading
        the page or the body of the ``async with`` block raises.

        Args:
            url: Address to navigate to.

        Yields:
            The loaded Playwright page.
        """
        page = await self._browser_context.new_page()
        try:
            # NOTE(review): Playwright globs treat "*" and "**" differently
            # regarding "/" — confirm this pattern intercepts every request.
            await page.route("*", self._source.route)
            await page.goto(url)
            await page.wait_for_load_state("networkidle")
            yield page
        finally:
            # Always release the page, even if an exception was thrown into
            # the generator at the yield point.
            await page.close()
class Source(ABC):
    """Base class for sources"""

    def __init__(self, error_summary: ErrorSummary) -> None:
        """Store the error summary shared with this source."""
        self._error_summary = error_summary

    @abstractmethod
    async def get_url(self, url: str) -> bytes:
        """Retrieve the given url content"""

    @asynccontextmanager
    async def get_browser(
        self, width: Optional[int] = None, height: Optional[int] = None
    ) -> AsyncGenerator[Browser, None]:
        """Yield a headless Firefox browser whose pages are served by this source.

        The browser is closed when the context exits, including when the body
        of the ``async with`` block raises.

        Args:
            width: Viewport width. When given, *height* must be given too.
            height: Viewport height. Only used when *width* is provided.

        Yields:
            A ``Browser`` wrapper bound to this source.
        """
        viewport: ViewportSize = cast(
            ViewportSize, None
        )  # playwright typings are broken
        if width is not None:
            assert height is not None, "height is required when width is given"
            viewport = {"width": width, "height": height}

        async with async_playwright() as pwright:
            browser = await pwright.firefox.launch(headless=True)
            try:
                context = await browser.new_context(
                    viewport=viewport, ignore_https_errors=True
                )
                yield Browser(self, context)
            finally:
                # Runs even if the consumer raised while using the browser.
                await browser.close()

    async def route(self, route: Route) -> None:
        """Fulfill an intercepted request with the content from ``get_url``.

        The response is always sent with status 200.
        """
        content = await self.get_url(route.request.url)
        await route.fulfill(body=content, status=200)
class CachedSource(Source):
    """Source resolving urls through a cache, fetching them over HTTP."""

    def __init__(self, error_summary: ErrorSummary, cache: Cache[bytes]) -> None:
        """Keep the cache that ``get_url`` queries."""
        super().__init__(error_summary)
        self._cache = cache

    async def get_url(self, url: str) -> bytes:
        """Ask the cache for *url*, handing it ``_load_url`` as the loader."""
        loader = self._load_url
        return await self._cache.get(url, loader)

    @staticmethod
    async def _load_url(url: str) -> bytes:
        """Download *url* with a one-off HTTP session and return the raw body."""
        async with ClientSession() as http, http.get(url) as reply:
            body = await reply.content.read()
        return body
class OverrideSource(Source):
    """Source overriding paths matching patterns with local files"""

    def __init__(
        self,
        error_summary: ErrorSummary,
        mappings: list[tuple[str, str]],
        next_source: Source,
    ):
        """Compile the override patterns and remember the fallback source.

        Args:
            error_summary: Forwarded to the base class.
            mappings: ``(regex, replacement)`` pairs; the replacement is
                applied to a matching url to build a local file path.
            next_source: Source queried when no pattern matches.
        """
        super().__init__(error_summary)
        self._next_source = next_source
        self._mappings: list[tuple[Pattern[str], str]] = [
            (re_compile(regex), target) for regex, target in mappings
        ]

    async def get_url(self, url: str) -> bytes:
        """Return the local file mapped to *url*, else defer to the next source.

        Patterns are tried in order and must match at the start of the url;
        the first match wins.
        """
        for regex, target in self._mappings:
            if not regex.match(url):
                continue
            local_path = regex.sub(target, url)
            with open(local_path, "rb") as local_file:
                return local_file.read()

        fallback = self._next_source
        return await fallback.get_url(url)
async def get_page_stylesheets(source: Source, url: str) -> AsyncIterable[str]:
    """Yield the stylesheet urls referenced by a given page.

    The page is fetched through *source* and parsed with html5lib. Every
    ``<link>`` element whose ``rel`` contains ``stylesheet`` contributes its
    ``href`` value, exactly as written in the document.

    Args:
        source: Source used to retrieve the page content.
        url: Address of the page to inspect.

    Yields:
        The ``href`` of each stylesheet link, in document order.
    """
    page_content = await source.get_url(url)
    page_html = BeautifulSoup(page_content, features="html5lib")
    for link in page_html.find_all("link"):
        if "stylesheet" not in link.get("rel", []):
            continue
        href = link.get("href")
        if href is None:
            # A stylesheet link without href points at nothing; skip it
            # rather than failing the whole page with a KeyError.
            continue
        yield href