Coverage for src/lilbee/crawler/fetcher.py: 100% (4 statements)
1"""Protocol for web-fetcher backends.
3The orchestration layer (``api.py``) calls into a ``WebFetcher``
4instance; the adapter (``crawl4ai_fetcher.py``) implements this
5Protocol. Migrating to a different SDK is a one-file swap: delete
6the adapter, add a new one, change the import in ``api.py``.
8Lifecycle:
10 async with fetcher:
11 page = await fetcher.fetch_single(url, timeout=...)
12 async for page in fetcher.fetch_recursive(...):
13 ...
15``__aenter__`` must be called before any fetch method; ``__aexit__``
16tears the backend down (browser close, session cleanup, etc.).
17``fetch_recursive`` is the streaming entry point: it yields
18``FetchedPage`` objects as they arrive so callers can flush per-page.
19"""

from __future__ import annotations

from collections.abc import AsyncGenerator
from typing import Any, Protocol, runtime_checkable

from lilbee.crawler.models import (
    CancelToken,
    ConcurrencySpec,
    FetchedPage,
    FilterSpec,
)


@runtime_checkable
class WebFetcher(Protocol):
    """Backend contract for fetching web pages as markdown.

    Implementations must honour ``CancelToken`` promptly inside
    ``fetch_recursive`` so the streaming loop in ``api.py`` can
    abort without waiting for an in-flight batch to drain.

    Lifecycle ordering:

    1. ``__aenter__`` is called before any fetch method. Adapters with
       per-operation setup (e.g. crawl4ai opens a fresh
       ``AsyncWebCrawler`` inside each fetch method) may no-op here.
    2. ``fetch_single`` and ``fetch_recursive`` may be called multiple
       times during the same context; they must not assume fresh state.
    3. ``fetch_recursive`` returns an async generator; callers are
       expected to ``.aclose()`` it deterministically on early break.
    4. ``__aexit__`` tears the backend down and must succeed even if
       a prior fetch raised.
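
    A prompt-cancel sketch for adapters (illustrative only; it assumes
    ``CancelToken`` exposes an ``is_set()``-style check, and
    ``self._crawl`` is a hypothetical backend driver):

        async def fetch_recursive(self, seed_url, *, cancel=None, **kw):
            async for page in self._crawl(seed_url, **kw):
                # Check between yields so cancellation lands at page
                # granularity, not after a whole batch drains.
                if cancel is not None and cancel.is_set():
                    return
                yield page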
53 """

    async def __aenter__(self) -> WebFetcher: ...

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: Any,
    ) -> None: ...

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        """Fetch one URL and return its markdown + link set."""
        ...

    def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        """Stream pages discovered by BFS from ``seed_url``.

        ``depth`` / ``max_pages``: positive int caps, or ``None`` for
        unbounded. Adapters translate ``None`` into whatever sentinel the
        underlying SDK wants (crawl4ai uses ``math.inf``).
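
        A minimal translation sketch (illustrative; the ``sdk_depth`` /
        ``sdk_pages`` names are placeholders, not SDK parameters):

            import math

            sdk_depth = math.inf if depth is None else depth
            sdk_pages = math.inf if max_pages is None else max_pages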

        Returns an async generator so the orchestration layer can
        react per page (progress events, save-to-disk, cancel) and
        deterministically ``.aclose()`` the stream when it breaks
        out early (e.g. on ``max_pages`` hard cap).
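
        A caller-side sketch of deterministic close on early break
        (illustrative; ``hard_cap`` is a placeholder):

            gen = fetcher.fetch_recursive(
                seed_url, depth=2, max_pages=None,
                timeout=30.0, concurrency=..., filters=...,
            )
            saved = 0
            try:
                async for page in gen:
                    saved += 1
                    if saved >= hard_cap:
                        break
            finally:
                await gen.aclose()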
89 """
90 ...