Coverage for src/lilbee/crawler/fetcher.py: 100% (4 statements)


1"""Protocol for web-fetcher backends. 

2 

3The orchestration layer (``api.py``) calls into a ``WebFetcher`` 

4instance; the adapter (``crawl4ai_fetcher.py``) implements this 

5Protocol. Migrating to a different SDK is a one-file swap: delete 

6the adapter, add a new one, change the import in ``api.py``. 

7 

8Lifecycle: 

9 

10 async with fetcher: 

11 page = await fetcher.fetch_single(url, timeout=...) 

12 async for page in fetcher.fetch_recursive(...): 

13 ... 

14 

15``__aenter__`` must be called before any fetch method; ``__aexit__`` 

16tears the backend down (browser close, session cleanup, etc.). 

17``fetch_recursive`` is the streaming entry point: it yields 

18``FetchedPage`` objects as they arrive so callers can flush per-page. 

19""" 


from __future__ import annotations

from collections.abc import AsyncGenerator
from typing import Any, Protocol, runtime_checkable

from lilbee.crawler.models import (
    CancelToken,
    ConcurrencySpec,
    FetchedPage,
    FilterSpec,
)


@runtime_checkable
class WebFetcher(Protocol):
    """Backend contract for fetching web pages as markdown.

    Implementations must honour ``CancelToken`` promptly inside
    ``fetch_recursive`` so the streaming loop in ``api.py`` can
    abort without waiting for an in-flight batch to drain.

    Lifecycle ordering:

    1. ``__aenter__`` is called before any fetch method. Adapters with
       per-operation setup (e.g. crawl4ai opens a fresh
       ``AsyncWebCrawler`` inside each fetch method) may no-op here.
    2. ``fetch_single`` and ``fetch_recursive`` may be called multiple
       times during the same context; they must not assume fresh state.
    3. ``fetch_recursive`` returns an async generator; callers are
       expected to ``.aclose()`` it deterministically on early break.
    4. ``__aexit__`` tears the backend down and must succeed even if
       a prior fetch raised.
    """


    async def __aenter__(self) -> WebFetcher: ...

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: Any,
    ) -> None: ...

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        """Fetch one URL and return its markdown + link set."""
        ...

    def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        """Stream pages discovered by BFS from ``seed_url``.

        ``depth`` / ``max_pages``: positive int caps, or ``None`` for
        unbounded. Adapters translate ``None`` into whatever sentinel the
        underlying SDK wants (crawl4ai uses ``math.inf``).

        Returns an async generator so the orchestration layer can
        react per page (progress events, save-to-disk, cancel) and
        deterministically ``.aclose()`` the stream when it breaks
        out early (e.g. on ``max_pages`` hard cap).
        """
        ...
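

# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the lilbee API: a minimal in-memory
# adapter showing how an implementation satisfies ``WebFetcher`` and its four
# lifecycle rules. Everything below is invented for demonstration --
# ``_StubFetcher``, ``_demo``, and the ``cancel.is_set()`` call are
# assumptions (the real ``CancelToken`` API may differ). Pages are supplied
# pre-built, so the sketch assumes nothing about ``FetchedPage``'s
# constructor.
# ---------------------------------------------------------------------------


class _StubFetcher:
    """Hedged example adapter: replays caller-supplied pages."""

    def __init__(self, pages: list[FetchedPage]) -> None:
        self._pages = pages
        self._entered = False

    async def __aenter__(self) -> _StubFetcher:
        # Rule 1: no real setup needed here; per-operation adapters may no-op.
        self._entered = True
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: Any,
    ) -> None:
        # Rule 4: teardown must succeed even if a prior fetch raised.
        self._entered = False

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        if not self._entered:
            raise RuntimeError("__aenter__ must be called before fetching")
        return self._pages[0]

    async def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        # ``None`` means unbounded (here: everything we have).
        limit = len(self._pages) if max_pages is None else max_pages
        for page in self._pages[:limit]:
            # Honour the cancel token promptly between yields. ``is_set()``
            # is an assumed method name; adapt to the real CancelToken API.
            if cancel is not None and cancel.is_set():
                return
            yield page


async def _demo(pages: list[FetchedPage]) -> None:
    """Usage sketch mirroring the lifecycle in the module docstring."""
    fetcher = _StubFetcher(pages)
    # ``runtime_checkable`` lets isinstance() verify structural conformance
    # (method presence only, not signatures).
    assert isinstance(fetcher, WebFetcher)
    async with fetcher:
        stream = fetcher.fetch_recursive(
            "https://example.com",
            depth=None,
            max_pages=1,
            timeout=30.0,
            concurrency=ConcurrencySpec(),  # assumes a zero-arg constructor
            filters=FilterSpec(),  # assumes a zero-arg constructor
            cancel=None,
        )
        try:
            async for _page in stream:
                break  # early break: rule 3 says close deterministically...
        finally:
            await stream.aclose()  # ...which this does.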