Coverage for src/lilbee/crawler/fetcher.py: 100% (4 statements)


1"""Protocol for web-fetcher backends. 

2 

3The orchestration layer (``api.py``) calls into a ``WebFetcher`` 

4instance; the adapter (``crawl4ai_fetcher.py``) implements this 

5Protocol. Migrating to a different SDK is a one-file swap: delete 

6the adapter, add a new one, change the import in ``api.py``. 

7 

8Lifecycle: 

9 

10 async with fetcher: 

11 page = await fetcher.fetch_single(url, timeout=...) 

12 async for page in fetcher.fetch_recursive(...): 

13 ... 

14 

15``__aenter__`` must be called before any fetch method; ``__aexit__`` 

16tears the backend down (browser close, session cleanup, etc.). 

17``fetch_recursive`` is the streaming entry point: it yields 

18``FetchedPage`` objects as they arrive so callers can flush per-page. 

19""" 


from __future__ import annotations

from collections.abc import AsyncGenerator
from typing import Any, Protocol, runtime_checkable

from lilbee.crawler.models import (
    CancelToken,
    ConcurrencySpec,
    FetchedPage,
    FilterSpec,
)


@runtime_checkable
class WebFetcher(Protocol):
    """Backend contract for fetching web pages as markdown.

    Implementations must honour ``CancelToken`` promptly inside
    ``fetch_recursive`` so the streaming loop in ``api.py`` can
    abort without waiting for an in-flight batch to drain.

    Lifecycle ordering:

    1. ``__aenter__`` is called before any fetch method. Adapters with
       per-operation setup (e.g. crawl4ai opens a fresh
       ``AsyncWebCrawler`` inside each fetch method) may no-op here.
    2. ``fetch_single`` and ``fetch_recursive`` may be called multiple
       times during the same context; they must not assume fresh state.
    3. ``fetch_recursive`` returns an async generator; callers are
       expected to ``.aclose()`` it deterministically on early break.
    4. ``__aexit__`` tears the backend down and must succeed even if
       a prior fetch raised.
    """


    async def __aenter__(self) -> WebFetcher: ...

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: Any,
    ) -> None: ...

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        """Fetch one URL and return its markdown + link set."""
        ...

    def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        """Stream pages discovered by BFS from ``seed_url``.

        ``depth`` / ``max_pages``: positive int caps, or ``None`` for
        unbounded. Adapters translate ``None`` into whatever sentinel the
        underlying SDK wants (crawl4ai uses ``math.inf``).

        Returns an async generator so the orchestration layer can
        react per page (progress events, save-to-disk, cancel) and
        deterministically ``.aclose()`` the stream when it breaks
        out early (e.g. on ``max_pages`` hard cap).
        """
        ...
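

# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the lilbee API: a minimal in-memory
# adapter showing how an implementation satisfies ``WebFetcher`` and its four
# lifecycle rules. Everything below is invented for demonstration --
# ``_StubFetcher``, ``_demo``, and the ``cancel.is_set()`` call are
# assumptions (the real ``CancelToken`` API may differ). Pages are supplied
# pre-built, so the sketch assumes nothing about ``FetchedPage``'s
# constructor.
# ---------------------------------------------------------------------------


class _StubFetcher:
    """Hedged example adapter: replays caller-supplied pages."""

    def __init__(self, pages: list[FetchedPage]) -> None:
        self._pages = pages
        self._entered = False

    async def __aenter__(self) -> _StubFetcher:
        # Rule 1: no real setup needed here; per-operation adapters may no-op.
        self._entered = True
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: Any,
    ) -> None:
        # Rule 4: teardown must succeed even if a prior fetch raised.
        self._entered = False

    async def fetch_single(self, url: str, *, timeout: float) -> FetchedPage:
        if not self._entered:
            raise RuntimeError("__aenter__ must be called before fetching")
        return self._pages[0]

    async def fetch_recursive(
        self,
        seed_url: str,
        *,
        depth: int | None,
        max_pages: int | None,
        timeout: float,
        concurrency: ConcurrencySpec,
        filters: FilterSpec,
        cancel: CancelToken | None = None,
    ) -> AsyncGenerator[FetchedPage, None]:
        # ``None`` means unbounded (here: everything we have).
        limit = len(self._pages) if max_pages is None else max_pages
        for page in self._pages[:limit]:
            # Honour the cancel token promptly between yields. ``is_set()``
            # is an assumed method name; adapt to the real CancelToken API.
            if cancel is not None and cancel.is_set():
                return
            yield page


async def _demo(pages: list[FetchedPage]) -> None:
    """Usage sketch mirroring the lifecycle in the module docstring."""
    fetcher = _StubFetcher(pages)
    # ``runtime_checkable`` lets isinstance() verify structural conformance
    # (method presence only, not signatures).
    assert isinstance(fetcher, WebFetcher)
    async with fetcher:
        stream = fetcher.fetch_recursive(
            "https://example.com",
            depth=None,
            max_pages=1,
            timeout=30.0,
            concurrency=ConcurrencySpec(),  # assumes a zero-arg constructor
            filters=FilterSpec(),  # assumes a zero-arg constructor
            cancel=None,
        )
        try:
            async for _page in stream:
                break  # early break: rule 3 says close deterministically...
        finally:
            await stream.aclose()  # ...which this does.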