# Coverage report header: src/lilbee/crawler/models.py — 100% (33 statements)
# Generated by coverage.py v7.13.4 at 2026-04-29 19:16 +0000
"""Backend-agnostic value types for the crawler package.

These dataclasses cross the seam between the orchestration layer
(``api.py``) and the web-fetcher backend (``crawl4ai_fetcher.py``).
No third-party types leak through them, so a future adapter can
satisfy the ``WebFetcher`` Protocol without pulling in crawl4ai.
"""
9from __future__ import annotations
11import threading
12from dataclasses import dataclass, field
13from typing import TypeAlias
@dataclass
class CrawlResult:
    """High-level outcome for one crawled URL.

    This is the result surfaced to lilbee callers (CLI, MCP, HTTP,
    TUI). The adapter layer produces ``FetchedPage`` objects; the
    orchestration layer converts them into ``CrawlResult`` values
    before returning to the caller.
    """

    url: str                    # the URL that was crawled
    markdown: str = ""          # extracted page content (default: empty)
    success: bool = True        # whether the crawl succeeded
    error: str | None = None    # failure description when success is False
@dataclass
class FetchedPage:
    """One page as produced by a ``WebFetcher`` backend.

    Kept distinct from :class:`CrawlResult` so the adapter surface
    stays narrow and SDK-neutral: only the data actually needed out
    of the underlying SDK's response object crosses this boundary.
    """

    url: str                    # the URL that was fetched
    markdown: str = ""          # extracted page content (default: empty)
    success: bool = True        # whether the fetch succeeded
    error: str | None = None    # failure description when success is False
    # default_factory so every instance owns its own (mutable) list
    links: list[str] = field(default_factory=list)
@dataclass
class ConcurrencySpec:
    """Backend-neutral knobs for concurrency and rate limiting.

    The crawl4ai adapter translates these into ``RateLimiter`` and
    ``SemaphoreDispatcher`` calls; an adapter with its own BFS loop
    maps them onto ``asyncio.Semaphore`` plus retry logic instead.
    """

    # parallelism
    semaphore_count: int = 1
    # politeness delay between requests (seconds)
    mean_delay: float = 0.0
    max_delay_range: float = 0.0
    # retry/backoff behaviour when rate-limited
    retry_on_rate_limit: bool = False
    retry_base_delay_min: float = 0.0
    retry_base_delay_max: float = 0.0
    retry_max_backoff: float = 0.0
    retry_max_attempts: int = 0
@dataclass
class FilterSpec:
    """Backend-neutral filter settings for discovered links.

    Plain Python data; it is up to each adapter to wire these
    settings into its own filter pipeline.
    """

    # URL patterns to skip; default_factory keeps the list per-instance
    exclude_patterns: list[str] = field(default_factory=list)
    # whether links on subdomains of the start host are followed
    include_subdomains: bool = False
CancelToken: TypeAlias = threading.Event
"""Cancellation handle the orchestration layer passes to a fetcher.

An already-``set()`` event means "stop as soon as you can". The
crawl4ai adapter polls it in both its streaming loop and its BFS
strategy's ``should_cancel`` hook; a future adapter can poll it at
whatever granularity it supports.
"""