Coverage for src/lilbee/crawler/__init__.py: 100%
9 statements
« prev ^ index » next — coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
"""Web crawling. Fetch pages as markdown and save to the documents directory.

This package is the public face of lilbee's crawling subsystem. All
callers (``cli/commands.py``, ``mcp.py``, ``server/handlers.py``,
``cli/tui/screens/chat.py``, ``crawl_task.py``, ``server/routes/setup.py``)
import symbols from here.

Layout:

- :mod:`lilbee.crawler.models`: value types (``CrawlResult``, ``FetchedPage``,
  specs)
- :mod:`lilbee.crawler.fetcher`: ``WebFetcher`` Protocol
- :mod:`lilbee.crawler.url_filter`: URL validation + host scope
- :mod:`lilbee.crawler.sitemap`: best-effort sitemap progress hint
- :mod:`lilbee.crawler.bootstrap`: Playwright Chromium install + detection
- :mod:`lilbee.crawler.save`: URL-to-filename, metadata I/O, per-page save
- :mod:`lilbee.crawler.api`: orchestration (``crawl_single``,
  ``crawl_recursive``, ``crawl_and_save``)
- :mod:`lilbee.crawler.crawl4ai_fetcher`: crawl4ai-backed ``WebFetcher``.
  ONLY file importing ``crawl4ai``; the swap point for a future backend.
"""

from __future__ import annotations

from lilbee.crawler.api import (
    crawl_and_save,
    crawl_recursive,
    crawl_single,
)
from lilbee.crawler.bootstrap import (
    CrawlerBackendMissing,
    CrawlerBrowserMissing,
    bootstrap_chromium,
    chromium_installed,
    crawler_browsers_path,
)
from lilbee.crawler.crawl4ai_fetcher import crawler_available
from lilbee.crawler.fetcher import WebFetcher
from lilbee.crawler.models import (
    CancelToken,
    ConcurrencySpec,
    CrawlResult,
    FetchedPage,
    FilterSpec,
)
from lilbee.crawler.save import (
    METADATA_FLUSH_INTERVAL,
    CrawlMeta,
    content_hash,
    load_crawl_metadata,
    save_crawl_metadata,
    url_to_filename,
)
from lilbee.crawler.url_filter import (
    get_blocked_networks,
    is_url,
    require_valid_crawl_url,
    validate_crawl_url,
)

# Explicit public API. Keep sorted (uppercase constants first, then
# PascalCase types, then snake_case functions) so additions are easy to
# diff; every name here is re-exported from a submodule above.
__all__ = [
    "METADATA_FLUSH_INTERVAL",
    "CancelToken",
    "ConcurrencySpec",
    "CrawlMeta",
    "CrawlResult",
    "CrawlerBackendMissing",
    "CrawlerBrowserMissing",
    "FetchedPage",
    "FilterSpec",
    "WebFetcher",
    "bootstrap_chromium",
    "chromium_installed",
    "content_hash",
    "crawl_and_save",
    "crawl_recursive",
    "crawl_single",
    "crawler_available",
    "crawler_browsers_path",
    "get_blocked_networks",
    "is_url",
    "load_crawl_metadata",
    "require_valid_crawl_url",
    "save_crawl_metadata",
    "url_to_filename",
    "validate_crawl_url",
]