Coverage for src/lilbee/crawler/__init__.py: 100% (9 statements)


1"""Web crawling. Fetch pages as markdown and save to the documents directory. 

2 

3This package is the public face of lilbee's crawling subsystem. All 

4callers (``cli/commands.py``, ``mcp.py``, ``server/handlers.py``, 

5``cli/tui/screens/chat.py``, ``crawl_task.py``, ``server/routes/setup.py``) 

6import symbols from here. 

7 

8Layout: 

9 

10- :mod:`lilbee.crawler.models`: value types (``CrawlResult``, ``FetchedPage``, 

11 specs) 

12- :mod:`lilbee.crawler.fetcher`: ``WebFetcher`` Protocol 

13- :mod:`lilbee.crawler.url_filter`: URL validation + host scope 

14- :mod:`lilbee.crawler.sitemap`: best-effort sitemap progress hint 

15- :mod:`lilbee.crawler.bootstrap`: Playwright Chromium install + detection 

16- :mod:`lilbee.crawler.save`: URL-to-filename, metadata I/O, per-page save 

17- :mod:`lilbee.crawler.api`: orchestration (``crawl_single``, 

18 ``crawl_recursive``, ``crawl_and_save``) 

19- :mod:`lilbee.crawler.crawl4ai_fetcher`: crawl4ai-backed ``WebFetcher``. 

20 ONLY file importing ``crawl4ai``; the swap point for a future backend. 
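
Usage sketch (illustrative only: these names are the package's
re-exports, but the argument lists, return values, and sync/async
behavior shown here are assumptions; check each submodule for the
real signatures)::

    from lilbee.crawler import (
        CrawlerBackendMissing,
        bootstrap_chromium,
        chromium_installed,
        crawl_and_save,
        crawler_available,
        require_valid_crawl_url,
    )

    url = "https://docs.example.com/guide/"  # hypothetical target URL
    require_valid_crawl_url(url)  # assumed to raise on invalid or blocked URLs
    if not crawler_available():   # is the crawl4ai backend importable?
        raise CrawlerBackendMissing("install the crawl4ai extra first")
    if not chromium_installed():  # is Playwright Chromium already present?
        bootstrap_chromium()      # one-time browser install
    crawl_and_save(url)           # fetch, convert to markdown, save pages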

21""" 

from __future__ import annotations

from lilbee.crawler.api import (
    crawl_and_save,
    crawl_recursive,
    crawl_single,
)
from lilbee.crawler.bootstrap import (
    CrawlerBackendMissing,
    CrawlerBrowserMissing,
    bootstrap_chromium,
    chromium_installed,
    crawler_browsers_path,
)
from lilbee.crawler.crawl4ai_fetcher import crawler_available
from lilbee.crawler.fetcher import WebFetcher
from lilbee.crawler.models import (
    CancelToken,
    ConcurrencySpec,
    CrawlResult,
    FetchedPage,
    FilterSpec,
)
from lilbee.crawler.save import (
    METADATA_FLUSH_INTERVAL,
    CrawlMeta,
    content_hash,
    load_crawl_metadata,
    save_crawl_metadata,
    url_to_filename,
)
from lilbee.crawler.url_filter import (
    get_blocked_networks,
    is_url,
    require_valid_crawl_url,
    validate_crawl_url,
)

__all__ = [
    "METADATA_FLUSH_INTERVAL",
    "CancelToken",
    "ConcurrencySpec",
    "CrawlMeta",
    "CrawlResult",
    "CrawlerBackendMissing",
    "CrawlerBrowserMissing",
    "FetchedPage",
    "FilterSpec",
    "WebFetcher",
    "bootstrap_chromium",
    "chromium_installed",
    "content_hash",
    "crawl_and_save",
    "crawl_recursive",
    "crawl_single",
    "crawler_available",
    "crawler_browsers_path",
    "get_blocked_networks",
    "is_url",
    "load_crawl_metadata",
    "require_valid_crawl_url",
    "save_crawl_metadata",
    "url_to_filename",
    "validate_crawl_url",
]