Coverage for src/lilbee/crawler/sitemap.py: 100%

34 statements

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

"""Best-effort sitemap.xml lookups used as a progress-hint denominator.

Pure HTTP + regex: fetches ``/sitemap.xml`` at the root of the starting
host, counts ``<loc>`` entries matching the crawl scope, and returns
the count. Returns ``CRAWL_TOTAL_UNKNOWN`` on any failure so the
orchestrator can render ``[n/?]`` instead of a hard-coded ceiling.

Not load-bearing: correctness is best-effort and every branch falls
back cleanly on error.
"""

11 

12from __future__ import annotations 

13 

14import re 

15from urllib.parse import urlparse 

16 

17from lilbee.crawler.url_filter import host_in_scope 

18from lilbee.progress import CRAWL_TOTAL_UNKNOWN 

19 

20# Sitemap lookups are best-effort progress hints; never block the actual crawl. 

21_SITEMAP_FETCH_TIMEOUT_SECONDS = 5.0 

22_SITEMAP_MAX_URLS = 10_000 

23_SITEMAP_URL_TAG_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>", re.IGNORECASE) 

24 

25 

def _fetch_sitemap_text(start_url: str) -> str | None:
    """Fetch ``/sitemap.xml`` at the root of *start_url*'s host.

    Returns the response body as text, or ``None`` when the request fails
    (connection/timeout/protocol error) or the server answers 4xx/5xx.
    """
    # Imported lazily so merely importing this module stays dependency-free.
    import httpx

    pieces = urlparse(start_url)
    target = f"{pieces.scheme}://{pieces.netloc}/sitemap.xml"
    try:
        response = httpx.get(
            target,
            timeout=_SITEMAP_FETCH_TIMEOUT_SECONDS,
            follow_redirects=True,
        )
    except (httpx.HTTPError, OSError):
        # Best-effort: any transport-level failure means "no hint available".
        return None
    return None if response.status_code >= 400 else response.text

39 

40 

def _count_sitemap_urls(start_url: str, *, include_subdomains: bool) -> int:
    """Best-effort count of URLs in the host's /sitemap.xml that match the crawl scope.

    Returns ``CRAWL_TOTAL_UNKNOWN`` on any failure (missing sitemap, timeout,
    parse error, redirect away from the starting host). This is purely a
    progress-hint denominator, so correctness is not load-bearing.

    Only fetches sitemap.xml directly at the root of the starting host; does
    not follow robots.txt references or nested sitemap indexes.
    """
    start_host = (urlparse(start_url).hostname or "").lower()
    if not start_host:
        return CRAWL_TOTAL_UNKNOWN

    body = _fetch_sitemap_text(start_url)
    if body is None:
        return CRAWL_TOTAL_UNKNOWN

    matched = 0
    for loc in _SITEMAP_URL_TAG_RE.finditer(body):
        candidate = (urlparse(loc.group(1).strip()).hostname or "").lower()
        if not host_in_scope(candidate, start_host, include_subdomains=include_subdomains):
            continue
        matched += 1
        # Cap the work: past this point the hint adds no value.
        if matched >= _SITEMAP_MAX_URLS:
            break
    # Zero in-scope entries is indistinguishable from "no usable sitemap".
    return matched if matched else CRAWL_TOTAL_UNKNOWN