Coverage for src/lilbee/crawler/sitemap.py: 100% (34 statements)
1"""Best-effort sitemap.xml lookups used as a progress-hint denominator.
3Pure HTTP + regex: fetches ``/sitemap.xml`` at the root of the starting
4host, counts ``<loc>`` entries matching the crawl scope, and returns
5the count. Returns ``CRAWL_TOTAL_UNKNOWN`` on any failure so the
6orchestrator can render ``[n/?]`` instead of a hard-coded ceiling.
8Not load-bearing: correctness is best-effort and every branch falls
9back cleanly on error.
10"""

from __future__ import annotations

import re
from urllib.parse import urlparse

from lilbee.crawler.url_filter import host_in_scope
from lilbee.progress import CRAWL_TOTAL_UNKNOWN

# Sitemap lookups are best-effort progress hints; never block the actual crawl.
_SITEMAP_FETCH_TIMEOUT_SECONDS = 5.0
_SITEMAP_MAX_URLS = 10_000
_SITEMAP_URL_TAG_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>", re.IGNORECASE)
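# The pattern above targets sitemap entries shaped like this (illustrative
# markup, not taken from any real sitemap):
#   <url><loc> https://example.com/docs/intro/ </loc></url>
# The \s* on either side of the non-greedy group trims whitespace inside <loc>.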


def _fetch_sitemap_text(start_url: str) -> str | None:
    """Return sitemap.xml body or None on any fetch/status failure."""
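    # Local import keeps httpx off the module-import path; the hint is
    # best-effort anyway.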
    import httpx

    parsed = urlparse(start_url)
    sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"
    try:
        resp = httpx.get(sitemap_url, timeout=_SITEMAP_FETCH_TIMEOUT_SECONDS, follow_redirects=True)
    except (httpx.HTTPError, OSError):
        return None
    if resp.status_code >= 400:
        return None
    return resp.text


def _count_sitemap_urls(start_url: str, *, include_subdomains: bool) -> int:
    """Best-effort count of URLs in the host's /sitemap.xml that match the crawl scope.

    Returns ``CRAWL_TOTAL_UNKNOWN`` on any failure (missing sitemap, timeout,
    parse error, redirect away from the starting host). This is purely a
    progress-hint denominator, so correctness is not load-bearing.

    Only fetches sitemap.xml directly at the root of the starting host; does
    not follow robots.txt references or nested sitemap indexes.
    """
    host = (urlparse(start_url).hostname or "").lower()
    if not host:
        return CRAWL_TOTAL_UNKNOWN
    text = _fetch_sitemap_text(start_url)
    if text is None:
        return CRAWL_TOTAL_UNKNOWN
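
    # Count only <loc> URLs whose host falls inside the crawl scope; stop early
    # once the cap is hit so an enormous sitemap cannot stall the hint.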
    count = 0
    for match in _SITEMAP_URL_TAG_RE.finditer(text):
        link_host = (urlparse(match.group(1).strip()).hostname or "").lower()
        if host_in_scope(link_host, host, include_subdomains=include_subdomains):
            count += 1
            if count >= _SITEMAP_MAX_URLS:
                break
    return count if count > 0 else CRAWL_TOTAL_UNKNOWN
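

# Minimal manual-check sketch, not part of the crawler itself. It assumes the
# lilbee package is importable and that running this module directly is
# acceptable; the orchestrator presumably calls _count_sitemap_urls() once per
# crawl and renders "[n/?]" whenever it gets CRAWL_TOTAL_UNKNOWN back.
if __name__ == "__main__":  # pragma: no cover
    import sys

    start = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/"
    total = _count_sitemap_urls(start, include_subdomains=False)
    if total == CRAWL_TOTAL_UNKNOWN:
        print(f"{start}: no usable sitemap hint; progress would show [n/?]")
    else:
        print(f"{start}: sitemap hint of {total} in-scope URLs")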