Coverage for src / lilbee / crawler / bootstrap.py: 100%
78 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Playwright Chromium bootstrap.
3Backend-neutral in the sense that it only manages the browser
4binary: any fetcher that drives Chromium via Playwright benefits
5from the same detection + install flow. The crawl4ai adapter
6currently calls ``chromium_installed()`` before opening a crawler.
7"""
9from __future__ import annotations
11import asyncio
12import os
13import re
14import sys
15from pathlib import Path
17from lilbee.progress import (
18 DetailedProgressCallback,
19 EventType,
20 SetupDoneEvent,
21 SetupProgressEvent,
22 SetupStartEvent,
23)
class CrawlerBrowserMissing(RuntimeError):
    """Raised when Playwright is present but has no Chromium browser binary."""
class CrawlerBackendMissing(RuntimeError):
    """Raised when the ``crawler`` extra (crawl4ai) has never been installed."""
34_CHROMIUM_COMPONENT = "chromium"
35# Rough size estimate for the Chromium download; Playwright bundles vary
36# slightly per platform but this gives the UI a decent denominator before
37# 'Total bytes' parses out of stdout.
38_CHROMIUM_ESTIMATE_MB = 180
39_CHROMIUM_SIZE_ESTIMATE_BYTES = _CHROMIUM_ESTIMATE_MB * 1024 * 1024
41# Unit -> bytes scale for Playwright stdout progress lines.
42_BYTE_UNIT_SCALE: dict[str, int] = {
43 "b": 1,
44 "kb": 1024,
45 "kib": 1024,
46 "mb": 1024 * 1024,
47 "mib": 1024 * 1024,
48}
50# Playwright 1.58 prints lines like
51# ``|■■■■■■■■ | 10% of 162.3 MiB`` during
52# the chromium download. The percent comes first, then "of <total> <unit>".
53_PROGRESS_LINE_RE = re.compile(
54 r"(\d+)\s*%\s*of\s*(\d+(?:\.\d+)?)\s*(MiB|Mb|MB|KiB|KB|B)",
55 re.IGNORECASE,
56)
59def _browsers_cache_path() -> Path:
60 """Return the root path where Playwright stores browser binaries."""
61 override = os.environ.get("PLAYWRIGHT_BROWSERS_PATH")
62 if override:
63 return Path(override).expanduser()
64 if sys.platform == "darwin":
65 return Path.home() / "Library" / "Caches" / "ms-playwright"
66 if sys.platform == "win32":
67 local = os.environ.get("LOCALAPPDATA", str(Path.home() / "AppData" / "Local"))
68 return Path(local) / "ms-playwright"
69 return Path.home() / ".cache" / "ms-playwright"
def chromium_installed() -> bool:
    """Return True if at least one ``chromium-*`` install directory exists.

    Only the presence of a directory whose name starts with ``chromium-``
    directly under the browser cache root is checked; the contents of
    that directory are not inspected.

    Returns False (rather than raising) when the cache root is missing,
    is not a directory, or disappears while it is being listed.
    """
    root = _browsers_cache_path()
    try:
        # Attempt the listing directly instead of checking ``exists()``
        # first: a root that is a regular file, or one removed between a
        # check and the listing, would otherwise escape as an exception.
        return any(p.is_dir() and p.name.startswith("chromium-") for p in root.iterdir())
    except (FileNotFoundError, NotADirectoryError):
        return False
def crawler_browsers_path() -> Path:
    """Return the crawler's browser cache root (public accessor).

    The HTTP status endpoint uses this to tell plugins where Chromium
    lives. Resolution is delegated to the private helper so that callers
    do not come to depend on the Playwright-specific directory layout.
    """
    cache_root = _browsers_cache_path()
    return cache_root
def _bytes_from_stdout(line: str) -> tuple[int, int] | None:
    """Parse a Playwright stdout line into ``(downloaded_bytes, total_bytes)``.

    Recognises the ``NN% of N.N MiB`` shape that Playwright 1.58+ prints
    while downloading Chromium. The total is the reported size scaled to
    bytes; the downloaded figure is derived from the percentage of that
    total. Both values come from the same line, so a match always yields
    a total.

    Returns None when the line carries no such progress fragment.
    """
    hit = _PROGRESS_LINE_RE.search(line)
    if hit is None:
        return None

    percent_text, total_text, unit_text = hit.groups()
    multiplier = _BYTE_UNIT_SCALE.get(unit_text.lower(), 1)
    total_bytes = int(float(total_text) * multiplier)
    return int(total_bytes * int(percent_text) / 100), total_bytes
110def _emit_setup_start(on_progress: DetailedProgressCallback | None) -> None:
111 if on_progress is None:
112 return
113 on_progress(
114 EventType.SETUP_START,
115 SetupStartEvent(
116 component=_CHROMIUM_COMPONENT,
117 size_estimate_bytes=_CHROMIUM_SIZE_ESTIMATE_BYTES,
118 ),
119 )
122def _emit_setup_done(
123 on_progress: DetailedProgressCallback | None,
124 *,
125 success: bool,
126 error: str | None,
127) -> None:
128 if on_progress is None:
129 return
130 on_progress(
131 EventType.SETUP_DONE,
132 SetupDoneEvent(component=_CHROMIUM_COMPONENT, success=success, error=error),
133 )
136async def _drain_stdout_to_progress(
137 stream: asyncio.StreamReader,
138 on_progress: DetailedProgressCallback | None,
139) -> None:
140 while True:
141 line_bytes = await stream.readline()
142 if not line_bytes:
143 return
144 line = line_bytes.decode(errors="replace").rstrip()
145 parsed = _bytes_from_stdout(line)
146 if parsed is None or on_progress is None:
147 continue
148 downloaded, total = parsed
149 on_progress(
150 EventType.SETUP_PROGRESS,
151 SetupProgressEvent(
152 component=_CHROMIUM_COMPONENT,
153 downloaded_bytes=downloaded,
154 total_bytes=total,
155 detail=line,
156 ),
157 )
160async def _drain_stderr(stream: asyncio.StreamReader, tail: list[str]) -> None:
161 while True:
162 line_bytes = await stream.readline()
163 if not line_bytes:
164 return
165 tail.append(line_bytes.decode(errors="replace").rstrip())
async def bootstrap_chromium(
    on_progress: DetailedProgressCallback | None = None,
) -> None:
    """Run ``playwright install chromium`` as a subprocess, emitting events.

    Short-circuits when ``chromium_installed()`` is already True, emitting
    only a successful ``setup_done``. Otherwise emits ``setup_start``
    before spawning, ``setup_progress`` for each recognizable progress
    line on stdout, and ``setup_done`` on exit (``success=False`` plus the
    last lines of the subprocess stderr on failure).

    Uses the current Python interpreter's ``playwright`` module so this
    works under ``uv tool install`` and bundled installs alike without
    relying on a globally-installed ``playwright`` CLI.

    If draining the subprocess output is interrupted (the awaiting task
    is cancelled, or ``on_progress`` raises), the subprocess is killed
    and reaped before the interruption is re-raised, so no installer
    process is left running in the background.

    Args:
        on_progress: Optional callback receiving the setup events. When
            None, the install still runs but nothing is reported.

    Raises:
        CrawlerBrowserMissing: The subprocess exited with a non-zero
            status. The message includes the exit code and the stderr
            tail so task workers route to FAILED cleanly.
    """
    if chromium_installed():
        _emit_setup_done(on_progress, success=True, error=None)
        return

    _emit_setup_start(on_progress)

    proc = await asyncio.create_subprocess_exec(
        sys.executable,
        "-m",
        "playwright",
        "install",
        "chromium",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    # mypy narrowing: asyncio.create_subprocess_exec with PIPE guarantees
    # non-None streams at runtime; the asserts only satisfy the type checker.
    assert proc.stdout is not None  # noqa: S101
    assert proc.stderr is not None  # noqa: S101

    stderr_tail: list[str] = []
    # Explicit tasks so that both drains can be cancelled individually if
    # one of them fails; a finished ``gather`` future cannot do that.
    drains = [
        asyncio.create_task(_drain_stdout_to_progress(proc.stdout, on_progress)),
        asyncio.create_task(_drain_stderr(proc.stderr, stderr_tail)),
    ]
    try:
        # Drain both pipes to EOF before waiting on the process so a full
        # pipe buffer can never block the child.
        await asyncio.gather(*drains)
        returncode = await proc.wait()
    except BaseException:
        # Cancellation or a raising callback: stop reading, then make sure
        # the child does not outlive this call.
        for task in drains:
            task.cancel()
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                # The child exited between the returncode check and the kill.
                pass
        await asyncio.gather(*drains, return_exceptions=True)
        await proc.wait()
        raise

    if returncode != 0:
        # Keep only the last few stderr lines; fall back to the exit code
        # when the child wrote nothing to stderr.
        tail = "\n".join(stderr_tail[-10:]) or f"exit code {returncode}"
        _emit_setup_done(on_progress, success=False, error=tail)
        raise CrawlerBrowserMissing(f"Chromium bootstrap failed (exit {returncode}): {tail}")

    _emit_setup_done(on_progress, success=True, error=None)