Coverage for src / lilbee / crawler / bootstrap.py: 100%

78 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Playwright Chromium bootstrap. 

2 

3Backend-neutral in the sense that it only manages the browser 

4binary: any fetcher that drives Chromium via Playwright benefits 

5from the same detection + install flow. The crawl4ai adapter 

6currently calls ``chromium_installed()`` before opening a crawler. 

7""" 

8 

9from __future__ import annotations 

10 

11import asyncio 

12import os 

13import re 

14import sys 

15from pathlib import Path 

16 

17from lilbee.progress import ( 

18 DetailedProgressCallback, 

19 EventType, 

20 SetupDoneEvent, 

21 SetupProgressEvent, 

22 SetupStartEvent, 

23) 

24 

25 

class CrawlerBrowserMissing(RuntimeError):
    """Signal that Playwright is present but its Chromium browser binary is absent."""

28 

29 

class CrawlerBackendMissing(RuntimeError):
    """Signal that the ``crawler`` extra (crawl4ai) has not been installed."""

32 

33 

# Component label attached to every setup event emitted by this module.
_CHROMIUM_COMPONENT = "chromium"

# Ballpark size of the Chromium download. The real figure differs a little
# between platforms, but the UI needs some denominator to show before the
# actual total has been parsed out of the installer's stdout.
_CHROMIUM_ESTIMATE_MB = 180
_CHROMIUM_SIZE_ESTIMATE_BYTES = _CHROMIUM_ESTIMATE_MB << 20

# Multiplier that converts a (lower-cased) unit suffix from a Playwright
# progress line into bytes.
_BYTE_UNIT_SCALE: dict[str, int] = {
    "b": 1,
    "kb": 1 << 10,
    "kib": 1 << 10,
    "mb": 1 << 20,
    "mib": 1 << 20,
}

# While downloading Chromium, Playwright 1.58 writes lines such as
# ``|■■■■■■■■ | 10% of 162.3 MiB``: the percentage first, followed by
# "of <total> <unit>". Groups: (percent, total, unit).
_PROGRESS_LINE_RE = re.compile(
    r"(\d+)\s*%\s*of\s*(\d+(?:\.\d+)?)\s*(MiB|Mb|MB|KiB|KB|B)",
    flags=re.IGNORECASE,
)

57 

58 

def _browsers_cache_path() -> Path:
    """Resolve the directory under which Playwright keeps its browser builds.

    A non-empty ``PLAYWRIGHT_BROWSERS_PATH`` environment variable takes
    precedence (with ``~`` expanded). Otherwise the location depends on
    the platform:

    * macOS: ``~/Library/Caches/ms-playwright``
    * Windows: ``%LOCALAPPDATA%/ms-playwright`` (``~/AppData/Local`` when
      ``LOCALAPPDATA`` is unset)
    * anything else: ``~/.cache/ms-playwright``

    The returned path is not checked for existence.
    """
    # NOTE(review): Playwright documents ``PLAYWRIGHT_BROWSERS_PATH=0`` as a
    # special "hermetic install" value; here it would be treated as a
    # relative path named "0" -- confirm whether that case matters.
    env_root = os.environ.get("PLAYWRIGHT_BROWSERS_PATH")
    if env_root:
        return Path(env_root).expanduser()

    home = Path.home()
    if sys.platform == "darwin":
        base = home / "Library" / "Caches"
    elif sys.platform == "win32":
        base = Path(os.environ.get("LOCALAPPDATA", str(home / "AppData" / "Local")))
    else:
        base = home / ".cache"
    return base / "ms-playwright"

70 

71 

def chromium_installed() -> bool:
    """Return True if at least one ``chromium-*`` install directory exists.

    The browser cache root is scanned for a sub-directory whose name starts
    with ``chromium-``. Returns False -- instead of raising -- when the root
    does not exist, is not a directory (for example a
    ``PLAYWRIGHT_BROWSERS_PATH`` override that points at a regular file),
    or disappears while it is being scanned.
    """
    # NOTE(review): a ``chromium-*`` directory left behind by an interrupted
    # download would also count as installed here -- confirm whether a
    # stricter completeness check is needed.
    root = _browsers_cache_path()
    try:
        # Name check first: it is a pure string test, so only candidate
        # entries pay for the filesystem stat behind ``is_dir()``.
        return any(
            entry.name.startswith("chromium-") and entry.is_dir() for entry in root.iterdir()
        )
    except (FileNotFoundError, NotADirectoryError):
        # Missing root or a non-directory root both mean "nothing installed".
        return False

78 

79 

def crawler_browsers_path() -> Path:
    """Expose the crawler's browser cache root to code outside this module.

    The HTTP status endpoint uses this to tell plugins where Chromium
    lives. The resolver that does the actual work stays private so that
    callers do not come to rely on the Playwright-specific directory
    layout.
    """
    cache_root = _browsers_cache_path()
    return cache_root

88 

89 

def _bytes_from_stdout(line: str) -> tuple[int, int] | None:
    """Parse one Playwright stdout line into ``(downloaded_bytes, total_bytes)``.

    Recognizes the ``NN% of N.N MiB`` shape that Playwright 1.58+ prints
    while downloading Chromium and returns None for anything else. The
    total is the printed size converted to bytes; the downloaded figure is
    derived from it via the printed percentage. Both values are truncated
    to integers. Because percent and total come from the same line, a
    successful parse always carries a total.
    """
    found = _PROGRESS_LINE_RE.search(line)
    if found is None:
        return None

    percent_text, total_text, unit_text = found.groups()
    # Unknown units fall back to a scale of 1 (i.e. plain bytes).
    multiplier = _BYTE_UNIT_SCALE.get(unit_text.lower(), 1)
    total_bytes = int(float(total_text) * multiplier)
    return int(total_bytes * int(percent_text) / 100), total_bytes

108 

109 

def _emit_setup_start(on_progress: DetailedProgressCallback | None) -> None:
    """Send the ``SETUP_START`` event for Chromium to *on_progress*.

    The event carries the Chromium component label and the estimated
    download size in bytes. Does nothing when *on_progress* is None.
    """
    if on_progress is not None:
        start_event = SetupStartEvent(
            component=_CHROMIUM_COMPONENT,
            size_estimate_bytes=_CHROMIUM_SIZE_ESTIMATE_BYTES,
        )
        on_progress(EventType.SETUP_START, start_event)

120 

121 

def _emit_setup_done(
    on_progress: DetailedProgressCallback | None,
    *,
    success: bool,
    error: str | None,
) -> None:
    """Send the ``SETUP_DONE`` event for Chromium to *on_progress*.

    *success* and *error* are forwarded unchanged on the event together
    with the Chromium component label. Does nothing when *on_progress* is
    None.
    """
    if on_progress is not None:
        done_event = SetupDoneEvent(
            component=_CHROMIUM_COMPONENT,
            success=success,
            error=error,
        )
        on_progress(EventType.SETUP_DONE, done_event)

134 

135 

async def _drain_stdout_to_progress(
    stream: asyncio.StreamReader,
    on_progress: DetailedProgressCallback | None,
) -> None:
    """Read *stream* line by line until EOF, reporting download progress.

    Each line is decoded (undecodable bytes are replaced) and stripped of
    trailing whitespace. Lines that parse as a progress line are forwarded
    to *on_progress* as a ``SETUP_PROGRESS`` event, with the stripped line
    as the event's ``detail``. The stream is read to the end even when
    *on_progress* is None.
    """
    while raw_line := await stream.readline():
        text = raw_line.decode(errors="replace").rstrip()
        progress = _bytes_from_stdout(text)
        if progress is not None and on_progress is not None:
            done_bytes, total_bytes = progress
            progress_event = SetupProgressEvent(
                component=_CHROMIUM_COMPONENT,
                downloaded_bytes=done_bytes,
                total_bytes=total_bytes,
                detail=text,
            )
            on_progress(EventType.SETUP_PROGRESS, progress_event)

158 

159 

async def _drain_stderr(stream: asyncio.StreamReader, tail: list[str]) -> None:
    """Read *stream* until EOF, appending every decoded line to *tail*.

    Lines are decoded with undecodable bytes replaced and have trailing
    whitespace removed. *tail* is mutated in place; nothing is returned.
    """
    while raw_line := await stream.readline():
        tail.append(raw_line.decode(errors="replace").rstrip())

166 

167 

async def _kill_and_reap(proc: asyncio.subprocess.Process) -> None:
    """Kill *proc* if it is still running, then wait for it to exit."""
    if proc.returncode is not None:
        return
    try:
        proc.kill()
    except ProcessLookupError:
        # The process exited between the returncode check and the kill.
        pass
    await proc.wait()


async def bootstrap_chromium(
    on_progress: DetailedProgressCallback | None = None,
) -> None:
    """Run ``playwright install chromium`` as a subprocess, emitting events.

    Short-circuits when ``chromium_installed()`` is already True (only
    ``setup_done`` with ``success=True`` is emitted). Otherwise emits
    ``setup_start`` before spawning, ``setup_progress`` for each
    recognizable progress line on stdout, and ``setup_done`` on exit
    (``success=False`` + the subprocess stderr tail on failure).

    Uses the current Python interpreter's ``playwright`` module so this
    works under ``uv tool install`` and bundled installs alike without
    relying on a globally-installed ``playwright`` CLI.

    Args:
        on_progress: Optional callback receiving the setup events; when
            None the install still runs, just silently.

    Raises:
        CrawlerBrowserMissing: The installer exited with a non-zero code.
            The message carries the last lines of its stderr so task
            workers route to FAILED cleanly.
        OSError: The subprocess could not be spawned. A failing
            ``setup_done`` is emitted first so the earlier ``setup_start``
            is not left dangling.

    If the coroutine is cancelled, or a progress callback raises, while the
    installer is running, the subprocess is killed and reaped before the
    exception propagates; no ``setup_done`` is emitted in that case.
    """
    if chromium_installed():
        _emit_setup_done(on_progress, success=True, error=None)
        return

    _emit_setup_start(on_progress)

    try:
        proc = await asyncio.create_subprocess_exec(
            sys.executable,
            "-m",
            "playwright",
            "install",
            "chromium",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except OSError as exc:
        # Close out the setup_start above before letting the error through.
        _emit_setup_done(on_progress, success=False, error=str(exc))
        raise
    # mypy narrowing: asyncio.create_subprocess_exec with PIPE guarantees
    # non-None streams at runtime; the asserts only satisfy the type checker.
    assert proc.stdout is not None  # noqa: S101
    assert proc.stderr is not None  # noqa: S101

    stderr_tail: list[str] = []
    try:
        # Both pipes are drained concurrently so that a chatty stderr cannot
        # fill its pipe buffer and stall the installer while stdout is read.
        await asyncio.gather(
            _drain_stdout_to_progress(proc.stdout, on_progress),
            _drain_stderr(proc.stderr, stderr_tail),
        )
        returncode = await proc.wait()
    except BaseException:
        # Cancellation or a failing callback: do not leave the installer
        # running in the background as an orphan.
        await _kill_and_reap(proc)
        raise

    if returncode != 0:
        # Only the last few stderr lines are surfaced; fall back to the exit
        # code when the installer wrote nothing to stderr.
        tail = "\n".join(stderr_tail[-10:]) or f"exit code {returncode}"
        _emit_setup_done(on_progress, success=False, error=tail)
        raise CrawlerBrowserMissing(f"Chromium bootstrap failed (exit {returncode}): {tail}")

    _emit_setup_done(on_progress, success=True, error=None)

216 _emit_setup_done(on_progress, success=True, error=None)