Coverage for src/lilbee/cli/commands.py: 100%

827 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""CLI command definitions registered on the app.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6import json 

7import sys 

8from pathlib import Path 

9from typing import TYPE_CHECKING, Any 

10 

11import typer 

12 

13if TYPE_CHECKING: 

14 import uvicorn 

15 

16 from lilbee.wiki.entity_extractor import ExtractedEntity 

17from rich.table import Table 

18 

19from lilbee.cli import theme 

20from lilbee.cli.app import ( 

21 app, 

22 apply_overrides, 

23 console, 

24 data_dir_option, 

25 global_option, 

26 model_option, 

27 num_ctx_option, 

28 repeat_penalty_option, 

29 seed_option, 

30 temperature_option, 

31 top_k_sampling_option, 

32 top_p_option, 

33) 

34from lilbee.cli.helpers import ( 

35 CopyResult, 

36 add_paths, 

37 auto_sync, 

38 clean_result, 

39 copy_files, 

40 gather_status, 

41 get_version, 

42 json_output, 

43 perform_reset, 

44 render_status, 

45 sync_result_to_json, 

46) 

47from lilbee.cli.tui import messages as msg 

48from lilbee.config import cfg 

49from lilbee.crawler import CrawlerBrowserMissing, bootstrap_chromium, chromium_installed, is_url 

50from lilbee.progress import EventType, SetupProgressEvent 

51from lilbee.providers.base import ProviderError 

52from lilbee.services import get_services 

53from lilbee.store import SearchScope, scope_to_chunk_type 

54from lilbee.wiki.shared import ( 

55 DRAFTS_SUBDIR, 

56 SUMMARIES_SUBDIR, 

57) 

58 

CHUNK_PREVIEW_LEN = 80  # characters shown in human-readable search output

# Shared typer option objects, defined once so the same flags behave
# identically across the commands that reuse them (sync, rebuild, add, ...).
_ocr_option = typer.Option(None, "--ocr/--no-ocr", help="Force vision OCR on/off for scanned PDFs.")
_ocr_timeout_option = typer.Option(
    None,
    "--ocr-timeout",
    help="Per-page timeout in seconds for vision OCR (default: 120, 0 = no limit).",
)
# Search scope: defaults to BOTH (raw chunks plus wiki pages).
_scope_option = typer.Option(
    SearchScope.BOTH,
    "--scope",
    "-s",
    help="Restrict the pool to raw chunks, wiki pages, or both (default).",
    case_sensitive=False,
)

74 

75 

def _apply_ocr_overrides(ocr: bool | None, ocr_timeout: float | None) -> None:
    """Apply --ocr/--no-ocr and --ocr-timeout CLI overrides to config."""
    # A None value means the flag was not passed; leave config untouched.
    for attr_name, override in (("enable_ocr", ocr), ("ocr_timeout", ocr_timeout)):
        if override is not None:
            setattr(cfg, attr_name, override)

82 

83 

# Positional argument shared by commands that ingest files, directories, or URLs.
_paths_argument = typer.Argument(
    ...,
    help="Files, directories, or URLs to add to the knowledge base.",
)

88 

89 

@app.command()
def search(
    query: str = typer.Argument(..., help="Search query"),
    top_k: int = typer.Option(None, "--top-k", "-k", help="Number of results"),
    scope: SearchScope = _scope_option,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Search the knowledge base for relevant chunks.

    Prints a table (or a JSON payload in --json mode) of matching chunks with
    either a relevance score or a raw distance, depending on which the
    searcher returned. Exits 1 on an empty query or a search failure.
    """
    apply_overrides(data_dir=data_dir, use_global=use_global)

    if not query or not query.strip():
        if cfg.json_mode:
            json_output({"error": "query must not be empty"})
            raise SystemExit(1)
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] query must not be empty")
        raise SystemExit(1)

    try:
        results = get_services().searcher.search(
            query,
            top_k=top_k or cfg.top_k,
            chunk_type=scope_to_chunk_type(scope),
        )
    except Exception as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
            raise SystemExit(1) from None
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None
    cleaned = [clean_result(r) for r in results]

    if cfg.json_mode:
        json_output({"command": "search", "query": query, "results": cleaned})
        return

    if not cleaned:
        console.print("No results found.")
        return

    has_relevance = any("relevance_score" in r for r in cleaned)
    table = Table(title="Search Results")
    table.add_column("Source", style=theme.ACCENT)
    table.add_column("Chunk", max_width=80)
    score_label = "Score" if has_relevance else "Distance"
    table.add_column(score_label, justify="right", style=theme.MUTED)

    for r in cleaned:
        chunk_text = r["chunk"]
        preview = chunk_text[:CHUNK_PREVIEW_LEN]
        if len(chunk_text) > CHUNK_PREVIEW_LEN:
            preview += "..."
        # Explicit None checks: chaining with `or` would treat a legitimate
        # score of 0.0 as missing and silently fall through to the distance.
        score = r.get("relevance_score")
        if score is None:
            score = r.get("distance")
        if score is None:
            score = 0.0
        table.add_row(r["source"], preview, f"{score:.4f}")
    console.print(table)

145 

146 

@app.command(name="sync")
def sync_cmd(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    ocr: bool | None = _ocr_option,
    ocr_timeout: float | None = _ocr_timeout_option,
) -> None:
    """Manually trigger document sync."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    _apply_ocr_overrides(ocr, ocr_timeout)
    from lilbee.ingest import sync

    try:
        outcome = asyncio.run(sync(quiet=cfg.json_mode))
    except RuntimeError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None
    if cfg.json_mode:
        json_output(sync_result_to_json(outcome))
    else:
        console.print(outcome)

171 

172 

@app.command()
def rebuild(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    ocr: bool | None = _ocr_option,
    ocr_timeout: float | None = _ocr_timeout_option,
) -> None:
    """Nuke the DB and re-ingest everything from documents/."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    _apply_ocr_overrides(ocr, ocr_timeout)
    from lilbee.ingest import sync

    try:
        outcome = asyncio.run(sync(force_rebuild=True, quiet=cfg.json_mode))
    except RuntimeError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None
    ingested_count = len(outcome.added)
    if cfg.json_mode:
        json_output({"command": "rebuild", "ingested": ingested_count})
    else:
        console.print(f"Rebuilt: {ingested_count} documents ingested")

197 

198 

# Options controlling the `add` command's copy and crawl behavior.
_force_option = typer.Option(False, "--force", "-f", help="Overwrite existing files.")
_crawl_option = typer.Option(
    False,
    "--crawl",
    help="Recursively crawl URLs (whole site by default; see --depth and --max-pages).",
)
# None = no cap at all; 0 = fetch only the start URL itself.
_depth_option = typer.Option(
    None,
    "--depth",
    help="Cap link-follow depth for --crawl. Unset = unbounded; 0 = single URL only.",
)
_max_pages_option = typer.Option(
    None,
    "--max-pages",
    help="Cap total pages for --crawl. Unset = no limit; positive int = hard cap.",
)
_include_subdomains_option = typer.Option(
    False,
    "--include-subdomains",
    help=(
        "Allow --crawl to follow links into sibling subdomains of the start "
        "host (e.g. en.wikipedia.org plus af.wikipedia.org). Default scopes "
        "the crawl to the exact start host only."
    ),
)

224 

225 

def _partition_inputs(inputs: list[str]) -> tuple[list[Path], list[str]]:
    """Split mixed CLI inputs into local file paths and URLs."""
    local_paths: list[Path] = []
    url_inputs: list[str] = []
    for item in inputs:
        bucket_is_url = is_url(item)
        if bucket_is_url:
            url_inputs.append(item)
        else:
            local_paths.append(Path(item))
    return local_paths, url_inputs

236 

237 

def _crawl_urls_blocking(
    urls: list[str],
    *,
    crawl: bool,
    depth: int | None,
    max_pages: int | None,
    include_subdomains: bool = False,
) -> list[Path]:
    """Crawl URLs synchronously (for CLI), returning paths written.

    Without --crawl, each URL is fetched as a single page (depth=0).
    With --crawl, the default is whole-site unbounded (depth=None, pages=None).
    Explicit --depth / --max-pages override both.

    Ctrl-C is handled by running the crawl through _run_crawl_with_signal_cancel,
    which installs a signal.signal handler that sets a threading.Event passed
    into crawl_and_save. crawl_recursive polls the event between pages so the
    signal flows through as a clean cancel instead of asyncio.run's default
    KeyboardInterrupt-raising (which left browser contexts mid-teardown).
    """
    import threading

    from rich.progress import Progress, SpinnerColumn, TaskID, TextColumn

    from lilbee.crawler import crawl_and_save
    from lilbee.progress import CrawlPageEvent, DetailedProgressCallback, EventType, ProgressEvent

    if crawl:
        effective_depth = depth
        effective_pages = max_pages
    else:
        # No --crawl: treat each URL as a single page, no page cap needed.
        effective_depth = 0
        effective_pages = None

    # Shared across all URLs so a Ctrl-C during one URL aborts the rest too.
    cancel_event = threading.Event()

    from rich.console import Console as RichConsole

    # Progress is rendered on stderr (and disabled in JSON mode) so stdout
    # stays clean for machine consumers.
    err_console = RichConsole(stderr=True)
    all_paths: list[Path] = []
    with Progress(
        SpinnerColumn(),
        TextColumn("{task.description}"),
        transient=True,
        console=err_console,
        disable=cfg.json_mode,
    ) as progress:
        for url in urls:
            if cancel_event.is_set():
                break
            ptask = progress.add_task(f"Crawling {url}...", total=None)

            def _make_callback(_t: TaskID = ptask) -> DetailedProgressCallback:
                # The task id is bound as a default argument so the closure
                # doesn't late-bind the loop variable.
                def on_progress(event_type: EventType, data: ProgressEvent) -> None:
                    if event_type == EventType.CRAWL_PAGE:
                        if not isinstance(data, CrawlPageEvent):
                            raise TypeError(f"Expected CrawlPageEvent, got {type(data).__name__}")
                        # total <= 0 means the crawler doesn't yet know the
                        # page count; show "?" instead of a bogus number.
                        total_str = str(data.total) if data.total > 0 else "?"
                        progress.update(
                            _t,
                            description=f"Crawled {data.current}/{total_str}: {data.url}",
                        )

                return on_progress

            paths = _run_crawl_with_signal_cancel(
                url,
                depth=effective_depth,
                max_pages=effective_pages,
                on_progress=_make_callback(),
                cancel_event=cancel_event,
                crawl_and_save=crawl_and_save,
                include_subdomains=include_subdomains,
            )
            all_paths.extend(paths)
            progress.update(ptask, description=f"Done: {url} ({len(paths)} pages)")
    return all_paths

315 

316 

def _run_crawl_with_signal_cancel(
    url: str,
    *,
    depth: int | None,
    max_pages: int | None,
    on_progress: object,
    cancel_event: object,
    crawl_and_save: object,
    include_subdomains: bool = False,
) -> list[Path]:
    """Run crawl_and_save on a dedicated event loop with a SIGINT->cancel hook.

    asyncio.run() installs its own SIGINT handler that raises
    KeyboardInterrupt, which tears the crawl down ungracefully. Registering a
    plain signal.signal handler on the main thread AND running the crawl on a
    loop we own (instead of asyncio.run) lets Ctrl-C set our threading.Event,
    which crawl_recursive polls between pages so it can close the stream and
    stop dispatch cleanly.

    Parameters are typed as ``object`` to keep the crawler an optional import;
    ``crawl_and_save`` is injected by the caller.
    """
    import signal

    # Remember whatever handler was installed before us so it can be
    # restored in the finally block, even if the crawl raises.
    previous_handler = signal.getsignal(signal.SIGINT)

    def _on_sigint(_signum: int, _frame: object) -> None:
        # Set the cancel event that crawl_recursive polls between pages, so
        # a Ctrl-C flows through as a clean cancel instead of asyncio.run's
        # default KeyboardInterrupt-raising dance.
        cancel_event.set()  # type: ignore[attr-defined]

    signal.signal(signal.SIGINT, _on_sigint)
    # Manage the event loop explicitly. In the CLI this runs once per process,
    # but under pytest-xdist the same worker thread runs many tests; leaving a
    # closed loop set as the "current" loop for the thread poisons every later
    # asyncio.get_event_loop() call and hangs macOS 3.12/3.13 unit-test CI.
    # Always clear the thread-current loop in finally.
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        coro = crawl_and_save(  # type: ignore[operator]
            url,
            depth=depth,
            max_pages=max_pages,
            on_progress=on_progress,
            cancel=cancel_event,
            quiet=cfg.json_mode,
            include_subdomains=include_subdomains,
        )
        return loop.run_until_complete(coro)
    finally:
        loop.close()
        asyncio.set_event_loop(None)
        signal.signal(signal.SIGINT, previous_handler)

369 

370 

@app.command()
def add(
    paths: list[str] = _paths_argument,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    force: bool = _force_option,
    ocr: bool | None = _ocr_option,
    ocr_timeout: float | None = _ocr_timeout_option,
    crawl: bool = _crawl_option,
    depth: int | None = _depth_option,
    max_pages: int | None = _max_pages_option,
    include_subdomains: bool = _include_subdomains_option,
) -> None:
    """Copy files or crawl URLs into the knowledge base and ingest them.

    Inputs are split into local paths and URLs. Local paths are validated up
    front; URLs are crawled first, then a sync ingests everything. Exits 1
    when a path is missing, the crawler extra is not installed, or the
    crawl/sync raises RuntimeError.
    """
    apply_overrides(data_dir=data_dir, use_global=use_global)
    _apply_ocr_overrides(ocr, ocr_timeout)

    file_paths, urls = _partition_inputs(paths)
    # Validate file paths exist before any crawling or copying happens.
    for fp in file_paths:
        if not fp.exists():
            if cfg.json_mode:
                json_output({"error": f"Path not found: {fp}"})
                raise SystemExit(1)
            console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] Path not found: {fp}")
            raise SystemExit(1)

    try:
        # Crawl URLs first (saves .md files into documents/_web/)
        crawled_paths: list[Path] = []
        if urls:
            from lilbee.crawler import crawler_available

            # The crawler is an optional extra; fail with an install hint.
            if not crawler_available():
                console.print(
                    f"[{theme.ERROR}]Web crawling requires: "
                    f"pip install 'lilbee[crawler]'[/{theme.ERROR}]"
                )
                raise SystemExit(1)
            crawled_paths = _crawl_urls_blocking(
                urls,
                crawl=crawl,
                depth=depth,
                max_pages=max_pages,
                include_subdomains=include_subdomains,
            )
            if not cfg.json_mode:
                console.print(
                    f"[{theme.MUTED}]Crawled {len(crawled_paths)} page(s)"
                    f" from {len(urls)} URL(s)[/{theme.MUTED}]"
                )

        if cfg.json_mode:
            from lilbee.ingest import sync

            # JSON mode: copy + sync quietly, then emit one combined payload.
            copy_result = CopyResult()
            if file_paths:
                copy_result = copy_files(file_paths, force=force)
            result = asyncio.run(sync(quiet=True))
            json_output(
                {
                    "command": "add",
                    "copied": copy_result.copied,
                    "skipped": copy_result.skipped,
                    "crawled": len(crawled_paths),
                    "sync": sync_result_to_json(result),
                }
            )
            return

        if file_paths:
            # Human mode with files: add_paths copies and syncs with output.
            add_paths(file_paths, console, force=force)
        elif urls:
            # URLs already saved; just trigger sync
            from lilbee.ingest import sync

            result = asyncio.run(sync())
            console.print(result)
    except RuntimeError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
            raise SystemExit(1) from None
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None

455 

456 

# Positional argument for `chunks`: which ingested source (filename) to inspect.
_chunks_source_argument = typer.Argument(..., help="Source name to inspect chunks for.")

458 

459 

@app.command()
def chunks(
    source: str = _chunks_source_argument,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show chunks a document was split into (useful for debugging retrieval)."""
    apply_overrides(data_dir=data_dir, use_global=use_global)

    store = get_services().store
    known_sources = {entry["filename"] for entry in store.get_sources()}
    if source not in known_sources:
        if cfg.json_mode:
            json_output({"error": f"Source not found: {source}"})
        else:
            console.print(f"[{theme.ERROR}]Source not found:[/{theme.ERROR}] {source}")
        raise SystemExit(1)

    # Chunks come back unordered; sort by their position in the document.
    ordered = sorted(
        (clean_result(chunk) for chunk in store.get_chunks_by_source(source)),
        key=lambda chunk: chunk.get("chunk_index", 0),
    )

    if cfg.json_mode:
        json_output({"command": "chunks", "source": source, "chunks": ordered})
        return

    console.print(
        f"[{theme.LABEL}]{len(ordered)}[/{theme.LABEL}]"
        f" chunks from [{theme.ACCENT}]{source}[/{theme.ACCENT}]\n"
    )
    for chunk in ordered:
        body = chunk.get("chunk", "")
        suffix = "..." if len(body) > CHUNK_PREVIEW_LEN else ""
        console.print(f"  [{chunk.get('chunk_index', '?')}] {body[:CHUNK_PREVIEW_LEN]}{suffix}")

498 

499 

# Argument/option pair for the `remove` command.
_remove_names_argument = typer.Argument(
    ..., help="Source name(s) to remove from the knowledge base."
)

_delete_file_option = typer.Option(
    False, "--delete", help="Also delete the file from the documents directory."
)

507 

508 

@app.command()
def remove(
    names: list[str] = _remove_names_argument,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    delete_file: bool = _delete_file_option,
) -> None:
    """Remove documents from the knowledge base by source name."""
    apply_overrides(data_dir=data_dir, use_global=use_global)

    outcome = get_services().store.remove_documents(
        names, delete_files=delete_file, documents_dir=cfg.documents_dir
    )
    # Exit 1 only when nothing at all was removed and something was missing.
    total_failure = not outcome.removed and bool(outcome.not_found)

    if cfg.json_mode:
        payload: dict = {"command": "remove", "removed": outcome.removed}
        if outcome.not_found:
            payload["not_found"] = outcome.not_found
        json_output(payload)
        if total_failure:
            raise SystemExit(1)
        return

    for removed_name in outcome.removed:
        console.print(f"Removed [{theme.ACCENT}]{removed_name}[/{theme.ACCENT}]")
    for missing_name in outcome.not_found:
        console.print(f"[{theme.ERROR}]Not found:[/{theme.ERROR}] {missing_name}")
    if total_failure:
        raise SystemExit(1)

538 

539 

@app.command()
def ask(
    question: str = typer.Argument(..., help="Question to ask"),
    scope: SearchScope = _scope_option,
    data_dir: Path | None = data_dir_option,
    model: str | None = model_option,
    use_global: bool = global_option,
    temperature: float | None = temperature_option,
    top_p: float | None = top_p_option,
    top_k_sampling: int | None = top_k_sampling_option,
    repeat_penalty: float | None = repeat_penalty_option,
    num_ctx: int | None = num_ctx_option,
    seed: int | None = seed_option,
) -> None:
    """Ask a one-shot question (auto-syncs first).

    In JSON mode the full answer plus sources is emitted as one payload; in
    terminal mode the answer streams token-by-token. Exits 1 on provider or
    runtime errors.
    """
    apply_overrides(
        data_dir=data_dir,
        model=model,
        use_global=use_global,
        temperature=temperature,
        top_p=top_p,
        top_k_sampling=top_k_sampling,
        repeat_penalty=repeat_penalty,
        num_ctx=num_ctx,
        seed=seed,
    )

    try:
        from lilbee.models import ensure_chat_model

        # Validate both models up front so failures surface before syncing.
        ensure_chat_model()
        get_services().embedder.validate_model()
        if cfg.json_mode:
            from rich.console import Console as _QuietConsole

            # Quiet console: suppress sync progress so stdout stays pure JSON.
            auto_sync(_QuietConsole(quiet=True))
        else:
            auto_sync(console)

        chunk_type = scope_to_chunk_type(scope)

        if cfg.json_mode:
            # Non-streaming path: collect answer + sources into one payload.
            result = get_services().searcher.ask_raw(question, chunk_type=chunk_type)
            json_output(
                {
                    "command": "ask",
                    "question": question,
                    "answer": result.answer,
                    "sources": [clean_result(s) for s in result.sources],
                }
            )
            return

        # Streaming path: print tokens as they arrive, newline at the end.
        for token in get_services().searcher.ask_stream(question, chunk_type=chunk_type):
            console.print(token.content, end="")
        console.print()
    except (RuntimeError, ProviderError) as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
            raise SystemExit(1) from None
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None

602 

603 

@app.command()
def chat(
    data_dir: Path | None = data_dir_option,
    model: str | None = model_option,
    use_global: bool = global_option,
    temperature: float | None = temperature_option,
    top_p: float | None = top_p_option,
    top_k_sampling: int | None = top_k_sampling_option,
    repeat_penalty: float | None = repeat_penalty_option,
    num_ctx: int | None = num_ctx_option,
    seed: int | None = seed_option,
) -> None:
    """Interactive chat loop (auto-syncs first)."""
    apply_overrides(
        data_dir=data_dir,
        model=model,
        use_global=use_global,
        temperature=temperature,
        top_p=top_p,
        top_k_sampling=top_k_sampling,
        repeat_penalty=repeat_penalty,
        num_ctx=num_ctx,
        seed=seed,
    )

    # The TUI needs a real interactive terminal: reject JSON mode and
    # redirected stdin/stdout before launching it.
    if cfg.json_mode:
        json_output({"error": "Chat requires a terminal, not --json"})
        raise SystemExit(1)
    is_interactive = sys.stdin.isatty() and sys.stdout.isatty()
    if not is_interactive:
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] Chat requires a terminal.")
        raise SystemExit(1)
    from lilbee.cli.tui import run_tui

    run_tui(auto_sync=True)

638 

639 

@app.command()
def version() -> None:
    """Show the lilbee version."""
    current = get_version()
    if cfg.json_mode:
        json_output({"command": "version", "version": current})
    else:
        console.print(f"lilbee {current}")

648 

649 

# Models used by `self-check`: small GGUFs that are quick to download but
# still exercise llama.cpp's decoder (chat) and encoder (embedding) paths.
_SELF_CHECK_CHAT_REPO = "bartowski/SmolLM2-135M-Instruct-GGUF"
_SELF_CHECK_CHAT_FILE = "SmolLM2-135M-Instruct-Q3_K_S.gguf"
_SELF_CHECK_EMBED_REPO = "nomic-ai/nomic-embed-text-v1.5-GGUF"
_SELF_CHECK_EMBED_FILE = "nomic-embed-text-v1.5.Q4_K_M.gguf"

654 

655 

def _download_self_check_model(repo: str, filename: str) -> Path:
    """Fetch a GGUF from the HuggingFace CDN via urllib (stdlib only).

    Avoids huggingface_hub / httpx entirely. Inside the PyInstaller --onefile
    bundle, huggingface_hub's retry path has re-entered a closed httpx client
    after transient DNS failures on macOS runners. urllib is synchronous,
    lives in the stdlib, and has no long-lived client to close.

    Args:
        repo: HuggingFace repo id (e.g. "bartowski/SmolLM2-135M-Instruct-GGUF").
        filename: GGUF file name on the repo's main branch.

    Returns:
        Path to the downloaded file inside a fresh temporary directory.

    Raises:
        RuntimeError: when all three download attempts fail.
    """
    import tempfile
    import urllib.error  # explicit: URLError is referenced in the except below
    import urllib.request

    # The resolve URL must end with the requested GGUF filename; previously
    # it ended with a literal placeholder, so every download 404'd.
    url = f"https://huggingface.co/{repo}/resolve/main/{filename}"
    dest_dir = Path(tempfile.mkdtemp(prefix="lilbee-self-check-"))
    dest = dest_dir / filename
    console.print(f"Downloading {url}")
    last_exc: BaseException | None = None
    for attempt in range(3):
        try:
            with urllib.request.urlopen(url, timeout=120) as response:  # noqa: S310 — literal https url
                dest.write_bytes(response.read())
            return dest
        except (OSError, urllib.error.URLError) as exc:
            last_exc = exc
            console.print(f"download attempt {attempt + 1} failed: {exc!r}")
    raise RuntimeError(f"GGUF download failed after 3 attempts: {last_exc!r}")

681 

682 

# Options for `self-check`; the *-model-path options enable offline runs by
# skipping the HuggingFace download entirely.
_self_check_chat_path_option = typer.Option(
    None,
    "--chat-model-path",
    help="Path to a chat GGUF file. Skips the HuggingFace download.",
)
_self_check_embed_path_option = typer.Option(
    None,
    "--embed-model-path",
    help="Path to an embedding GGUF file. Skips the HuggingFace download.",
)
_self_check_max_tokens_option = typer.Option(5, "--max-tokens", help="Tokens to generate.")
_self_check_skip_embedding_option = typer.Option(
    False,
    "--skip-embedding",
    help="Skip the embedding-model leg of the self-check.",
)

699 

700 

def _self_check_emit_failure(error: str) -> None:
    """Report a self-check failure in the active output mode (JSON or rich)."""
    if not cfg.json_mode:
        console.print(f"[{theme.ERROR}]SELF-CHECK FAILED:[/{theme.ERROR}] {error}")
        return
    json_output({"ok": False, "error": error})

707 

@app.command("self-check")
def self_check_cmd(
    chat_model_path: Path | None = _self_check_chat_path_option,
    embed_model_path: Path | None = _self_check_embed_path_option,
    max_tokens: int = _self_check_max_tokens_option,
    skip_embedding: bool = _self_check_skip_embedding_option,
) -> None:
    """Verify the installation can load llama.cpp and run real inference.

    Two legs:

    1. **Chat**: downloads ``SmolLM2-135M-Instruct-Q3_K_S.gguf`` (~90MB) and
       runs a tiny ``create_completion`` so we know decoder-style models work
       end-to-end and the vendored shared libraries load.
    2. **Embedding**: downloads ``nomic-embed-text-v1.5.Q4_K_M.gguf`` (~84MB)
       and runs ``create_embedding``. This is the leg that catches the
       "Memory is not initialized" assert from llama-cpp-python <0.3.19, where
       BERT-style encoders trip ``kv_cache_clear`` on a context that never
       allocated memory.

    Exits 0 on success, 1 on any failure. Intended for post-install
    verification and as the end-to-end gate in release CI.
    """
    from typing import Any, cast

    # --- Leg 1: chat (decoder) ---
    try:
        chat_path = chat_model_path or _download_self_check_model(
            _SELF_CHECK_CHAT_REPO, _SELF_CHECK_CHAT_FILE
        )
        console.print(f"Loading chat model {chat_path}")

        import llama_cpp

        from lilbee.providers.llama_cpp_provider import install_llama_log_handler

        # Route llama.cpp's native log output through our handler before
        # loading any model.
        install_llama_log_handler()
        llm = llama_cpp.Llama(model_path=str(chat_path), n_ctx=256, verbose=False)
        # stream=False (default) returns a dict, not an iterator, but
        # create_completion's return type is a union; cast to Any so the
        # indexing below type-checks without forcing llama_cpp to be a
        # typecheck-time dep of lilbee.
        out = cast(Any, llm.create_completion("2+2=", max_tokens=max_tokens))
        text: str = out["choices"][0]["text"]
    except Exception as exc:
        _self_check_emit_failure(repr(exc))
        raise typer.Exit(1) from exc

    if not text.strip():
        _self_check_emit_failure("empty inference response")
        raise typer.Exit(1)

    # --- Leg 2: embedding (encoder), skippable via --skip-embedding ---
    embedding_dims: int | None = None
    if not skip_embedding:
        try:
            embed_path = embed_model_path or _download_self_check_model(
                _SELF_CHECK_EMBED_REPO, _SELF_CHECK_EMBED_FILE
            )
            console.print(f"Loading embedding model {embed_path}")
            enc = llama_cpp.Llama(
                model_path=str(embed_path),
                embedding=True,
                n_ctx=512,
                verbose=False,
            )
            emb = cast(Any, enc.create_embedding(input=["test"]))
            vec = emb["data"][0]["embedding"]
        except Exception as exc:
            _self_check_emit_failure(repr(exc))
            raise typer.Exit(1) from exc

        if not vec:
            _self_check_emit_failure("empty embedding vector")
            raise typer.Exit(1)
        embedding_dims = len(vec)

    # --- Report success in the active output mode ---
    if cfg.json_mode:
        payload: dict[str, Any] = {
            "ok": True,
            "chat_response": text,
            "chat_model": str(chat_path),
        }
        if embedding_dims is not None:
            payload["embedding_dims"] = embedding_dims
        json_output(payload)
    else:
        console.print(f"Chat response: {text!r}")
        if embedding_dims is not None:
            console.print(f"Embedding dims: {embedding_dims}")
        console.print(f"[{theme.ACCENT}]SELF-CHECK PASSED[/{theme.ACCENT}]")

798 

@app.command()
def status(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show indexed documents, paths, and chunk counts."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if not cfg.json_mode:
        render_status(console)
        return
    json_output(gather_status().model_dump(exclude_none=True))

811 

# Confirmation bypass for destructive commands such as `reset`.
_yes_option = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt.")

813 

814 

@app.command()
def reset(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    yes: bool = _yes_option,
) -> None:
    """Delete all documents and data (full factory reset)."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if not yes:
        # No interactive prompt is possible in JSON mode; require --yes.
        if cfg.json_mode:
            json_output({"error": "Use --yes to confirm reset in JSON mode"})
            raise SystemExit(1)
        console.print(
            f"[{theme.ERROR_BOLD}]This will delete ALL documents and data.[/{theme.ERROR_BOLD}]\n"
            f" Documents: {cfg.documents_dir}\n"
            f" Data: {cfg.data_dir}"
        )
        if not typer.confirm("Are you sure?", default=False):
            console.print("Aborted.")
            raise SystemExit(0)

    reset_outcome = perform_reset()

    if cfg.json_mode:
        json_output(reset_outcome.model_dump())
        return

    console.print(
        f"Reset complete: {reset_outcome.deleted_docs} document(s), "
        f"{reset_outcome.deleted_data} data item(s) deleted."
    )
    if reset_outcome.skipped:
        console.print(
            f"[{theme.WARNING}]{len(reset_outcome.skipped)} item(s) could not be deleted "
            f"(locked or permission denied).[/{theme.WARNING}]"
        )

853 

@app.command()
def init() -> None:
    """Initialize a local .lilbee/ knowledge base in the current directory."""
    root = Path.cwd() / ".lilbee"
    if root.is_dir():
        # Idempotent: an existing layout is reported, never recreated.
        if cfg.json_mode:
            json_output({"command": "init", "path": str(root), "created": False})
        else:
            console.print(f"Already initialized: {root}")
        return

    for subdir_name in ("documents", "data"):
        (root / subdir_name).mkdir(parents=True)
    # Keep derived data out of version control by default.
    (root / ".gitignore").write_text("data/\n")

    if cfg.json_mode:
        json_output({"command": "init", "path": str(root), "created": True})
    else:
        console.print(f"Initialized local knowledge base at {root}")

876 

def _port_file() -> Path:
    """Location of the file recording the server's actual bound port."""
    return cfg.data_dir / "server.port"

880 

async def _run_server(server: uvicorn.Server, config: uvicorn.Config, host: str) -> None:
    """Start uvicorn, write port file, and clean up on shutdown.

    Runs uvicorn's startup/main_loop/shutdown phases manually (instead of
    ``server.serve()``) so the actually-bound port can be captured from the
    listening socket and written to the port file before serving begins.
    Optionally watches a parent process (via LILBEE parent-pid plumbing) and
    requests shutdown when it dies.
    """
    import atexit

    from lilbee.parent_monitor import parse_parent_pid, watch_parent_async

    port_path = _port_file()

    def _cleanup_port_file() -> None:
        port_path.unlink(missing_ok=True)

    # Mirror what uvicorn's Server.serve() does before startup: load the
    # config and attach the lifespan handler.
    if not config.loaded:
        config.load()
    server.lifespan = config.lifespan_class(config)
    await server.startup()

    parent_pid = parse_parent_pid()
    parent_watcher: asyncio.Task[None] | None = None
    if parent_pid is not None:

        def _on_parent_death() -> None:
            # Flip uvicorn's exit flag; main_loop polls it and returns.
            server.should_exit = True

        parent_watcher = asyncio.create_task(watch_parent_async(parent_pid, _on_parent_death))

    try:
        if server.servers:
            # Read the kernel-assigned port (supports --port 0 / random port)
            # and publish it for client discovery before serving requests.
            sock = server.servers[0].sockets[0]
            actual_port = sock.getsockname()[1]
            port_path.parent.mkdir(parents=True, exist_ok=True)
            port_path.write_text(str(actual_port))
            # atexit covers hard exits that skip the finally block.
            atexit.register(_cleanup_port_file)
            console.print(f"Listening on http://{host}:{actual_port}")
            await server.main_loop()
    finally:
        if parent_watcher is not None and not parent_watcher.done():
            parent_watcher.cancel()
        port_path.unlink(missing_ok=True)
        await server.shutdown()

921 

@app.command()
def serve(
    host: str = typer.Option(None, "--host", "-H", help="Bind address (default: 127.0.0.1)"),
    port: int = typer.Option(None, "--port", "-p", help="Port (default: 0/random)"),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Start the HTTP API server."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if host is not None:
        cfg.server_host = host
    if port is not None:
        cfg.server_port = port

    import logging

    import uvicorn

    from lilbee.server import create_app

    # Silence asyncio's noisy shutdown logging.
    logging.getLogger("asyncio").setLevel(logging.ERROR)

    uv_config = uvicorn.Config(create_app(), host=cfg.server_host, port=cfg.server_port)
    uv_server = uvicorn.Server(uv_config)
    asyncio.run(_run_server(uv_server, uv_config, cfg.server_host))

948 

@app.command()
def token(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Print the auth token for a running server."""
    from lilbee.server.auth import server_json_path

    apply_overrides(data_dir=data_dir, use_global=use_global)
    server_json = server_json_path()
    if not server_json.exists():
        # No server.json means no server registered itself.
        if cfg.json_mode:
            json_output({"error": "No running server found"})
        else:
            console.print("No running server found (server.json missing).")
        raise SystemExit(1)
    try:
        tok = json.loads(server_json.read_text()).get("token", "")
    except (json.JSONDecodeError, OSError) as exc:
        if cfg.json_mode:
            json_output({"error": f"Could not read server.json: {exc}"})
        else:
            console.print(
                f"[{theme.ERROR}]Error:[/{theme.ERROR}] Could not read server.json: {exc}"
            )
        raise SystemExit(1) from None
    if cfg.json_mode:
        json_output({"token": tok})
    else:
        console.print(tok)

980 

981 

@app.command()
def topics(
    query: str = typer.Argument(None, help="Optional query to find related concepts."),
    top_k: int = typer.Option(10, "--top-k", "-k", help="Number of results."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show top concept communities or concepts related to a query.

    Exits with status 1 when the optional graph dependency is missing,
    the feature is disabled, or no graph has been built yet.
    """
    apply_overrides(data_dir=data_dir, use_global=use_global)

    from lilbee.concepts import concepts_available

    if not concepts_available():
        # Named `install_hint` (not `msg`) to avoid shadowing the `msg`
        # messages module imported at the top of this file.
        install_hint = "Concept graph requires: pip install 'lilbee[graph]'"
        if cfg.json_mode:
            json_output({"error": install_hint})
        else:
            console.print(f"[{theme.ERROR}]{install_hint}[/{theme.ERROR}]")
        raise SystemExit(1)

    if not cfg.concept_graph:
        if cfg.json_mode:
            json_output({"error": "Concept graph is disabled (LILBEE_CONCEPT_GRAPH=false)"})
        else:
            console.print(
                f"[{theme.ERROR}]Concept graph is disabled.[/{theme.ERROR}] "
                "Enable with LILBEE_CONCEPT_GRAPH=true"
            )
        raise SystemExit(1)

    if not get_services().concepts.get_graph():
        if cfg.json_mode:
            json_output({"error": "Concept graph not available"})
        else:
            console.print(f"[{theme.ERROR}]Concept graph not available.[/{theme.ERROR}]")
        raise SystemExit(1)

    if query:
        _topics_for_query(query)
    else:
        _topics_overview(top_k)

1023 

1024 

def _topics_for_query(query: str) -> None:
    """Show concepts related to a query."""
    graph = get_services().concepts
    direct = graph.extract_concepts(query)
    expanded = graph.expand_query(query)
    # Direct extractions first, then any expansions not already present.
    combined = direct + [concept for concept in expanded if concept not in direct]

    if cfg.json_mode:
        json_output({"command": "topics", "query": query, "concepts": combined})
        return
    if not combined:
        console.print("No concepts found for this query.")
        return
    console.print(f"Concepts related to [{theme.ACCENT}]{query}[/{theme.ACCENT}]:")
    for concept in combined:
        console.print(f" {concept}")

1041 

1042 

def _topics_overview(top_k: int) -> None:
    """Show top concept communities."""
    from dataclasses import asdict

    communities = get_services().concepts.top_communities(k=top_k)
    if cfg.json_mode:
        json_output({"command": "topics", "communities": [asdict(c) for c in communities]})
        return
    if not communities:
        console.print("No concept communities found. Try syncing some documents first.")
        return
    table = Table(title="Concept Communities")
    table.add_column("Cluster", justify="right", style=theme.MUTED)
    table.add_column("Size", justify="right")
    table.add_column("Top Concepts", style=theme.ACCENT)
    for community in communities:
        # Show at most five concepts, with an overflow marker.
        shown = community.concepts[:5]
        overflow = len(community.concepts) - 5
        label = ", ".join(shown)
        if overflow > 0:
            label += f" (+{overflow} more)"
        table.add_row(str(community.cluster_id), str(community.size), label)
    console.print(table)

1064 

1065 

@app.command()
def login() -> None:
    """Log in to HuggingFace for access to gated models (Mistral, Llama, etc.).

    Opens the HuggingFace token page, prompts for a token (hidden input),
    and stores it via huggingface_hub. Exits with code 1 on empty input.
    """
    import webbrowser

    from huggingface_hub import get_token
    from huggingface_hub import login as hf_login

    if get_token():
        typer.echo("Already logged in to HuggingFace.")
        if not typer.confirm("Log in again?", default=False):
            return

    typer.echo("Opening HuggingFace token page in your browser...")
    typer.echo("Create a token with 'Read' access, then paste it below.\n")
    webbrowser.open("https://huggingface.co/settings/tokens")

    # Named `hf_token` (not `token`) to avoid shadowing the sibling `token`
    # command defined in this module; strip once instead of twice.
    hf_token = typer.prompt("Paste your HuggingFace token", hide_input=True).strip()
    if not hf_token:
        typer.echo("No token provided.", err=True)
        raise typer.Exit(1)

    hf_login(token=hf_token, add_to_git_credential=False)
    typer.echo("Logged in! Gated models (Mistral, Llama, etc.) are now accessible.")

1090 

1091 

@app.command(name="mcp")
def mcp_cmd() -> None:
    """Start the MCP server (stdio transport) for agent integration."""
    # Imported lazily so the MCP stack only loads when this command runs.
    from lilbee.mcp import main as mcp_main

    mcp_main()

1098 

1099 

# Sub-app grouping one-time installers for optional runtime components,
# mounted as `lilbee setup <component>`.
setup_app = typer.Typer(help="One-time setup for optional runtime components.")
app.add_typer(setup_app, name="setup")

1102 

1103 

@setup_app.command(name="crawler")
def setup_crawler_cmd() -> None:
    """Install Playwright's Chromium browser, needed for /crawl.

    No-op when Chromium is already present. Emits a simple progress
    readout; use '--json' mode on the top-level 'lilbee' command to get
    a single JSON blob with the final install state instead.
    """
    if chromium_installed():
        if cfg.json_mode:
            typer.echo(json.dumps({"component": "chromium", "already_installed": True}))
        else:
            typer.echo("Chromium already installed.")
        return

    # Last percentage printed; `nonlocal` replaces the one-element-list
    # closure workaround.
    last_pct = -1

    def _on_progress(event_type: object, data: object) -> None:
        # Progress callback: print each new download percentage to stderr
        # (suppressed entirely in JSON mode).
        nonlocal last_pct
        if event_type != EventType.SETUP_PROGRESS or not isinstance(data, SetupProgressEvent):
            return
        total = data.total_bytes or 0
        pct = int(data.downloaded_bytes * 100 / total) if total > 0 else 0
        if pct != last_pct and not cfg.json_mode:
            last_pct = pct
            typer.echo(msg.SETUP_CHROMIUM_CLI_PROGRESS.format(pct=pct), err=True)

    try:
        asyncio.run(bootstrap_chromium(on_progress=_on_progress))
    except CrawlerBrowserMissing as exc:
        if cfg.json_mode:
            typer.echo(json.dumps({"component": "chromium", "error": str(exc)}))
        else:
            typer.secho(f"Install failed: {exc}", fg=typer.colors.RED)
        raise typer.Exit(code=1) from exc

    if cfg.json_mode:
        typer.echo(json.dumps({"component": "chromium", "installed": True}))
    else:
        typer.echo("Chromium installed.")

1143 

1144 

# Sub-app for the wiki layer, mounted as `lilbee wiki <command>`.
wiki_app = typer.Typer(help="Wiki layer commands: generate, lint, citations, status, prune.")
app.add_typer(wiki_app, name="wiki")

1147 

1148 

@wiki_app.command(name="lint")
def wiki_lint(
    wiki_source: str = typer.Argument("", help="Wiki page path (empty = lint all)."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Lint wiki pages for stale citations, missing sources, and unmarked claims."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.lint import lint_all as _lint_all
    from lilbee.wiki.lint import lint_wiki_page

    store = get_services().store
    # One page when a path is given; otherwise the whole wiki.
    issues = lint_wiki_page(wiki_source, store) if wiki_source else _lint_all(store).issues

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_lint",
                "issues": [issue.to_dict() for issue in issues],
                "total": len(issues),
            }
        )
        return

    if not issues:
        console.print("No issues found.")
        return

    table = Table(title="Wiki Lint Issues")
    table.add_column("Page", style=theme.ACCENT)
    table.add_column("Severity")
    table.add_column("Message")
    for issue in issues:
        style = theme.WARNING if issue.severity.value != "error" else theme.ERROR
        table.add_row(
            issue.wiki_source,
            f"[{style}]{issue.severity.value}[/{style}]",
            issue.message,
        )
    console.print(table)

1190 

1191 

@wiki_app.command(name="citations")
def wiki_citations(
    wiki_source: str = typer.Argument(..., help="Wiki page path, e.g. wiki/summaries/doc.md."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show citations for a wiki page."""
    apply_overrides(data_dir=data_dir, use_global=use_global)

    records = get_services().store.get_citations_for_wiki(wiki_source)

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_citations",
                "wiki_source": wiki_source,
                "citations": [dict(record) for record in records],
                "total": len(records),
            }
        )
        return

    if not records:
        console.print(f"No citations found for [{theme.ACCENT}]{wiki_source}[/{theme.ACCENT}]")
        return

    table = Table(title=f"Citations: {wiki_source}")
    table.add_column("Key", style=theme.ACCENT)
    table.add_column("Source")
    table.add_column("Type", style=theme.MUTED)
    table.add_column("Excerpt", max_width=60)
    for record in records:
        excerpt = record["excerpt"]
        if len(excerpt) > 60:
            # 57 chars + "..." keeps the cell within the 60-char column.
            excerpt = excerpt[:57] + "..."
        table.add_row(
            record["citation_key"], record["source_filename"], record["claim_type"], excerpt
        )
    console.print(table)

1227 

1228 

@wiki_app.command(name="status")
def wiki_status(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show wiki layer status: page counts and lint summary."""
    apply_overrides(data_dir=data_dir, use_global=use_global)

    wiki_root = cfg.data_root / cfg.wiki_dir
    if not wiki_root.exists():
        if cfg.json_mode:
            json_output({"wiki_enabled": cfg.wiki, "pages": 0, "issues": 0})
        else:
            console.print("Wiki directory does not exist yet. Run sync with wiki enabled.")
        return

    summary_count = _count_md_files(wiki_root / SUMMARIES_SUBDIR)
    draft_count = _count_md_files(wiki_root / DRAFTS_SUBDIR)

    from lilbee.wiki.lint import lint_all as _lint_all

    report = _lint_all(get_services().store)

    if cfg.json_mode:
        json_output(
            {
                "wiki_enabled": cfg.wiki,
                SUMMARIES_SUBDIR: summary_count,
                DRAFTS_SUBDIR: draft_count,
                "pages": summary_count + draft_count,
                "lint_errors": report.error_count,
                "lint_warnings": report.warning_count,
            }
        )
        return

    state_color, state_label = ("green", "enabled") if cfg.wiki else ("red", "disabled")
    console.print(f"Wiki: [{state_color}]{state_label}[/{state_color}]")
    console.print(f" Summaries: [{theme.LABEL}]{summary_count}[/{theme.LABEL}]")
    console.print(f" Drafts: [{theme.LABEL}]{draft_count}[/{theme.LABEL}]")
    if not (report.error_count or report.warning_count):
        console.print(" Lint: all clean")
    else:
        console.print(
            f" Lint: [{theme.ERROR}]{report.error_count} error(s)[/{theme.ERROR}], "
            f"[{theme.WARNING}]{report.warning_count} warning(s)[/{theme.WARNING}]"
        )

1277 

1278 

@wiki_app.command(name="synthesize")
def wiki_synthesize(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Generate synthesis pages for concept clusters spanning 3+ sources."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if not cfg.wiki:
        _fail_wiki_disabled()
        return
    from lilbee.wiki.gen import generate_synthesis_pages

    services = get_services()
    generated = generate_synthesis_pages(services.provider, services.store, services.clusterer)

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_synthesize",
                "paths": [str(page) for page in generated],
                "count": len(generated),
            }
        )
        return

    if not generated:
        console.print("No synthesis pages generated (need 3+ sources per cluster).")
        return

    console.print(f"Generated [{theme.LABEL}]{len(generated)}[/{theme.LABEL}] synthesis pages:")
    for page in generated:
        console.print(f" {page}")

1311 

1312 

@wiki_app.command(name="prune")
def wiki_prune(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Prune stale and orphaned wiki pages."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.prune import prune_wiki

    report = prune_wiki(get_services().store)

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_prune",
                "records": [record.to_dict() for record in report.records],
                "archived": report.archived_count,
                "flagged": report.flagged_count,
            }
        )
        return

    if not report.records:
        console.print("No pages pruned.")
        return

    table = Table(title="Wiki Prune Results")
    table.add_column("Page", style=theme.ACCENT)
    table.add_column("Action")
    table.add_column("Reason")
    for record in report.records:
        style = theme.ERROR if record.action.value == "archived" else theme.WARNING
        table.add_row(
            record.wiki_source,
            f"[{style}]{record.action.value}[/{style}]",
            record.reason,
        )
    console.print(table)

1348 

1349 

1350def _count_md_files(directory: Path) -> int: 

1351 """Count markdown files in a directory.""" 

1352 if not directory.exists(): 

1353 return 0 

1354 return len(list(directory.rglob("*.md"))) 

1355 

1356 

def _fail_wiki_disabled() -> None:
    """Emit the standard wiki-disabled message in the caller's output mode."""
    if cfg.json_mode:
        json_output({"error": msg.CMD_WIKI_DISABLED})
    else:
        console.print(msg.CMD_WIKI_DISABLED)

1363 

1364 

@wiki_app.command(name="build")
def wiki_build(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help=(
            "Run extraction only; skip every LLM call. Prints the NER entity candidates. "
            "LLM-curated concept pages require a build call and are not shown in dry-run."
        ),
    ),
) -> None:
    """Build the concept and entity wiki across all ingested sources."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if not cfg.wiki:
        _fail_wiki_disabled()
        return

    if dry_run:
        from lilbee.store import SearchChunk
        from lilbee.wiki.entity_extractor import get_entity_extractor

        svc = get_services()
        # Gather every chunk from every ingested source for extraction.
        all_chunks: list[SearchChunk] = []
        for source in svc.store.get_sources():
            all_chunks.extend(svc.store.get_chunks_by_source(source["filename"]))
        extractor = get_entity_extractor(cfg.wiki_entity_mode, svc.provider, cfg)
        _wiki_build_dry_run_output(extractor.extract(all_chunks))
        return

    from lilbee.wiki import run_full_build

    result = run_full_build(cfg)

    if cfg.json_mode:
        json_output({"command": "wiki_build", **result})
        return

    pages = result["paths"]
    if not pages:
        console.print("No concept or entity pages generated.")
        return

    console.print(
        f"Generated [{theme.LABEL}]{result['count']}[/{theme.LABEL}] "
        f"wiki pages from {result['entities']} extracted records:"
    )
    for page in pages:
        console.print(f" {page}")

1416 

1417 

# Appended to every dry-run rendering: concept pages come from an LLM pass
# that --dry-run deliberately skips, so only NER entities can be listed.
_DRY_RUN_CONCEPT_NOTE = (
    "Note: LLM-curated concepts are not shown in --dry-run. "
    "Run `lilbee wiki build` to see which concepts the LLM proposes."
)

1422 

1423 

def _wiki_build_dry_run_output(entities: list[ExtractedEntity]) -> None:
    """Render the extraction result as JSON or table without calling any LLM.

    Phase D: concepts come from the per-source batched LLM call, so
    listing them would require the call we are trying to avoid. The
    dry-run surface is NER-entity only, with a trailing note so a
    user who expected concepts in the output knows why they are
    missing.
    """
    rows: list[dict[str, Any]] = []
    for entity in entities:
        rows.append(
            {
                "slug": entity.slug,
                "label": entity.label,
                "kind": entity.kind.value,
                "type_hint": entity.type_hint,
                "mentions": len(entity.chunk_refs),
                "sources": sorted({ref.source for ref in entity.chunk_refs}),
            }
        )

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_build",
                "dry_run": True,
                "entities": rows,
                "count": len(rows),
                "note": _DRY_RUN_CONCEPT_NOTE,
            }
        )
        return

    if not rows:
        console.print("No candidate entities extracted. Run sync first.")
        console.print(f"[{theme.MUTED}]{_DRY_RUN_CONCEPT_NOTE}[/{theme.MUTED}]")
        return

    table = Table(title=f"Wiki build dry-run ({len(rows)} NER entity candidates)")
    table.add_column("Slug", style=theme.ACCENT)
    table.add_column("Kind", style=theme.MUTED)
    table.add_column("Type")
    table.add_column("Mentions")
    table.add_column("Sources")
    for row in rows:
        sources: list[str] = row["sources"]
        # At most three sources, with a trailing ellipsis when truncated.
        shown_sources = ", ".join(sources[:3])
        if len(sources) > 3:
            shown_sources += ", ..."
        table.add_row(
            str(row["slug"]),
            str(row["kind"]),
            str(row["type_hint"]),
            str(row["mentions"]),
            shown_sources,
        )
    console.print(table)
    console.print(
        f"Dry run: [{theme.LABEL}]{len(rows)}[/{theme.LABEL}] candidate entities. "
        "No LLM calls were made."
    )
    console.print(f"[{theme.MUTED}]{_DRY_RUN_CONCEPT_NOTE}[/{theme.MUTED}]")

1483 

1484 

@wiki_app.command(name="update")
def wiki_update(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Refresh the concept and entity wiki after an ingest.

    Currently a full rebuild. The incremental touched-slug regeneration
    lands in the ingest-hook task and will re-route this command then.
    """
    # Delegate straight to the full build; dry_run is always off here.
    wiki_build(data_dir, use_global, False)

1496 

1497 

# Sub-app for draft review, mounted as `lilbee wiki drafts <command>`.
drafts_app = typer.Typer(help="Review wiki drafts: list, diff, accept, reject.")
wiki_app.add_typer(drafts_app, name="drafts")

1500 

1501 

@drafts_app.command(name="list")
def wiki_drafts_list(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """List pending wiki drafts with drift, faithfulness, and pairing info."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.drafts import PENDING_KIND_DRIFT, list_drafts

    drafts = list_drafts(cfg.data_root / cfg.wiki_dir)

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_drafts_list",
                "drafts": [draft.to_dict() for draft in drafts],
                "total": len(drafts),
            }
        )
        return

    if not drafts:
        console.print("No drafts pending review.")
        return

    table = Table(title="Wiki Drafts")
    table.add_column("Slug", style=theme.ACCENT)
    table.add_column("Kind", style=theme.MUTED)
    table.add_column("Drift")
    table.add_column("Faithfulness")
    table.add_column("Published?", style=theme.MUTED)
    for draft in drafts:
        table.add_row(
            draft.slug,
            draft.pending_kind or PENDING_KIND_DRIFT,
            "-" if draft.drift_ratio is None else f"{draft.drift_ratio:.0%}",
            "-" if draft.faithfulness_score is None else f"{draft.faithfulness_score:.2f}",
            "yes" if draft.published_exists else "no",
        )
    console.print(table)

1541 

1542 

@drafts_app.command(name="diff")
def wiki_drafts_diff(
    slug: str = typer.Argument(..., help="Draft slug (e.g. chevrolet)."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show a unified diff of the draft against its published counterpart."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.drafts import diff_draft

    try:
        diff = diff_draft(slug, cfg.data_root / cfg.wiki_dir)
    except FileNotFoundError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]{exc}[/{theme.ERROR}]")
        raise typer.Exit(1) from None

    if cfg.json_mode:
        json_output({"command": "wiki_drafts_diff", "slug": slug, "diff": diff})
    else:
        console.print(diff or "(no differences)")

1567 

1568 

@drafts_app.command(name="accept")
def wiki_drafts_accept(
    slug: str = typer.Argument(..., help="Draft slug to accept."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Overwrite the published page with the draft and re-index its chunks."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.drafts import accept_draft

    try:
        result = accept_draft(slug, cfg.data_root / cfg.wiki_dir, get_services().store)
    except FileNotFoundError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]{exc}[/{theme.ERROR}]")
        raise typer.Exit(1) from None

    if cfg.json_mode:
        json_output({"command": "wiki_drafts_accept", **result.to_dict()})
        return
    console.print(
        f"Accepted [{theme.ACCENT}]{slug}[/{theme.ACCENT}] -> "
        f"{result.moved_to} ({result.reindexed_chunks} chunks re-indexed)"
    )

1596 

1597 

@drafts_app.command(name="reject")
def wiki_drafts_reject(
    slug: str = typer.Argument(..., help="Draft slug to reject."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Delete the draft file. Does not touch the published page or index."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.drafts import reject_draft

    try:
        reject_draft(slug, cfg.data_root / cfg.wiki_dir)
    except FileNotFoundError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]{exc}[/{theme.ERROR}]")
        raise typer.Exit(1) from None

    if cfg.json_mode:
        json_output({"command": "wiki_drafts_reject", "slug": slug})
    else:
        console.print(f"Rejected [{theme.ACCENT}]{slug}[/{theme.ACCENT}]")