Coverage for src/lilbee/cli/commands.py: 100%

827 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""CLI command definitions registered on the app.""" 

2 

3from __future__ import annotations 

4 

5import asyncio 

6import json 

7import sys 

8from pathlib import Path 

9from typing import TYPE_CHECKING, Any 

10 

11import typer 

12 

13if TYPE_CHECKING: 

14 import uvicorn 

15 

16 from lilbee.wiki.entity_extractor import ExtractedEntity 

17from rich.table import Table 

18 

19from lilbee.cli import theme 

20from lilbee.cli.app import ( 

21 app, 

22 apply_overrides, 

23 console, 

24 data_dir_option, 

25 global_option, 

26 model_option, 

27 num_ctx_option, 

28 repeat_penalty_option, 

29 seed_option, 

30 temperature_option, 

31 top_k_sampling_option, 

32 top_p_option, 

33) 

34from lilbee.cli.helpers import ( 

35 CopyResult, 

36 add_paths, 

37 auto_sync, 

38 clean_result, 

39 copy_files, 

40 gather_status, 

41 get_version, 

42 json_output, 

43 perform_reset, 

44 render_status, 

45 sync_result_to_json, 

46) 

47from lilbee.cli.tui import messages as msg 

48from lilbee.config import cfg 

49from lilbee.crawler import CrawlerBrowserMissing, bootstrap_chromium, chromium_installed, is_url 

50from lilbee.progress import EventType, SetupProgressEvent 

51from lilbee.providers.base import ProviderError 

52from lilbee.services import get_services 

53from lilbee.store import SearchScope, scope_to_chunk_type 

54from lilbee.wiki.shared import ( 

55 DRAFTS_SUBDIR, 

56 SUMMARIES_SUBDIR, 

57) 

58 

CHUNK_PREVIEW_LEN = 80  # characters shown in human-readable search output

# Shared typer option objects, defined once so the same flags behave
# identically across the commands that reuse them (sync, rebuild, add, ...).
_ocr_option = typer.Option(None, "--ocr/--no-ocr", help="Force vision OCR on/off for scanned PDFs.")
_ocr_timeout_option = typer.Option(
    None,
    "--ocr-timeout",
    help="Per-page timeout in seconds for vision OCR (default: 120, 0 = no limit).",
)
# Search scope: defaults to BOTH (raw chunks plus wiki pages).
_scope_option = typer.Option(
    SearchScope.BOTH,
    "--scope",
    "-s",
    help="Restrict the pool to raw chunks, wiki pages, or both (default).",
    case_sensitive=False,
)

74 

75 

def _apply_ocr_overrides(ocr: bool | None, ocr_timeout: float | None) -> None:
    """Apply --ocr/--no-ocr and --ocr-timeout CLI overrides to config."""
    # A None value means the flag was not passed; leave config untouched.
    for attr_name, override in (("enable_ocr", ocr), ("ocr_timeout", ocr_timeout)):
        if override is not None:
            setattr(cfg, attr_name, override)

82 

83 

# Positional argument shared by commands that ingest files, directories, or URLs.
_paths_argument = typer.Argument(
    ...,
    help="Files, directories, or URLs to add to the knowledge base.",
)

88 

89 

@app.command()
def search(
    query: str = typer.Argument(..., help="Search query"),
    top_k: int = typer.Option(None, "--top-k", "-k", help="Number of results"),
    scope: SearchScope = _scope_option,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Search the knowledge base for relevant chunks.

    Prints a table (or a JSON payload in --json mode) of matching chunks with
    either a relevance score or a raw distance, depending on which the
    searcher returned. Exits 1 on an empty query or a search failure.
    """
    apply_overrides(data_dir=data_dir, use_global=use_global)

    if not query or not query.strip():
        if cfg.json_mode:
            json_output({"error": "query must not be empty"})
            raise SystemExit(1)
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] query must not be empty")
        raise SystemExit(1)

    try:
        results = get_services().searcher.search(
            query,
            top_k=top_k or cfg.top_k,
            chunk_type=scope_to_chunk_type(scope),
        )
    except Exception as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
            raise SystemExit(1) from None
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None
    cleaned = [clean_result(r) for r in results]

    if cfg.json_mode:
        json_output({"command": "search", "query": query, "results": cleaned})
        return

    if not cleaned:
        console.print("No results found.")
        return

    has_relevance = any("relevance_score" in r for r in cleaned)
    table = Table(title="Search Results")
    table.add_column("Source", style=theme.ACCENT)
    table.add_column("Chunk", max_width=80)
    score_label = "Score" if has_relevance else "Distance"
    table.add_column(score_label, justify="right", style=theme.MUTED)

    for r in cleaned:
        chunk_text = r["chunk"]
        preview = chunk_text[:CHUNK_PREVIEW_LEN]
        if len(chunk_text) > CHUNK_PREVIEW_LEN:
            preview += "..."
        # Explicit None checks: chaining with `or` would treat a legitimate
        # score of 0.0 as missing and silently fall through to the distance.
        score = r.get("relevance_score")
        if score is None:
            score = r.get("distance")
        if score is None:
            score = 0.0
        table.add_row(r["source"], preview, f"{score:.4f}")
    console.print(table)

145 

146 

@app.command(name="sync")
def sync_cmd(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    ocr: bool | None = _ocr_option,
    ocr_timeout: float | None = _ocr_timeout_option,
) -> None:
    """Manually trigger document sync."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    _apply_ocr_overrides(ocr, ocr_timeout)
    from lilbee.ingest import sync

    try:
        outcome = asyncio.run(sync(quiet=cfg.json_mode))
    except RuntimeError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None
    if cfg.json_mode:
        json_output(sync_result_to_json(outcome))
    else:
        console.print(outcome)

171 

172 

@app.command()
def rebuild(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    ocr: bool | None = _ocr_option,
    ocr_timeout: float | None = _ocr_timeout_option,
) -> None:
    """Nuke the DB and re-ingest everything from documents/."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    _apply_ocr_overrides(ocr, ocr_timeout)
    from lilbee.ingest import sync

    try:
        outcome = asyncio.run(sync(force_rebuild=True, quiet=cfg.json_mode))
    except RuntimeError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None
    ingested_count = len(outcome.added)
    if cfg.json_mode:
        json_output({"command": "rebuild", "ingested": ingested_count})
    else:
        console.print(f"Rebuilt: {ingested_count} documents ingested")

197 

198 

# Options controlling the `add` command's copy and crawl behavior.
_force_option = typer.Option(False, "--force", "-f", help="Overwrite existing files.")
_crawl_option = typer.Option(
    False,
    "--crawl",
    help="Recursively crawl URLs (whole site by default; see --depth and --max-pages).",
)
# None = no cap at all; 0 = fetch only the start URL itself.
_depth_option = typer.Option(
    None,
    "--depth",
    help="Cap link-follow depth for --crawl. Unset = unbounded; 0 = single URL only.",
)
_max_pages_option = typer.Option(
    None,
    "--max-pages",
    help="Cap total pages for --crawl. Unset = no limit; positive int = hard cap.",
)
_include_subdomains_option = typer.Option(
    False,
    "--include-subdomains",
    help=(
        "Allow --crawl to follow links into sibling subdomains of the start "
        "host (e.g. en.wikipedia.org plus af.wikipedia.org). Default scopes "
        "the crawl to the exact start host only."
    ),
)

224 

225 

def _partition_inputs(inputs: list[str]) -> tuple[list[Path], list[str]]:
    """Split mixed CLI inputs into local file paths and URLs."""
    local_paths: list[Path] = []
    url_inputs: list[str] = []
    for item in inputs:
        bucket_is_url = is_url(item)
        if bucket_is_url:
            url_inputs.append(item)
        else:
            local_paths.append(Path(item))
    return local_paths, url_inputs

236 

237 

def _crawl_urls_blocking(
    urls: list[str],
    *,
    crawl: bool,
    depth: int | None,
    max_pages: int | None,
    include_subdomains: bool = False,
) -> list[Path]:
    """Crawl URLs synchronously (for CLI), returning paths written.

    Without --crawl, each URL is fetched as a single page (depth=0).
    With --crawl, the default is whole-site unbounded (depth=None, pages=None).
    Explicit --depth / --max-pages override both.

    Ctrl-C is handled by running the crawl through _run_crawl_with_signal_cancel,
    which installs a signal.signal handler that sets a threading.Event passed
    into crawl_and_save. crawl_recursive polls the event between pages so the
    signal flows through as a clean cancel instead of asyncio.run's default
    KeyboardInterrupt-raising (which left browser contexts mid-teardown).
    """
    import threading

    from rich.progress import Progress, SpinnerColumn, TaskID, TextColumn

    from lilbee.crawler import crawl_and_save
    from lilbee.progress import CrawlPageEvent, DetailedProgressCallback, EventType, ProgressEvent

    if crawl:
        effective_depth = depth
        effective_pages = max_pages
    else:
        # No --crawl: treat each URL as a single page, no page cap needed.
        effective_depth = 0
        effective_pages = None

    # Shared across all URLs so a Ctrl-C during one URL aborts the rest too.
    cancel_event = threading.Event()

    from rich.console import Console as RichConsole

    # Progress is rendered on stderr (and disabled in JSON mode) so stdout
    # stays clean for machine consumers.
    err_console = RichConsole(stderr=True)
    all_paths: list[Path] = []
    with Progress(
        SpinnerColumn(),
        TextColumn("{task.description}"),
        transient=True,
        console=err_console,
        disable=cfg.json_mode,
    ) as progress:
        for url in urls:
            if cancel_event.is_set():
                break
            ptask = progress.add_task(f"Crawling {url}...", total=None)

            def _make_callback(_t: TaskID = ptask) -> DetailedProgressCallback:
                # The task id is bound as a default argument so the closure
                # doesn't late-bind the loop variable.
                def on_progress(event_type: EventType, data: ProgressEvent) -> None:
                    if event_type == EventType.CRAWL_PAGE:
                        if not isinstance(data, CrawlPageEvent):
                            raise TypeError(f"Expected CrawlPageEvent, got {type(data).__name__}")
                        # total <= 0 means the crawler doesn't yet know the
                        # page count; show "?" instead of a bogus number.
                        total_str = str(data.total) if data.total > 0 else "?"
                        progress.update(
                            _t,
                            description=f"Crawled {data.current}/{total_str}: {data.url}",
                        )

                return on_progress

            paths = _run_crawl_with_signal_cancel(
                url,
                depth=effective_depth,
                max_pages=effective_pages,
                on_progress=_make_callback(),
                cancel_event=cancel_event,
                crawl_and_save=crawl_and_save,
                include_subdomains=include_subdomains,
            )
            all_paths.extend(paths)
            progress.update(ptask, description=f"Done: {url} ({len(paths)} pages)")
    return all_paths

315 

316 

def _run_crawl_with_signal_cancel(
    url: str,
    *,
    depth: int | None,
    max_pages: int | None,
    on_progress: object,
    cancel_event: object,
    crawl_and_save: object,
    include_subdomains: bool = False,
) -> list[Path]:
    """Run crawl_and_save on a dedicated event loop with a SIGINT->cancel hook.

    asyncio.run() installs its own SIGINT handler that raises
    KeyboardInterrupt, which tears the crawl down ungracefully. Registering a
    plain signal.signal handler on the main thread AND running the crawl on a
    loop we own (instead of asyncio.run) lets Ctrl-C set our threading.Event,
    which crawl_recursive polls between pages so it can close the stream and
    stop dispatch cleanly.

    Parameters are typed as ``object`` to keep the crawler an optional import;
    ``crawl_and_save`` is injected by the caller.
    """
    import signal

    # Remember whatever handler was installed before us so it can be
    # restored in the finally block, even if the crawl raises.
    previous_handler = signal.getsignal(signal.SIGINT)

    def _on_sigint(_signum: int, _frame: object) -> None:
        # Set the cancel event that crawl_recursive polls between pages, so
        # a Ctrl-C flows through as a clean cancel instead of asyncio.run's
        # default KeyboardInterrupt-raising dance.
        cancel_event.set()  # type: ignore[attr-defined]

    signal.signal(signal.SIGINT, _on_sigint)
    # Manage the event loop explicitly. In the CLI this runs once per process,
    # but under pytest-xdist the same worker thread runs many tests; leaving a
    # closed loop set as the "current" loop for the thread poisons every later
    # asyncio.get_event_loop() call and hangs macOS 3.12/3.13 unit-test CI.
    # Always clear the thread-current loop in finally.
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        coro = crawl_and_save(  # type: ignore[operator]
            url,
            depth=depth,
            max_pages=max_pages,
            on_progress=on_progress,
            cancel=cancel_event,
            quiet=cfg.json_mode,
            include_subdomains=include_subdomains,
        )
        return loop.run_until_complete(coro)
    finally:
        loop.close()
        asyncio.set_event_loop(None)
        signal.signal(signal.SIGINT, previous_handler)

369 

370 

@app.command()
def add(
    paths: list[str] = _paths_argument,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    force: bool = _force_option,
    ocr: bool | None = _ocr_option,
    ocr_timeout: float | None = _ocr_timeout_option,
    crawl: bool = _crawl_option,
    depth: int | None = _depth_option,
    max_pages: int | None = _max_pages_option,
    include_subdomains: bool = _include_subdomains_option,
) -> None:
    """Copy files or crawl URLs into the knowledge base and ingest them.

    Inputs are split into local paths and URLs. Local paths are validated up
    front; URLs are crawled first, then a sync ingests everything. Exits 1
    when a path is missing, the crawler extra is not installed, or the
    crawl/sync raises RuntimeError.
    """
    apply_overrides(data_dir=data_dir, use_global=use_global)
    _apply_ocr_overrides(ocr, ocr_timeout)

    file_paths, urls = _partition_inputs(paths)
    # Validate file paths exist before any crawling or copying happens.
    for fp in file_paths:
        if not fp.exists():
            if cfg.json_mode:
                json_output({"error": f"Path not found: {fp}"})
                raise SystemExit(1)
            console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] Path not found: {fp}")
            raise SystemExit(1)

    try:
        # Crawl URLs first (saves .md files into documents/_web/)
        crawled_paths: list[Path] = []
        if urls:
            from lilbee.crawler import crawler_available

            # The crawler is an optional extra; fail with an install hint.
            if not crawler_available():
                console.print(
                    f"[{theme.ERROR}]Web crawling requires: "
                    f"pip install 'lilbee[crawler]'[/{theme.ERROR}]"
                )
                raise SystemExit(1)
            crawled_paths = _crawl_urls_blocking(
                urls,
                crawl=crawl,
                depth=depth,
                max_pages=max_pages,
                include_subdomains=include_subdomains,
            )
            if not cfg.json_mode:
                console.print(
                    f"[{theme.MUTED}]Crawled {len(crawled_paths)} page(s)"
                    f" from {len(urls)} URL(s)[/{theme.MUTED}]"
                )

        if cfg.json_mode:
            from lilbee.ingest import sync

            # JSON mode: copy + sync quietly, then emit one combined payload.
            copy_result = CopyResult()
            if file_paths:
                copy_result = copy_files(file_paths, force=force)
            result = asyncio.run(sync(quiet=True))
            json_output(
                {
                    "command": "add",
                    "copied": copy_result.copied,
                    "skipped": copy_result.skipped,
                    "crawled": len(crawled_paths),
                    "sync": sync_result_to_json(result),
                }
            )
            return

        if file_paths:
            # Human mode with files: add_paths copies and syncs with output.
            add_paths(file_paths, console, force=force)
        elif urls:
            # URLs already saved; just trigger sync
            from lilbee.ingest import sync

            result = asyncio.run(sync())
            console.print(result)
    except RuntimeError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
            raise SystemExit(1) from None
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None

455 

456 

# Positional argument for `chunks`: which ingested source (filename) to inspect.
_chunks_source_argument = typer.Argument(..., help="Source name to inspect chunks for.")

458 

459 

@app.command()
def chunks(
    source: str = _chunks_source_argument,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show chunks a document was split into (useful for debugging retrieval)."""
    apply_overrides(data_dir=data_dir, use_global=use_global)

    store = get_services().store
    known_sources = {entry["filename"] for entry in store.get_sources()}
    if source not in known_sources:
        if cfg.json_mode:
            json_output({"error": f"Source not found: {source}"})
        else:
            console.print(f"[{theme.ERROR}]Source not found:[/{theme.ERROR}] {source}")
        raise SystemExit(1)

    # Chunks come back unordered; sort by their position in the document.
    ordered = sorted(
        (clean_result(chunk) for chunk in store.get_chunks_by_source(source)),
        key=lambda chunk: chunk.get("chunk_index", 0),
    )

    if cfg.json_mode:
        json_output({"command": "chunks", "source": source, "chunks": ordered})
        return

    console.print(
        f"[{theme.LABEL}]{len(ordered)}[/{theme.LABEL}]"
        f" chunks from [{theme.ACCENT}]{source}[/{theme.ACCENT}]\n"
    )
    for chunk in ordered:
        body = chunk.get("chunk", "")
        suffix = "..." if len(body) > CHUNK_PREVIEW_LEN else ""
        console.print(f"  [{chunk.get('chunk_index', '?')}] {body[:CHUNK_PREVIEW_LEN]}{suffix}")

498 

499 

# Argument/option pair for the `remove` command.
_remove_names_argument = typer.Argument(
    ..., help="Source name(s) to remove from the knowledge base."
)

_delete_file_option = typer.Option(
    False, "--delete", help="Also delete the file from the documents directory."
)

507 

508 

@app.command()
def remove(
    names: list[str] = _remove_names_argument,
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    delete_file: bool = _delete_file_option,
) -> None:
    """Remove documents from the knowledge base by source name."""
    apply_overrides(data_dir=data_dir, use_global=use_global)

    outcome = get_services().store.remove_documents(
        names, delete_files=delete_file, documents_dir=cfg.documents_dir
    )
    # Exit 1 only when nothing at all was removed and something was missing.
    total_failure = not outcome.removed and bool(outcome.not_found)

    if cfg.json_mode:
        payload: dict = {"command": "remove", "removed": outcome.removed}
        if outcome.not_found:
            payload["not_found"] = outcome.not_found
        json_output(payload)
        if total_failure:
            raise SystemExit(1)
        return

    for removed_name in outcome.removed:
        console.print(f"Removed [{theme.ACCENT}]{removed_name}[/{theme.ACCENT}]")
    for missing_name in outcome.not_found:
        console.print(f"[{theme.ERROR}]Not found:[/{theme.ERROR}] {missing_name}")
    if total_failure:
        raise SystemExit(1)

538 

539 

@app.command()
def ask(
    question: str = typer.Argument(..., help="Question to ask"),
    scope: SearchScope = _scope_option,
    data_dir: Path | None = data_dir_option,
    model: str | None = model_option,
    use_global: bool = global_option,
    temperature: float | None = temperature_option,
    top_p: float | None = top_p_option,
    top_k_sampling: int | None = top_k_sampling_option,
    repeat_penalty: float | None = repeat_penalty_option,
    num_ctx: int | None = num_ctx_option,
    seed: int | None = seed_option,
) -> None:
    """Ask a one-shot question (auto-syncs first).

    In JSON mode the full answer plus sources is emitted as one payload; in
    terminal mode the answer streams token-by-token. Exits 1 on provider or
    runtime errors.
    """
    apply_overrides(
        data_dir=data_dir,
        model=model,
        use_global=use_global,
        temperature=temperature,
        top_p=top_p,
        top_k_sampling=top_k_sampling,
        repeat_penalty=repeat_penalty,
        num_ctx=num_ctx,
        seed=seed,
    )

    try:
        from lilbee.models import ensure_chat_model

        # Validate both models up front so failures surface before syncing.
        ensure_chat_model()
        get_services().embedder.validate_model()
        if cfg.json_mode:
            from rich.console import Console as _QuietConsole

            # Quiet console: suppress sync progress so stdout stays pure JSON.
            auto_sync(_QuietConsole(quiet=True))
        else:
            auto_sync(console)

        chunk_type = scope_to_chunk_type(scope)

        if cfg.json_mode:
            # Non-streaming path: collect answer + sources into one payload.
            result = get_services().searcher.ask_raw(question, chunk_type=chunk_type)
            json_output(
                {
                    "command": "ask",
                    "question": question,
                    "answer": result.answer,
                    "sources": [clean_result(s) for s in result.sources],
                }
            )
            return

        # Streaming path: print tokens as they arrive, newline at the end.
        for token in get_services().searcher.ask_stream(question, chunk_type=chunk_type):
            console.print(token.content, end="")
        console.print()
    except (RuntimeError, ProviderError) as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
            raise SystemExit(1) from None
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] {exc}")
        raise SystemExit(1) from None

602 

603 

@app.command()
def chat(
    data_dir: Path | None = data_dir_option,
    model: str | None = model_option,
    use_global: bool = global_option,
    temperature: float | None = temperature_option,
    top_p: float | None = top_p_option,
    top_k_sampling: int | None = top_k_sampling_option,
    repeat_penalty: float | None = repeat_penalty_option,
    num_ctx: int | None = num_ctx_option,
    seed: int | None = seed_option,
) -> None:
    """Interactive chat loop (auto-syncs first)."""
    apply_overrides(
        data_dir=data_dir,
        model=model,
        use_global=use_global,
        temperature=temperature,
        top_p=top_p,
        top_k_sampling=top_k_sampling,
        repeat_penalty=repeat_penalty,
        num_ctx=num_ctx,
        seed=seed,
    )

    # The TUI needs a real interactive terminal: reject JSON mode and
    # redirected stdin/stdout before launching it.
    if cfg.json_mode:
        json_output({"error": "Chat requires a terminal, not --json"})
        raise SystemExit(1)
    is_interactive = sys.stdin.isatty() and sys.stdout.isatty()
    if not is_interactive:
        console.print(f"[{theme.ERROR}]Error:[/{theme.ERROR}] Chat requires a terminal.")
        raise SystemExit(1)
    from lilbee.cli.tui import run_tui

    run_tui(auto_sync=True)

638 

639 

@app.command()
def version() -> None:
    """Show the lilbee version."""
    current = get_version()
    if cfg.json_mode:
        json_output({"command": "version", "version": current})
    else:
        console.print(f"lilbee {current}")

648 

649 

# Models used by `self-check`: small GGUFs that are quick to download but
# still exercise llama.cpp's decoder (chat) and encoder (embedding) paths.
_SELF_CHECK_CHAT_REPO = "bartowski/SmolLM2-135M-Instruct-GGUF"
_SELF_CHECK_CHAT_FILE = "SmolLM2-135M-Instruct-Q3_K_S.gguf"
_SELF_CHECK_EMBED_REPO = "nomic-ai/nomic-embed-text-v1.5-GGUF"
_SELF_CHECK_EMBED_FILE = "nomic-embed-text-v1.5.Q4_K_M.gguf"

654 

655 

def _download_self_check_model(repo: str, filename: str) -> Path:
    """Fetch a GGUF from the HuggingFace CDN via urllib (stdlib only).

    Avoids huggingface_hub / httpx entirely. Inside the PyInstaller --onefile
    bundle, huggingface_hub's retry path has re-entered a closed httpx client
    after transient DNS failures on macOS runners. urllib is synchronous,
    lives in the stdlib, and has no long-lived client to close.

    Args:
        repo: HuggingFace repo id (e.g. "bartowski/SmolLM2-135M-Instruct-GGUF").
        filename: GGUF file name on the repo's main branch.

    Returns:
        Path to the downloaded file inside a fresh temporary directory.

    Raises:
        RuntimeError: when all three download attempts fail.
    """
    import tempfile
    import urllib.error  # explicit: URLError is referenced in the except below
    import urllib.request

    # The resolve URL must end with the requested GGUF filename; previously
    # it ended with a literal placeholder, so every download 404'd.
    url = f"https://huggingface.co/{repo}/resolve/main/{filename}"
    dest_dir = Path(tempfile.mkdtemp(prefix="lilbee-self-check-"))
    dest = dest_dir / filename
    console.print(f"Downloading {url}")
    last_exc: BaseException | None = None
    for attempt in range(3):
        try:
            with urllib.request.urlopen(url, timeout=120) as response:  # noqa: S310 — literal https url
                dest.write_bytes(response.read())
            return dest
        except (OSError, urllib.error.URLError) as exc:
            last_exc = exc
            console.print(f"download attempt {attempt + 1} failed: {exc!r}")
    raise RuntimeError(f"GGUF download failed after 3 attempts: {last_exc!r}")

681 

682 

# Options for `self-check`; the *-model-path options enable offline runs by
# skipping the HuggingFace download entirely.
_self_check_chat_path_option = typer.Option(
    None,
    "--chat-model-path",
    help="Path to a chat GGUF file. Skips the HuggingFace download.",
)
_self_check_embed_path_option = typer.Option(
    None,
    "--embed-model-path",
    help="Path to an embedding GGUF file. Skips the HuggingFace download.",
)
_self_check_max_tokens_option = typer.Option(5, "--max-tokens", help="Tokens to generate.")
_self_check_skip_embedding_option = typer.Option(
    False,
    "--skip-embedding",
    help="Skip the embedding-model leg of the self-check.",
)

699 

700 

def _self_check_emit_failure(error: str) -> None:
    """Report a self-check failure in the active output mode (JSON or rich)."""
    if not cfg.json_mode:
        console.print(f"[{theme.ERROR}]SELF-CHECK FAILED:[/{theme.ERROR}] {error}")
        return
    json_output({"ok": False, "error": error})

707 

@app.command("self-check")
def self_check_cmd(
    chat_model_path: Path | None = _self_check_chat_path_option,
    embed_model_path: Path | None = _self_check_embed_path_option,
    max_tokens: int = _self_check_max_tokens_option,
    skip_embedding: bool = _self_check_skip_embedding_option,
) -> None:
    """Verify the installation can load llama.cpp and run real inference.

    Two legs:

    1. **Chat**: downloads ``SmolLM2-135M-Instruct-Q3_K_S.gguf`` (~90MB) and
       runs a tiny ``create_completion`` so we know decoder-style models work
       end-to-end and the vendored shared libraries load.
    2. **Embedding**: downloads ``nomic-embed-text-v1.5.Q4_K_M.gguf`` (~84MB)
       and runs ``create_embedding``. This is the leg that catches the
       "Memory is not initialized" assert from llama-cpp-python <0.3.19, where
       BERT-style encoders trip ``kv_cache_clear`` on a context that never
       allocated memory.

    Exits 0 on success, 1 on any failure. Intended for post-install
    verification and as the end-to-end gate in release CI.
    """
    from typing import Any, cast

    # --- Leg 1: chat (decoder) ---
    try:
        chat_path = chat_model_path or _download_self_check_model(
            _SELF_CHECK_CHAT_REPO, _SELF_CHECK_CHAT_FILE
        )
        console.print(f"Loading chat model {chat_path}")

        import llama_cpp

        from lilbee.providers.llama_cpp_provider import install_llama_log_handler

        # Route llama.cpp's native log output through our handler before
        # loading any model.
        install_llama_log_handler()
        llm = llama_cpp.Llama(model_path=str(chat_path), n_ctx=256, verbose=False)
        # stream=False (default) returns a dict, not an iterator, but
        # create_completion's return type is a union; cast to Any so the
        # indexing below type-checks without forcing llama_cpp to be a
        # typecheck-time dep of lilbee.
        out = cast(Any, llm.create_completion("2+2=", max_tokens=max_tokens))
        text: str = out["choices"][0]["text"]
    except Exception as exc:
        _self_check_emit_failure(repr(exc))
        raise typer.Exit(1) from exc

    if not text.strip():
        _self_check_emit_failure("empty inference response")
        raise typer.Exit(1)

    # --- Leg 2: embedding (encoder), skippable via --skip-embedding ---
    embedding_dims: int | None = None
    if not skip_embedding:
        try:
            embed_path = embed_model_path or _download_self_check_model(
                _SELF_CHECK_EMBED_REPO, _SELF_CHECK_EMBED_FILE
            )
            console.print(f"Loading embedding model {embed_path}")
            enc = llama_cpp.Llama(
                model_path=str(embed_path),
                embedding=True,
                n_ctx=512,
                verbose=False,
            )
            emb = cast(Any, enc.create_embedding(input=["test"]))
            vec = emb["data"][0]["embedding"]
        except Exception as exc:
            _self_check_emit_failure(repr(exc))
            raise typer.Exit(1) from exc

        if not vec:
            _self_check_emit_failure("empty embedding vector")
            raise typer.Exit(1)
        embedding_dims = len(vec)

    # --- Report success in the active output mode ---
    if cfg.json_mode:
        payload: dict[str, Any] = {
            "ok": True,
            "chat_response": text,
            "chat_model": str(chat_path),
        }
        if embedding_dims is not None:
            payload["embedding_dims"] = embedding_dims
        json_output(payload)
    else:
        console.print(f"Chat response: {text!r}")
        if embedding_dims is not None:
            console.print(f"Embedding dims: {embedding_dims}")
        console.print(f"[{theme.ACCENT}]SELF-CHECK PASSED[/{theme.ACCENT}]")

798 

@app.command()
def status(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show indexed documents, paths, and chunk counts."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if not cfg.json_mode:
        render_status(console)
        return
    json_output(gather_status().model_dump(exclude_none=True))

811 

# Confirmation bypass for destructive commands such as `reset`.
_yes_option = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt.")

813 

814 

@app.command()
def reset(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    yes: bool = _yes_option,
) -> None:
    """Delete all documents and data (full factory reset)."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if not yes:
        # No interactive prompt is possible in JSON mode; require --yes.
        if cfg.json_mode:
            json_output({"error": "Use --yes to confirm reset in JSON mode"})
            raise SystemExit(1)
        console.print(
            f"[{theme.ERROR_BOLD}]This will delete ALL documents and data.[/{theme.ERROR_BOLD}]\n"
            f" Documents: {cfg.documents_dir}\n"
            f" Data: {cfg.data_dir}"
        )
        if not typer.confirm("Are you sure?", default=False):
            console.print("Aborted.")
            raise SystemExit(0)

    reset_outcome = perform_reset()

    if cfg.json_mode:
        json_output(reset_outcome.model_dump())
        return

    console.print(
        f"Reset complete: {reset_outcome.deleted_docs} document(s), "
        f"{reset_outcome.deleted_data} data item(s) deleted."
    )
    if reset_outcome.skipped:
        console.print(
            f"[{theme.WARNING}]{len(reset_outcome.skipped)} item(s) could not be deleted "
            f"(locked or permission denied).[/{theme.WARNING}]"
        )

853 

@app.command()
def init() -> None:
    """Initialize a local .lilbee/ knowledge base in the current directory."""
    root = Path.cwd() / ".lilbee"
    if root.is_dir():
        # Idempotent: an existing layout is reported, never recreated.
        if cfg.json_mode:
            json_output({"command": "init", "path": str(root), "created": False})
        else:
            console.print(f"Already initialized: {root}")
        return

    for subdir_name in ("documents", "data"):
        (root / subdir_name).mkdir(parents=True)
    # Keep derived data out of version control by default.
    (root / ".gitignore").write_text("data/\n")

    if cfg.json_mode:
        json_output({"command": "init", "path": str(root), "created": True})
    else:
        console.print(f"Initialized local knowledge base at {root}")

876 

def _port_file() -> Path:
    """Location of the file recording the server's actual bound port."""
    return cfg.data_dir / "server.port"

880 

async def _run_server(server: uvicorn.Server, config: uvicorn.Config, host: str) -> None:
    """Start uvicorn, write port file, and clean up on shutdown.

    Runs uvicorn's startup/main_loop/shutdown phases manually (instead of
    ``server.serve()``) so the actually-bound port can be captured from the
    listening socket and written to the port file before serving begins.
    Optionally watches a parent process (via LILBEE parent-pid plumbing) and
    requests shutdown when it dies.
    """
    import atexit

    from lilbee.parent_monitor import parse_parent_pid, watch_parent_async

    port_path = _port_file()

    def _cleanup_port_file() -> None:
        port_path.unlink(missing_ok=True)

    # Mirror what uvicorn's Server.serve() does before startup: load the
    # config and attach the lifespan handler.
    if not config.loaded:
        config.load()
    server.lifespan = config.lifespan_class(config)
    await server.startup()

    parent_pid = parse_parent_pid()
    parent_watcher: asyncio.Task[None] | None = None
    if parent_pid is not None:

        def _on_parent_death() -> None:
            # Flip uvicorn's exit flag; main_loop polls it and returns.
            server.should_exit = True

        parent_watcher = asyncio.create_task(watch_parent_async(parent_pid, _on_parent_death))

    try:
        if server.servers:
            # Read the kernel-assigned port (supports --port 0 / random port)
            # and publish it for client discovery before serving requests.
            sock = server.servers[0].sockets[0]
            actual_port = sock.getsockname()[1]
            port_path.parent.mkdir(parents=True, exist_ok=True)
            port_path.write_text(str(actual_port))
            # atexit covers hard exits that skip the finally block.
            atexit.register(_cleanup_port_file)
            console.print(f"Listening on http://{host}:{actual_port}")
            await server.main_loop()
    finally:
        if parent_watcher is not None and not parent_watcher.done():
            parent_watcher.cancel()
        port_path.unlink(missing_ok=True)
        await server.shutdown()

921 

@app.command()
def serve(
    host: str = typer.Option(None, "--host", "-H", help="Bind address (default: 127.0.0.1)"),
    port: int = typer.Option(None, "--port", "-p", help="Port (default: 0/random)"),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Start the HTTP API server."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if host is not None:
        cfg.server_host = host
    if port is not None:
        cfg.server_port = port

    import logging

    import uvicorn

    from lilbee.server import create_app

    # Silence asyncio's noisy shutdown logging.
    logging.getLogger("asyncio").setLevel(logging.ERROR)

    uv_config = uvicorn.Config(create_app(), host=cfg.server_host, port=cfg.server_port)
    uv_server = uvicorn.Server(uv_config)
    asyncio.run(_run_server(uv_server, uv_config, cfg.server_host))

948 

@app.command()
def token(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Print the auth token for a running server."""
    from lilbee.server.auth import server_json_path

    apply_overrides(data_dir=data_dir, use_global=use_global)
    server_json = server_json_path()
    if not server_json.exists():
        # No server.json means no server registered itself.
        if cfg.json_mode:
            json_output({"error": "No running server found"})
        else:
            console.print("No running server found (server.json missing).")
        raise SystemExit(1)
    try:
        tok = json.loads(server_json.read_text()).get("token", "")
    except (json.JSONDecodeError, OSError) as exc:
        if cfg.json_mode:
            json_output({"error": f"Could not read server.json: {exc}"})
        else:
            console.print(
                f"[{theme.ERROR}]Error:[/{theme.ERROR}] Could not read server.json: {exc}"
            )
        raise SystemExit(1) from None
    if cfg.json_mode:
        json_output({"token": tok})
    else:
        console.print(tok)

980 

981 

@app.command()
def topics(
    query: str = typer.Argument(None, help="Optional query to find related concepts."),
    top_k: int = typer.Option(10, "--top-k", "-k", help="Number of results."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show top concept communities or concepts related to a query.

    Exits with status 1 when the optional graph dependency is missing,
    the feature is disabled, or no graph has been built yet.
    """
    apply_overrides(data_dir=data_dir, use_global=use_global)

    from lilbee.concepts import concepts_available

    if not concepts_available():
        # Named `install_hint` (not `msg`) to avoid shadowing the `msg`
        # messages module imported at the top of this file.
        install_hint = "Concept graph requires: pip install 'lilbee[graph]'"
        if cfg.json_mode:
            json_output({"error": install_hint})
        else:
            console.print(f"[{theme.ERROR}]{install_hint}[/{theme.ERROR}]")
        raise SystemExit(1)

    if not cfg.concept_graph:
        if cfg.json_mode:
            json_output({"error": "Concept graph is disabled (LILBEE_CONCEPT_GRAPH=false)"})
        else:
            console.print(
                f"[{theme.ERROR}]Concept graph is disabled.[/{theme.ERROR}] "
                "Enable with LILBEE_CONCEPT_GRAPH=true"
            )
        raise SystemExit(1)

    if not get_services().concepts.get_graph():
        if cfg.json_mode:
            json_output({"error": "Concept graph not available"})
        else:
            console.print(f"[{theme.ERROR}]Concept graph not available.[/{theme.ERROR}]")
        raise SystemExit(1)

    if query:
        _topics_for_query(query)
    else:
        _topics_overview(top_k)

1023 

1024 

def _topics_for_query(query: str) -> None:
    """Show concepts related to a query."""
    graph = get_services().concepts
    direct = graph.extract_concepts(query)
    expanded = graph.expand_query(query)
    # Direct extractions first, then any expansions not already present.
    combined = direct + [concept for concept in expanded if concept not in direct]

    if cfg.json_mode:
        json_output({"command": "topics", "query": query, "concepts": combined})
        return
    if not combined:
        console.print("No concepts found for this query.")
        return
    console.print(f"Concepts related to [{theme.ACCENT}]{query}[/{theme.ACCENT}]:")
    for concept in combined:
        console.print(f" {concept}")

1041 

1042 

def _topics_overview(top_k: int) -> None:
    """Show top concept communities."""
    from dataclasses import asdict

    communities = get_services().concepts.top_communities(k=top_k)
    if cfg.json_mode:
        json_output({"command": "topics", "communities": [asdict(c) for c in communities]})
        return
    if not communities:
        console.print("No concept communities found. Try syncing some documents first.")
        return
    table = Table(title="Concept Communities")
    table.add_column("Cluster", justify="right", style=theme.MUTED)
    table.add_column("Size", justify="right")
    table.add_column("Top Concepts", style=theme.ACCENT)
    for community in communities:
        # Show at most five concepts, with an overflow marker.
        shown = community.concepts[:5]
        overflow = len(community.concepts) - 5
        label = ", ".join(shown)
        if overflow > 0:
            label += f" (+{overflow} more)"
        table.add_row(str(community.cluster_id), str(community.size), label)
    console.print(table)

1064 

1065 

@app.command()
def login() -> None:
    """Log in to HuggingFace for access to gated models (Mistral, Llama, etc.).

    Opens the HuggingFace token page, prompts for a token (hidden input),
    and stores it via huggingface_hub. Exits with code 1 on empty input.
    """
    import webbrowser

    from huggingface_hub import get_token
    from huggingface_hub import login as hf_login

    if get_token():
        typer.echo("Already logged in to HuggingFace.")
        if not typer.confirm("Log in again?", default=False):
            return

    typer.echo("Opening HuggingFace token page in your browser...")
    typer.echo("Create a token with 'Read' access, then paste it below.\n")
    webbrowser.open("https://huggingface.co/settings/tokens")

    # Named `hf_token` (not `token`) to avoid shadowing the sibling `token`
    # command defined in this module; strip once instead of twice.
    hf_token = typer.prompt("Paste your HuggingFace token", hide_input=True).strip()
    if not hf_token:
        typer.echo("No token provided.", err=True)
        raise typer.Exit(1)

    hf_login(token=hf_token, add_to_git_credential=False)
    typer.echo("Logged in! Gated models (Mistral, Llama, etc.) are now accessible.")

1090 

1091 

@app.command(name="mcp")
def mcp_cmd() -> None:
    """Start the MCP server (stdio transport) for agent integration."""
    # Imported lazily so the MCP stack only loads when this command runs.
    from lilbee.mcp import main as mcp_main

    mcp_main()

1098 

1099 

# Sub-app grouping one-time installers for optional runtime components,
# mounted as `lilbee setup <component>`.
setup_app = typer.Typer(help="One-time setup for optional runtime components.")
app.add_typer(setup_app, name="setup")

1102 

1103 

@setup_app.command(name="crawler")
def setup_crawler_cmd() -> None:
    """Install Playwright's Chromium browser, needed for /crawl.

    No-op when Chromium is already present. Emits a simple progress
    readout; use '--json' mode on the top-level 'lilbee' command to get
    a single JSON blob with the final install state instead.
    """
    if chromium_installed():
        if cfg.json_mode:
            typer.echo(json.dumps({"component": "chromium", "already_installed": True}))
        else:
            typer.echo("Chromium already installed.")
        return

    # Last percentage printed; `nonlocal` replaces the one-element-list
    # closure workaround.
    last_pct = -1

    def _on_progress(event_type: object, data: object) -> None:
        # Progress callback: print each new download percentage to stderr
        # (suppressed entirely in JSON mode).
        nonlocal last_pct
        if event_type != EventType.SETUP_PROGRESS or not isinstance(data, SetupProgressEvent):
            return
        total = data.total_bytes or 0
        pct = int(data.downloaded_bytes * 100 / total) if total > 0 else 0
        if pct != last_pct and not cfg.json_mode:
            last_pct = pct
            typer.echo(msg.SETUP_CHROMIUM_CLI_PROGRESS.format(pct=pct), err=True)

    try:
        asyncio.run(bootstrap_chromium(on_progress=_on_progress))
    except CrawlerBrowserMissing as exc:
        if cfg.json_mode:
            typer.echo(json.dumps({"component": "chromium", "error": str(exc)}))
        else:
            typer.secho(f"Install failed: {exc}", fg=typer.colors.RED)
        raise typer.Exit(code=1) from exc

    if cfg.json_mode:
        typer.echo(json.dumps({"component": "chromium", "installed": True}))
    else:
        typer.echo("Chromium installed.")

1143 

1144 

# Sub-app for the wiki layer, mounted as `lilbee wiki <command>`.
wiki_app = typer.Typer(help="Wiki layer commands: generate, lint, citations, status, prune.")
app.add_typer(wiki_app, name="wiki")

1147 

1148 

@wiki_app.command(name="lint")
def wiki_lint(
    wiki_source: str = typer.Argument("", help="Wiki page path (empty = lint all)."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Lint wiki pages for stale citations, missing sources, and unmarked claims."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.lint import lint_all as _lint_all
    from lilbee.wiki.lint import lint_wiki_page

    store = get_services().store
    # One page when a path is given; otherwise the whole wiki.
    issues = lint_wiki_page(wiki_source, store) if wiki_source else _lint_all(store).issues

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_lint",
                "issues": [issue.to_dict() for issue in issues],
                "total": len(issues),
            }
        )
        return

    if not issues:
        console.print("No issues found.")
        return

    table = Table(title="Wiki Lint Issues")
    table.add_column("Page", style=theme.ACCENT)
    table.add_column("Severity")
    table.add_column("Message")
    for issue in issues:
        style = theme.WARNING if issue.severity.value != "error" else theme.ERROR
        table.add_row(
            issue.wiki_source,
            f"[{style}]{issue.severity.value}[/{style}]",
            issue.message,
        )
    console.print(table)

1190 

1191 

@wiki_app.command(name="citations")
def wiki_citations(
    wiki_source: str = typer.Argument(..., help="Wiki page path, e.g. wiki/summaries/doc.md."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show citations for a wiki page."""
    apply_overrides(data_dir=data_dir, use_global=use_global)

    records = get_services().store.get_citations_for_wiki(wiki_source)

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_citations",
                "wiki_source": wiki_source,
                "citations": [dict(record) for record in records],
                "total": len(records),
            }
        )
        return

    if not records:
        console.print(f"No citations found for [{theme.ACCENT}]{wiki_source}[/{theme.ACCENT}]")
        return

    table = Table(title=f"Citations: {wiki_source}")
    table.add_column("Key", style=theme.ACCENT)
    table.add_column("Source")
    table.add_column("Type", style=theme.MUTED)
    table.add_column("Excerpt", max_width=60)
    for record in records:
        excerpt = record["excerpt"]
        if len(excerpt) > 60:
            # 57 chars + "..." keeps the cell within the 60-char column.
            excerpt = excerpt[:57] + "..."
        table.add_row(
            record["citation_key"], record["source_filename"], record["claim_type"], excerpt
        )
    console.print(table)

1227 

1228 

@wiki_app.command(name="status")
def wiki_status(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show wiki layer status: page counts and lint summary."""
    apply_overrides(data_dir=data_dir, use_global=use_global)

    wiki_root = cfg.data_root / cfg.wiki_dir
    if not wiki_root.exists():
        if cfg.json_mode:
            json_output({"wiki_enabled": cfg.wiki, "pages": 0, "issues": 0})
        else:
            console.print("Wiki directory does not exist yet. Run sync with wiki enabled.")
        return

    summary_count = _count_md_files(wiki_root / SUMMARIES_SUBDIR)
    draft_count = _count_md_files(wiki_root / DRAFTS_SUBDIR)

    from lilbee.wiki.lint import lint_all as _lint_all

    report = _lint_all(get_services().store)

    if cfg.json_mode:
        json_output(
            {
                "wiki_enabled": cfg.wiki,
                SUMMARIES_SUBDIR: summary_count,
                DRAFTS_SUBDIR: draft_count,
                "pages": summary_count + draft_count,
                "lint_errors": report.error_count,
                "lint_warnings": report.warning_count,
            }
        )
        return

    state_color, state_label = ("green", "enabled") if cfg.wiki else ("red", "disabled")
    console.print(f"Wiki: [{state_color}]{state_label}[/{state_color}]")
    console.print(f" Summaries: [{theme.LABEL}]{summary_count}[/{theme.LABEL}]")
    console.print(f" Drafts: [{theme.LABEL}]{draft_count}[/{theme.LABEL}]")
    if not (report.error_count or report.warning_count):
        console.print(" Lint: all clean")
    else:
        console.print(
            f" Lint: [{theme.ERROR}]{report.error_count} error(s)[/{theme.ERROR}], "
            f"[{theme.WARNING}]{report.warning_count} warning(s)[/{theme.WARNING}]"
        )

1277 

1278 

@wiki_app.command(name="synthesize")
def wiki_synthesize(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Generate synthesis pages for concept clusters spanning 3+ sources."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if not cfg.wiki:
        _fail_wiki_disabled()
        return
    from lilbee.wiki.gen import generate_synthesis_pages

    services = get_services()
    generated = generate_synthesis_pages(services.provider, services.store, services.clusterer)

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_synthesize",
                "paths": [str(page) for page in generated],
                "count": len(generated),
            }
        )
        return

    if not generated:
        console.print("No synthesis pages generated (need 3+ sources per cluster).")
        return

    console.print(f"Generated [{theme.LABEL}]{len(generated)}[/{theme.LABEL}] synthesis pages:")
    for page in generated:
        console.print(f" {page}")

1311 

1312 

@wiki_app.command(name="prune")
def wiki_prune(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Prune stale and orphaned wiki pages."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.prune import prune_wiki

    report = prune_wiki(get_services().store)

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_prune",
                "records": [record.to_dict() for record in report.records],
                "archived": report.archived_count,
                "flagged": report.flagged_count,
            }
        )
        return

    if not report.records:
        console.print("No pages pruned.")
        return

    table = Table(title="Wiki Prune Results")
    table.add_column("Page", style=theme.ACCENT)
    table.add_column("Action")
    table.add_column("Reason")
    for record in report.records:
        style = theme.ERROR if record.action.value == "archived" else theme.WARNING
        table.add_row(
            record.wiki_source,
            f"[{style}]{record.action.value}[/{style}]",
            record.reason,
        )
    console.print(table)

1348 

1349 

1350def _count_md_files(directory: Path) -> int: 

1351 """Count markdown files in a directory.""" 

1352 if not directory.exists(): 

1353 return 0 

1354 return len(list(directory.rglob("*.md"))) 

1355 

1356 

def _fail_wiki_disabled() -> None:
    """Emit the standard wiki-disabled message in the caller's output mode."""
    if cfg.json_mode:
        json_output({"error": msg.CMD_WIKI_DISABLED})
    else:
        console.print(msg.CMD_WIKI_DISABLED)

1363 

1364 

@wiki_app.command(name="build")
def wiki_build(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help=(
            "Run extraction only; skip every LLM call. Prints the NER entity candidates. "
            "LLM-curated concept pages require a build call and are not shown in dry-run."
        ),
    ),
) -> None:
    """Build the concept and entity wiki across all ingested sources."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    if not cfg.wiki:
        _fail_wiki_disabled()
        return

    if dry_run:
        from lilbee.store import SearchChunk
        from lilbee.wiki.entity_extractor import get_entity_extractor

        svc = get_services()
        # Gather every chunk from every ingested source for extraction.
        all_chunks: list[SearchChunk] = []
        for source in svc.store.get_sources():
            all_chunks.extend(svc.store.get_chunks_by_source(source["filename"]))
        extractor = get_entity_extractor(cfg.wiki_entity_mode, svc.provider, cfg)
        _wiki_build_dry_run_output(extractor.extract(all_chunks))
        return

    from lilbee.wiki import run_full_build

    result = run_full_build(cfg)

    if cfg.json_mode:
        json_output({"command": "wiki_build", **result})
        return

    pages = result["paths"]
    if not pages:
        console.print("No concept or entity pages generated.")
        return

    console.print(
        f"Generated [{theme.LABEL}]{result['count']}[/{theme.LABEL}] "
        f"wiki pages from {result['entities']} extracted records:"
    )
    for page in pages:
        console.print(f" {page}")

1416 

1417 

# Appended to every dry-run rendering: concept pages come from an LLM pass
# that --dry-run deliberately skips, so only NER entities can be listed.
_DRY_RUN_CONCEPT_NOTE = (
    "Note: LLM-curated concepts are not shown in --dry-run. "
    "Run `lilbee wiki build` to see which concepts the LLM proposes."
)

1422 

1423 

def _wiki_build_dry_run_output(entities: list[ExtractedEntity]) -> None:
    """Render the extraction result as JSON or table without calling any LLM.

    Phase D: concepts come from the per-source batched LLM call, so
    listing them would require the call we are trying to avoid. The
    dry-run surface is NER-entity only, with a trailing note so a
    user who expected concepts in the output knows why they are
    missing.
    """
    rows: list[dict[str, Any]] = []
    for entity in entities:
        rows.append(
            {
                "slug": entity.slug,
                "label": entity.label,
                "kind": entity.kind.value,
                "type_hint": entity.type_hint,
                "mentions": len(entity.chunk_refs),
                "sources": sorted({ref.source for ref in entity.chunk_refs}),
            }
        )

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_build",
                "dry_run": True,
                "entities": rows,
                "count": len(rows),
                "note": _DRY_RUN_CONCEPT_NOTE,
            }
        )
        return

    if not rows:
        console.print("No candidate entities extracted. Run sync first.")
        console.print(f"[{theme.MUTED}]{_DRY_RUN_CONCEPT_NOTE}[/{theme.MUTED}]")
        return

    table = Table(title=f"Wiki build dry-run ({len(rows)} NER entity candidates)")
    table.add_column("Slug", style=theme.ACCENT)
    table.add_column("Kind", style=theme.MUTED)
    table.add_column("Type")
    table.add_column("Mentions")
    table.add_column("Sources")
    for row in rows:
        sources: list[str] = row["sources"]
        # At most three sources, with a trailing ellipsis when truncated.
        shown_sources = ", ".join(sources[:3])
        if len(sources) > 3:
            shown_sources += ", ..."
        table.add_row(
            str(row["slug"]),
            str(row["kind"]),
            str(row["type_hint"]),
            str(row["mentions"]),
            shown_sources,
        )
    console.print(table)
    console.print(
        f"Dry run: [{theme.LABEL}]{len(rows)}[/{theme.LABEL}] candidate entities. "
        "No LLM calls were made."
    )
    console.print(f"[{theme.MUTED}]{_DRY_RUN_CONCEPT_NOTE}[/{theme.MUTED}]")

1483 

1484 

@wiki_app.command(name="update")
def wiki_update(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Refresh the concept and entity wiki after an ingest.

    Currently a full rebuild. The incremental touched-slug regeneration
    lands in the ingest-hook task and will re-route this command then.
    """
    # Delegate straight to the full build; dry_run is always off here.
    wiki_build(data_dir, use_global, False)

1496 

1497 

# Sub-app for draft review, mounted as `lilbee wiki drafts <command>`.
drafts_app = typer.Typer(help="Review wiki drafts: list, diff, accept, reject.")
wiki_app.add_typer(drafts_app, name="drafts")

1500 

1501 

@drafts_app.command(name="list")
def wiki_drafts_list(
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """List pending wiki drafts with drift, faithfulness, and pairing info."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.drafts import PENDING_KIND_DRIFT, list_drafts

    drafts = list_drafts(cfg.data_root / cfg.wiki_dir)

    if cfg.json_mode:
        json_output(
            {
                "command": "wiki_drafts_list",
                "drafts": [draft.to_dict() for draft in drafts],
                "total": len(drafts),
            }
        )
        return

    if not drafts:
        console.print("No drafts pending review.")
        return

    table = Table(title="Wiki Drafts")
    table.add_column("Slug", style=theme.ACCENT)
    table.add_column("Kind", style=theme.MUTED)
    table.add_column("Drift")
    table.add_column("Faithfulness")
    table.add_column("Published?", style=theme.MUTED)
    for draft in drafts:
        table.add_row(
            draft.slug,
            draft.pending_kind or PENDING_KIND_DRIFT,
            "-" if draft.drift_ratio is None else f"{draft.drift_ratio:.0%}",
            "-" if draft.faithfulness_score is None else f"{draft.faithfulness_score:.2f}",
            "yes" if draft.published_exists else "no",
        )
    console.print(table)

1541 

1542 

@drafts_app.command(name="diff")
def wiki_drafts_diff(
    slug: str = typer.Argument(..., help="Draft slug (e.g. chevrolet)."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Show a unified diff of the draft against its published counterpart."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.drafts import diff_draft

    try:
        diff = diff_draft(slug, cfg.data_root / cfg.wiki_dir)
    except FileNotFoundError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]{exc}[/{theme.ERROR}]")
        raise typer.Exit(1) from None

    if cfg.json_mode:
        json_output({"command": "wiki_drafts_diff", "slug": slug, "diff": diff})
    else:
        console.print(diff or "(no differences)")

1567 

1568 

@drafts_app.command(name="accept")
def wiki_drafts_accept(
    slug: str = typer.Argument(..., help="Draft slug to accept."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Overwrite the published page with the draft and re-index its chunks."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.drafts import accept_draft

    try:
        result = accept_draft(slug, cfg.data_root / cfg.wiki_dir, get_services().store)
    except FileNotFoundError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]{exc}[/{theme.ERROR}]")
        raise typer.Exit(1) from None

    if cfg.json_mode:
        json_output({"command": "wiki_drafts_accept", **result.to_dict()})
        return
    console.print(
        f"Accepted [{theme.ACCENT}]{slug}[/{theme.ACCENT}] -> "
        f"{result.moved_to} ({result.reindexed_chunks} chunks re-indexed)"
    )

1596 

1597 

@drafts_app.command(name="reject")
def wiki_drafts_reject(
    slug: str = typer.Argument(..., help="Draft slug to reject."),
    data_dir: Path | None = data_dir_option,
    use_global: bool = global_option,
) -> None:
    """Delete the draft file. Does not touch the published page or index."""
    apply_overrides(data_dir=data_dir, use_global=use_global)
    from lilbee.wiki.drafts import reject_draft

    try:
        reject_draft(slug, cfg.data_root / cfg.wiki_dir)
    except FileNotFoundError as exc:
        if cfg.json_mode:
            json_output({"error": str(exc)})
        else:
            console.print(f"[{theme.ERROR}]{exc}[/{theme.ERROR}]")
        raise typer.Exit(1) from None

    if cfg.json_mode:
        json_output({"command": "wiki_drafts_reject", "slug": slug})
    else:
        console.print(f"Rejected [{theme.ACCENT}]{slug}[/{theme.ACCENT}]")