Coverage for src / lilbee / config.py: 100%
428 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Application configuration for lilbee.
3All settings can be overridden via environment variables prefixed with LILBEE_.
4Uses pydantic-settings for automatic env var loading with TOML config file support.
5"""
7import logging
8import os
9import sys
10from enum import StrEnum
11from pathlib import Path
12from typing import Any, ClassVar
14from pydantic import Field, ValidationInfo, field_validator, model_validator
15from pydantic_settings import BaseSettings, SettingsConfigDict
17from lilbee.providers.model_ref import PROVIDER_PREFIXES
class ClustererBackend(StrEnum):
    """Known wiki clusterer backends.

    Selected via ``Config.wiki_clusterer``; CONCEPTS requires the
    [graph] extra and falls back to EMBEDDING when unavailable.
    """

    EMBEDDING = "embedding"
    CONCEPTS = "concepts"
class KvCacheType(StrEnum):
    """KV cache element type. q8_0 / q4_0 require flash attention."""

    F16 = "f16"    # 2 bytes/element (default on Config.kv_cache_type)
    F32 = "f32"    # 4 bytes/element, full precision
    Q8_0 = "q8_0"  # budgeted at 1 byte/element (see KV_CACHE_TYPE_BYTES)
    Q4_0 = "q4_0"  # budgeted at 1 byte/element (see KV_CACHE_TYPE_BYTES)
# Bytes per KV element for memory budgeting. q* shapes are 1 byte of data
# plus shared scales, close enough for budgeting.
KV_CACHE_TYPE_BYTES: dict[KvCacheType, int] = {
    KvCacheType.F16: 2,
    KvCacheType.F32: 4,
    KvCacheType.Q8_0: 1,
    # 4-bit data is rounded up to 1 byte so the budget stays conservative.
    KvCacheType.Q4_0: 1,
}
class WikiEntityMode(StrEnum):
    """Strategy used to extract entities for the wiki.

    Phase D: the extractor no longer emits concepts — concept pages
    are proposed by the LLM inside the per-source batched call in
    ``wiki.gen``. The enum values reflect the extractor's current
    responsibility (typed NER entities only).
    """

    # Default: typed spaCy NER entities only (see Config.wiki_entity_mode).
    NER_ENTITIES = "ner_entities"
    # Layers an LLM-proposed domain schema on top of NER output.
    NER_CONCEPTS_PLUS_LLM_TYPES = "ner_concepts_plus_llm_types"
    # LLM tags every chunk — most expensive mode.
    LLM_TAGGED = "llm_tagged"
def ConfigField(
    *args: Any,
    writable: bool = False,
    reindex: bool = False,
    write_only: bool = False,
    public: bool = True,
    **kwargs: Any,
) -> Any:
    """Wrap pydantic ``Field`` and attach metadata via ``json_schema_extra``.

    Only non-default flags are recorded so plain fields carry no extra:
    ``writable`` / ``reindex`` / ``write_only`` are stored when True,
    and ``public`` is stored only when False.
    """
    metadata: dict[str, bool] = {
        flag: True
        for flag, enabled in (
            ("writable", writable),
            ("reindex", reindex),
            ("write_only", write_only),
        )
        if enabled
    }
    if not public:
        metadata["public"] = False
    if metadata:
        kwargs["json_schema_extra"] = metadata
    return Field(*args, **kwargs)
83log = logging.getLogger(__name__)
85# Test-only bypass. Both the env var and pytest must be present so a
86# leaked env var cannot disable validation in production.
87_SKIP_MODEL_TASK_VALIDATION_ENV = "LILBEE_SKIP_MODEL_TASK_VALIDATION"
90def _model_task_validation_bypassed() -> bool:
91 if not os.environ.get(_SKIP_MODEL_TASK_VALIDATION_ENV):
92 return False
93 return sys.modules.get("pytest") is not None
# Maps each Config model field to the catalog task string its assignment
# must match (consumed by _enforce_role_match).
_MODEL_FIELD_TO_TASK: dict[str, str] = {
    "chat_model": "chat",
    "embedding_model": "embedding",
    "vision_model": "vision",
    "reranker_model": "rerank",
}
def _find_model_catalog_entry(ref: str) -> Any:
    """Look up *ref* in the featured catalog.

    NOTE(review): callers treat a missing entry as ``None`` — confirm
    against ``find_catalog_entry``'s contract.
    """
    # Deferred import — circular import: catalog imports cfg.
    from lilbee.catalog import find_catalog_entry

    return find_catalog_entry(ref)
def _enforce_role_match(ref: str, entry: Any, field_name: str) -> None:
    """Raise ValueError when *entry*'s task differs from *field_name*'s role.

    No-op when the catalog entry's task matches the task mapped from
    *field_name* via _MODEL_FIELD_TO_TASK.
    """
    # Deferred import — presumably avoids an import cycle, matching the
    # pattern in _find_model_catalog_entry; confirm if hoisting.
    from lilbee.models import ModelTask

    want = ModelTask(_MODEL_FIELD_TO_TASK[field_name])
    if entry.task == want:
        return
    # Error text is produced by the server handlers' formatter so CLI and
    # HTTP surfaces report the mismatch identically.
    from lilbee.server.handlers import format_task_mismatch

    raise ValueError(format_task_mismatch(ref, ModelTask(entry.task), want))
def _skips_catalog_check(ref: str, *, allow_bypass: bool) -> bool:
    """True when *ref* should bypass the featured-catalog assignment check."""
    # Empty / whitespace-only refs have nothing to validate.
    if not ref or not ref.strip():
        return True
    # Test-only escape hatch (env var + pytest both required).
    if allow_bypass and _model_task_validation_bypassed():
        return True
    # Provider-prefixed refs skip the catalog; routing enforces taxonomy.
    prefix = ref.partition("/")[0]
    return prefix in PROVIDER_PREFIXES
def validate_model_task_assignment(field_name: str, ref: str, *, allow_bypass: bool = True) -> str:
    """Check *ref* is a featured-catalog entry whose task matches *field_name*.

    Provider-prefixed refs (``ollama/``, ``openai/`` ...) skip the catalog
    check; routing enforces task taxonomy for them. ``allow_bypass=True``
    honors ``LILBEE_SKIP_MODEL_TASK_VALIDATION`` for tests; explicit user
    actions pass ``allow_bypass=False`` to force the check.
    """
    if _skips_catalog_check(ref, allow_bypass=allow_bypass):
        return ref

    entry = _find_model_catalog_entry(ref)
    if entry is None:
        raise ValueError(
            f"Model '{ref}' is not in the featured catalog. "
            "Pick a featured model for this role, or install one via "
            "POST /api/models/pull with a known catalog ref."
        )
    _enforce_role_match(ref, entry, field_name)

    # Keep a full ``<repo>/<file>.gguf`` so resolve_model_path lands on
    # the exact installed quant; fall back to the catalog ref otherwise.
    names_exact_quant = ref.endswith(".gguf") and ref.count("/") >= 2
    canonical: str = entry.ref
    return ref if names_exact_quant else canonical
157_BOOL_TRUE = frozenset({"true", "1", "yes"})
158_BOOL_FALSE = frozenset({"false", "0", "no"})
161def _parse_bool(raw: str) -> bool:
162 """Parse true/1/yes or false/0/no; raises ValueError on anything else."""
163 normalized = raw.strip().lower()
164 if normalized in _BOOL_TRUE:
165 return True
166 if normalized in _BOOL_FALSE:
167 return False
168 raise ValueError(f"Invalid boolean: {raw!r}")
# Default for ``Config.ignore_dirs`` — dependency trees and build /
# coverage artifacts that should never be indexed.
DEFAULT_IGNORE_DIRS = frozenset(
    {
        "node_modules",
        "__pycache__",
        "venv",
        "build",
        "dist",
        "target",
        "vendor",
        "_build",
        "coverage",
        "htmlcov",
    }
)

# spaCy NER labels that map onto something wiki-shaped. Excludes
# QUANTITY / ORDINAL / CARDINAL / DATE / TIME / MONEY / PERCENT /
# LANGUAGE / LAW because pages for "42" or "2021" are never useful.
# FAC (buildings / airports) and NORP (nationalities / political /
# religious groups) are included because corpora routinely surface
# them as wiki-worthy topics.
DEFAULT_ALLOWED_NER_LABELS = frozenset(
    {"PERSON", "ORG", "GPE", "LOC", "EVENT", "WORK_OF_ART", "PRODUCT", "FAC", "NORP"}
)
# Timeout for backend catalog / management HTTP calls.
DEFAULT_HTTP_TIMEOUT = 30.0

# Safe default + cap for chat-mode n_ctx; full 128K+ training contexts OOM laptops.
DEFAULT_NUM_CTX = 8192

# Index table names. Underscore-prefixed names appear to be internal
# bookkeeping tables (e.g. _citations is described as auto-generated in
# the wiki prompts below).
CHUNKS_TABLE = "chunks"
SOURCES_TABLE = "_sources"
CITATIONS_TABLE = "_citations"
META_TABLE = "_meta"
CONCEPT_NODES_TABLE = "concept_nodes"
CONCEPT_EDGES_TABLE = "concept_edges"
CHUNK_CONCEPTS_TABLE = "chunk_concepts"
# Default URL-exclusion regexes for recursive crawls. Grouped by source
# CMS / category. User overrides come from LILBEE_CRAWL_EXCLUDE_PATTERNS
# (newline-separated) or config.toml.

# WordPress scaffolding: admin UIs, APIs, RPC, numeric permalinks, Elementor.
_WP_EXCLUDE: tuple[str, ...] = (
    r"/wp-admin/",
    r"/wp-login(\.php)?",
    r"/wp-json/",
    r"/xmlrpc\.php",
    r"/wp-cron\.php",
    r"/wp-includes/",
    r"/wp-content/uploads/",
    r"\?p=\d+",
    r"\?page_id=\d+",
    r"\?cat=\d+",
    r"/elementor-\d+",
    r"\?elementor_library",
)

# Pagination and archive permalinks (WP + other CMSes share this shape).
_ARCHIVE_EXCLUDE: tuple[str, ...] = (
    r"/page/\d+/?$",
    r"\?paged?=\d+",
    # Date archives: /2024, /2024/05, /2024/05/01.
    r"/20\d{2}(/\d{2}(/\d{2})?)?/?$",
    r"/tag/",
    r"/category/",
    r"/author/",
    r"/archives?/?$",
    r"/comment-page-\d+",
)

# Syndication feeds (content-duplicated in HTML pages).
_FEED_EXCLUDE: tuple[str, ...] = (
    r"/feed/?$",
    r"/feed/atom/?$",
    r"/feed/rdf/?$",
    r"/comments/feed/?$",
    r"/rss/?$",
)

# Duplicate views of the same canonical page (AMP, print, preview).
_DUPLICATE_VIEW_EXCLUDE: tuple[str, ...] = (
    r"/amp/?$",
    r"\?amp=",
    r"\?print=",
    r"/print/?$",
    r"\?preview=",
)

# WP attachment URLs (point at media, not content pages).
_ATTACHMENT_EXCLUDE: tuple[str, ...] = (
    r"/attachment/",
    r"\?attachment_id=",
)

# Auth and account flows (generic across CMSes and e-commerce platforms).
_AUTH_EXCLUDE: tuple[str, ...] = (
    r"/login",
    r"/logout",
    r"/register",
    r"/signup",
    r"/signin",
    r"/account",
    r"/my-account/",
    r"/profile",
    r"/password-reset",
    r"/forgot-password",
)

# E-commerce transactional flows (cart / checkout / compare / etc.).
_ECOMMERCE_EXCLUDE: tuple[str, ...] = (
    r"/cart",
    r"/checkout",
    r"/wishlist",
    r"/orders?",
    r"/compare",
    r"/products\.json",
    r"/collections/.+/products/.+\?page=",
)

# Marketing / tracking query parameters (utm_*, fbclid, gclid, etc.).
# Single alternation so one compiled regex covers every tracker.
_TRACKING_EXCLUDE: tuple[str, ...] = (
    (
        r"[?&]("
        r"utm_[a-z_]+"
        r"|fbclid|gclid|msclkid|yclid"
        r"|mc_cid|mc_eid"
        r"|_hsenc|_hsmi|hsCtaTracking"
        r"|mkt_tok|mkt_[a-z_]+"
        r"|trk|trkInfo"
        r"|dm_i"
        r"|vero_id|vero_conv"
        r"|oly_anon_id|oly_enc_id"
        r"|igshid"
        r"|pk_campaign|pk_source|pk_medium|pk_[a-z_]+"
        r"|_ga"
        r"|ref|referrer"
        r"|affiliate|aff_id|aff_ref|aff|partner"
        r"|srsltid"
        r"|share|replytocom"
        r")="
    ),
)

# Site-meta URLs and non-HTML resources; skipped before fetch.
_META_EXCLUDE: tuple[str, ...] = (
    r"/sitemap[^/]*\.xml",
    r"/robots\.txt",
    r"/humans\.txt",
    r"/favicon\.ico",
    r"/\.well-known/",
    r"\.(jpe?g|png|gif|webp|avif|svg|ico|pdf|docx?|xlsx?|pptx?|zip|tar|gz|mp3|mp4|webm|ogg|ttf|woff2?|css|js|map|json|xml)(\?.*)?$",
)

# Mediawiki/Wikipedia navlinks that dominate BFS before the article body.
_MEDIAWIKI_EXCLUDE: tuple[str, ...] = (
    r"/wiki/Main_Page$",
    r"/wiki/Wikipedia:",
    r"/wiki/Portal:",
    r"/wiki/Help:",
    r"/wiki/Special:",
    r"/wiki/Category:",
    r"/wiki/Template:",
    r"/wiki/Template_talk:",
    r"/wiki/Talk:",
    r"/wiki/File:",
    r"/wiki/File_talk:",
    r"/wiki/User:",
    r"/wiki/User_talk:",
    r"/w/index\.php",
)

# Flattened union of every category tuple above; the effective default
# for Config.crawl_exclude_patterns.
DEFAULT_CRAWL_EXCLUDE_PATTERNS: tuple[str, ...] = (
    *_WP_EXCLUDE,
    *_ARCHIVE_EXCLUDE,
    *_FEED_EXCLUDE,
    *_DUPLICATE_VIEW_EXCLUDE,
    *_ATTACHMENT_EXCLUDE,
    *_AUTH_EXCLUDE,
    *_ECOMMERCE_EXCLUDE,
    *_TRACKING_EXCLUDE,
    *_META_EXCLUDE,
    *_MEDIAWIKI_EXCLUDE,
)
# Default for ``Config.system_prompt`` (writable; user-overridable).
_DEFAULT_SYSTEM_PROMPT = (
    "You are a precise, direct assistant grounded in the provided context. "
    "Answer using only the context — if it doesn't contain enough information, "
    "say so rather than guessing. Be specific: quote relevant passages and "
    "reference context by number (e.g. [1], [2]) inline. Prefer exact values "
    "over approximations. For code, prefer working examples over abstract "
    "explanations. Keep responses concise unless asked to elaborate."
)

# CORS allow-origin regex: Obsidian (desktop + iOS) and localhost loopback.
# Mutating endpoints still require auth regardless of origin.
_DEFAULT_CORS_ORIGIN_REGEX = (
    r"^(app://obsidian\.md"
    r"|capacitor://localhost"
    r"|https?://localhost(:\d+)?"
    r"|https?://127\.0\.0\.1(:\d+)?"
    r"|https?://\[::1\](:\d+)?)$"
)
class Config(BaseSettings):
    """Runtime configuration — one singleton instance, mutated by CLI overrides."""

    model_config = SettingsConfigDict(
        env_prefix="LILBEE_",       # every field reads LILBEE_<NAME> from the env
        validate_assignment=True,   # re-run validators when attributes are mutated
        arbitrary_types_allowed=True,
        extra="ignore",             # unknown env vars / config keys are dropped silently
    )
    # Paths — resolved from env/defaults in model_validator(mode='before').
    # The Path() placeholders are replaced before use.
    data_root: Path = Field(default=Path())
    # Writable so plugin-managed servers can pivot storage to a vault path on
    # first boot; rebuild the index after migrating.
    documents_dir: Path = ConfigField(default=Path(), writable=True)
    data_dir: Path = Field(default=Path())
    lancedb_dir: Path = Field(default=Path())
    models_dir: Path = Field(default=Path())
    # Obsidian vault root; when set, search results carry a vault-relative
    # ``vault_path`` for native-UI deep-links.
    vault_base: Path | None = ConfigField(default=None, writable=True)

    # Model refs are ``<repo>/<file>.gguf`` catalog paths or provider-prefixed
    # refs; validated by validate_model_task_assignment.
    chat_model: str = Field(default="Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q4_K_M.gguf", min_length=1)
    embedding_model: str = Field(
        default="nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q4_K_M.gguf",
        min_length=1,
    )
    # Vision OCR model for scanned PDFs and image-only pages. Empty = disabled;
    # there is no cross-role fallback onto the chat model even if multimodal.
    vision_model: str = ConfigField(default="", public=True)
    # Embedding dimension; must match the embedding model's output width.
    embedding_dim: int = Field(default=768, ge=1)
    chunk_size: int = ConfigField(default=512, ge=64, writable=True, reindex=True)
    chunk_overlap: int = ConfigField(default=100, ge=0, writable=True, reindex=True)
    # Hard character cap on text sent per embedding call.
    max_embed_chars: int = Field(default=2000, ge=1)
    top_k: int = ConfigField(default=10, ge=1, writable=True)
    # Cosine-distance ceiling for accepted search hits.
    max_distance: float = ConfigField(default=0.9, ge=0.0, writable=True)
    # Minimum RRF relevance score for hybrid search results (0.0 = no filtering).
    min_relevance_score: float = ConfigField(default=0.0, ge=0.0, writable=True)
    # Enables the adaptive-widening retry (see adaptive_threshold_step below).
    adaptive_threshold: bool = Field(default=False)
    system_prompt: str = ConfigField(default=_DEFAULT_SYSTEM_PROMPT, min_length=1, writable=True)
    ignore_dirs: frozenset[str] = Field(default=DEFAULT_IGNORE_DIRS)
    # OCR for scanned PDFs via vision-capable chat model.
    # None = auto-detect (use OCR if chat model is vision-capable).
    # True = force OCR regardless of detection.
    # False = disable OCR entirely.
    enable_ocr: bool | None = ConfigField(default=None, writable=True)
    # Per-page timeout in seconds for vision OCR (0 = no limit).
    ocr_timeout: float = ConfigField(default=120.0, ge=0.0, writable=True)
    # Max concurrent vision-OCR requests per PDF. Default 1 (serial) — raise
    # only when the vision model is network-hosted with meaningful latency
    # (remote API, separate Ollama host). Local GPU models contend on a
    # single device and get slower with concurrency > 1.
    vision_concurrency: int = ConfigField(default=1, ge=1, writable=True)

    # Tesseract fallback wall-clock timeout per file, seconds. 0 = no cap.
    tesseract_timeout: float = ConfigField(default=60.0, ge=0.0, writable=True)
    semantic_chunking: bool = ConfigField(default=False, writable=True)
    # Similarity threshold for the semantic-chunking topic splitter.
    topic_threshold: float = ConfigField(default=0.75, ge=0.0, le=1.0, writable=True)
    server_host: str = "127.0.0.1"
    # 0 presumably means "pick an ephemeral port" — confirm at bind site.
    server_port: int = Field(default=0, ge=0, le=65535)
    cors_origins: list[str] = Field(default_factory=list)
    cors_origin_regex: str = Field(default=_DEFAULT_CORS_ORIGIN_REGEX)
    # Seconds between SSE heartbeat events when the producer queue is idle.
    # Must stay well below the plugin's STREAM_IDLE_TIMEOUT_MS (120s) so a
    # single long-running vision OCR page can't starve the client into aborting.
    sse_heartbeat_interval: float = ConfigField(default=30.0, ge=0.0, writable=True)
    json_mode: bool = False
    # Sampling knobs. None = provider default (empty env strings coerce to
    # None via _empty_string_to_none).
    temperature: float | None = ConfigField(default=None, ge=0.0, writable=True)
    top_p: float | None = ConfigField(default=None, ge=0.0, le=1.0, writable=True)
    top_k_sampling: int | None = ConfigField(default=None, ge=1, writable=True)
    # 1.1 is llama.cpp's default. Leaving this at None caused n-gram loops
    # ("tire tire tire...") on some open-weights models.
    repeat_penalty: float | None = ConfigField(default=1.1, ge=0.0, writable=True)
    num_ctx: int | None = ConfigField(default=None, ge=1, writable=True)
    max_tokens: int | None = ConfigField(default=4096, ge=1, writable=True)
    seed: int | None = ConfigField(default=None, writable=True)
    llm_provider: str = ConfigField(default="auto", writable=True)
    remote_base_url: str = ConfigField(default="http://localhost:11434", writable=True)
    # API keys are write-only: never echoed back through public config reads.
    llm_api_key: str = ConfigField(default="", writable=True, write_only=True)
    openai_api_key: str = ConfigField(default="", writable=True, write_only=True)
    anthropic_api_key: str = ConfigField(default="", writable=True, write_only=True)
    gemini_api_key: str = ConfigField(default="", writable=True, write_only=True)
    # Retrieval quality knobs.

    # Max chunks per source in top-k; prevents one large file monopolizing results.
    diversity_max_per_source: int = ConfigField(default=3, ge=1, writable=True)

    # MMR relevance/diversity tradeoff; 0 = max diversity, 1 = pure relevance
    # (Carbonell & Goldstein 1998).
    mmr_lambda: float = ConfigField(default=0.5, ge=0.0, le=1.0, writable=True)

    # Extra candidates retrieved for MMR reranking (multiplies top_k).
    candidate_multiplier: int = ConfigField(default=3, ge=1, writable=True)

    # LLM-generated alternative queries for expansion. 0 disables.
    query_expansion_count: int = ConfigField(default=3, ge=0, writable=True)

    # Skip LLM expansion when tokenized query length ≤ this. The LLM round-trip
    # dominates latency on small local models; short queries already have strong
    # BM25/vector signal. Concept-graph expansion still runs. 0 disables the skip.
    expansion_short_query_tokens: int = ConfigField(default=2, ge=0, writable=True)

    # Cosine-distance step when adaptive-widening retry kicks in.
    adaptive_threshold_step: float = ConfigField(default=0.2, gt=0.0, writable=True)

    # Reject expansion variants below expansion_similarity_threshold.
    expansion_guardrails: bool = ConfigField(default=True, writable=True)

    # Min cosine similarity between question and variant embeddings.
    expansion_similarity_threshold: float = ConfigField(default=0.5, ge=0.0, le=1.0, writable=True)

    # Sigmoid-normalized BM25 score above which query expansion is skipped.
    expansion_skip_threshold: float = Field(default=0.8, ge=0.0, le=1.0)

    # Min BM25 top-1 vs top-2 gap to skip expansion.
    expansion_skip_gap: float = Field(default=0.15, ge=0.0, le=1.0)

    # Chunks included in LLM context after adaptive selection.
    max_context_sources: int = ConfigField(default=5, ge=1, writable=True)

    # HyDE (Gao et al. 2022): hypothetical-answer embedding search. +~500ms.
    hyde: bool = ConfigField(default=False, writable=True)

    # HyDE result weight relative to real-doc search (0.0-1.0).
    hyde_weight: float = ConfigField(default=0.7, ge=0.0, le=1.0, writable=True)

    # HyDE prompt template. Must contain {question} placeholder.
    hyde_prompt: str = (
        "Write a 50-100 word passage that directly answers this question as if "
        "it were an excerpt from a real document. Do not include any preamble, "
        "just write the passage.\n\nQuestion: {question}"
    )

    # Reranker model ref. Empty disables reranking. Native GGUFs use
    # llama-cpp rank pooling; hosted refs (cohere/voyage/jina/together/hf-tei)
    # need the backend extra.
    reranker_model: str = ConfigField(default="", public=True)

    # Candidate count sent to the reranker.
    rerank_candidates: int = ConfigField(default=20, ge=1, writable=True, public=True)

    # Date-range filter; only fires when a temporal keyword is detected.
    temporal_filtering: bool = ConfigField(default=True, writable=True)

    # If True, emit <think>…</think> content as separate SSE reasoning events;
    # if False, strip it silently.
    show_reasoning: bool = ConfigField(default=False, writable=True)
    # Web crawling.

    # Optional global ceilings. None = no ceiling.
    crawl_max_depth: int | None = ConfigField(default=None, ge=0, writable=True)
    crawl_max_pages: int | None = ConfigField(default=None, ge=1, writable=True)

    # Per-URL fetch timeout, seconds.
    crawl_timeout: int = ConfigField(default=30, ge=1, writable=True)

    # 0 = unlimited, default = CPU count.
    # NOTE(review): the field default is 0 ("unlimited") while the comment
    # says "default = CPU count" — presumably the CPU-count default is
    # applied downstream; confirm at the consumer.
    crawl_max_concurrent: int = Field(default=0, ge=0)

    # Seconds between periodic syncs during crawl. 0 = sync only at end.
    crawl_sync_interval: int = ConfigField(default=30, ge=0, writable=True)

    # Per-request delay + jitter (defaults chosen to be gentler than crawl4ai's).
    crawl_mean_delay: float = ConfigField(default=0.5, ge=0.0, writable=True)
    crawl_max_delay_range: float = ConfigField(default=0.5, ge=0.0, writable=True)

    # In-flight requests per crawl.
    crawl_concurrent_requests: int = ConfigField(default=3, ge=1, writable=True)

    # Per-domain rate-limiter that backs off on HTTP 429/503 and retries.
    crawl_retry_on_rate_limit: bool = ConfigField(default=True, writable=True)
    crawl_retry_base_delay_min: float = ConfigField(default=1.0, ge=0.0, writable=True)
    crawl_retry_base_delay_max: float = ConfigField(default=3.0, ge=0.0, writable=True)
    crawl_retry_max_backoff: float = ConfigField(default=30.0, ge=0.0, writable=True)
    crawl_retry_max_attempts: int = ConfigField(default=3, ge=0, writable=True)

    # Regex patterns dropped at link-discovery time. Defaults block CMS
    # scaffolding (WordPress admin, archives, tracking params, etc.).
    crawl_exclude_patterns: list[str] = ConfigField(
        default_factory=lambda: list(DEFAULT_CRAWL_EXCLUDE_PATTERNS),
        writable=True,
    )
    # Fraction of GPU/unified memory reserved for loaded models.
    gpu_memory_fraction: float = ConfigField(default=0.75, ge=0.1, le=1.0, writable=True)

    # Seconds a model stays loaded after last use. 0 = unload immediately.
    model_keep_alive: int = ConfigField(default=300, ge=0, writable=True)

    # Run embedding and vision inference in a subprocess (llama-cpp only).
    subprocess_embed: bool = ConfigField(default=False, writable=True)

    # Upper bound for the dynamic n_ctx picker. The picker chooses the
    # largest 256-multiple ctx that fits in available memory and the
    # model's training window; this caps it at a sane ceiling.
    num_ctx_max: int = ConfigField(default=16384, ge=512, writable=True)

    # Flash attention. None (default) = on with TypeError fallback for
    # older llama-cpp-python builds, True = force on, False = off.
    # Resolves the 'padding V cache to 1024' warning on models with
    # uneven per-layer V dims (e.g. Gemma3) and saves ~25% KV memory.
    flash_attention: bool | None = ConfigField(default=None, writable=True)

    # KV cache element type. q8_0 / q4_0 halve or quarter cache memory
    # but require flash attention to be enabled.
    kv_cache_type: KvCacheType = ConfigField(default=KvCacheType.F16, writable=True)

    # Number of model layers to offload to GPU. None (default) = all
    # layers, 0 = CPU only, positive int = partial offload. Useful when a
    # discrete GPU has less VRAM than the model needs.
    n_gpu_layers: int | None = ConfigField(default=None, writable=True)

    # True = Markdown widget for chat; False = plain Static (faster).
    markdown_rendering: bool = True

    # TUI theme name; persists the last Ctrl+T pick across sessions.
    theme: str = ConfigField(default="gruvbox", writable=True)

    # Per-model generation defaults set via apply_model_defaults().
    _model_defaults: Any = None
    # Wiki layer. LLM-maintained synthesis pages with citation provenance.
    # Off by default; flip to True (or set LILBEE_WIKI=1) to enable. When off,
    # the Wiki view tab and the chat ModelBar's scope picker are both hidden.
    wiki: bool = ConfigField(default=False, writable=True)
    # Subdirectory name for generated wiki pages.
    wiki_dir: str = "wiki"
    wiki_prune_raw: bool = ConfigField(default=False, writable=True)

    # Minimum cosine similarity between a page body and the mean of its
    # source chunk vectors before a page is published (below → drafts).
    # Replaces the old LLM-based faithfulness score: mean-of-chunks is a
    # deterministic, zero-LLM-call signal that routes topic-drifted
    # pages to drafts without the 0.0 to 1.0 ambiguity of a model-emitted
    # number. Tuning knob: swap to per-chunk max or top-K-mean if the
    # default 0.5 produces false drafts.
    wiki_embedding_faithfulness_threshold: float = ConfigField(
        default=0.5, ge=0.0, le=1.0, writable=True
    )

    # Per-call output token cap for wiki generation. Without this a
    # reasoning model (Qwen3, DeepSeek-R1) can burn the full context
    # window emitting <think> tokens before the actual answer, taking
    # minutes per page. Default leaves headroom for a typical reasoning
    # budget plus a real response (~1000 output + ~1000 slack).
    wiki_summary_max_tokens: int = ConfigField(default=2048, ge=256, writable=True)

    # Wiki generation is a structured-output task: the model must emit the
    # block separators, the citation footnotes, and verbatim quotes. The
    # usual chat default (~0.8) is too creative for that. Lowering the
    # sampling temperature makes the model stick to the template and quote
    # more faithfully. 0.1 leaves just enough slack to avoid hard loops.
    wiki_temperature: float = ConfigField(default=0.1, ge=0.0, le=2.0, writable=True)

    # Fraction of citations that must be stale before a wiki page is flagged.
    wiki_stale_citation_threshold: float = Field(default=0.5, ge=0.0, le=1.0)

    # Fraction of content changed that triggers human-review drift guard.
    wiki_drift_threshold: float = Field(default=0.3, ge=0.0, le=1.0)
    # LLM prompt templates for wiki page generation. Writable so advanced
    # users can override them from /settings, config.toml, or
    # ``LILBEE_WIKI_*_PROMPT`` env vars. Templates must keep the expected
    # ``{placeholders}``. If you remove one the generator will crash on
    # first use. The defaults below are the only reason the pipeline
    # works out of the box.
    # Placeholders: {source_name}, {chunks_text}.
    wiki_summary_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are a knowledge compiler. Given the source chunks below from a single "
            "document, write a concise wiki summary page in markdown.\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For interpretations or connections not directly stated in the source, "
            "mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. End with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {source_name}, excerpt: "exact quoted text"\n'
            '[^src2]: {source_name}, excerpt: "exact quoted text"\n\n'
            "Source document: {source_name}\n\n"
            "Chunks:\n{chunks_text}\n\n"
            "Write the wiki summary page now. Start with a heading."
        ),
    )
    # Multi-document synthesis template. Placeholders: {topic},
    # {source_list}, {chunks_text}. The doubled ``{{source_name}}`` is a
    # literal-brace escape so the example lines survive .format().
    wiki_synthesis_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are a knowledge compiler. Given source chunks from MULTIPLE documents "
            "about related concepts, write a synthesis wiki page in markdown that connects "
            "ideas across sources.\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For connections, interpretations, or patterns you identify across sources, "
            "mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. Reference each source by its filename when drawing connections.\n"
            "6. End with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {{source_name}}, excerpt: "exact quoted text"\n'
            '[^src2]: {{source_name}}, excerpt: "exact quoted text"\n\n'
            "Topic: {topic}\n\n"
            "Sources:\n{source_list}\n\n"
            "Chunks:\n{chunks_text}\n\n"
            "Write the synthesis page now. Start with a heading."
        ),
    )
    # Wiki synthesis clusterer backend. CONCEPTS requires the [graph] extra
    # and falls back to EMBEDDING when unavailable.
    wiki_clusterer: ClustererBackend = ConfigField(
        default=ClustererBackend.EMBEDDING, writable=True
    )

    # Neighborhood size for the mutual-kNN graph. 0 = auto-scale from corpus size.
    wiki_clusterer_k: int = ConfigField(default=0, ge=0, writable=True)

    # LazyGraphRAG-style concept graph. Requires the [graph] extra.
    concept_graph: bool = ConfigField(default=True, writable=True)

    # Weight of concept overlap boost relative to vector similarity.
    concept_boost_weight: float = ConfigField(default=0.3, ge=0.0, le=1.0, writable=True)

    # Floor on post-boost distance to stop weak boosts from promoting marginal hits.
    concept_boost_floor: float = ConfigField(default=0.05, ge=0.0, writable=True)

    # Max noun-phrase concepts extracted per chunk.
    concept_max_per_chunk: int = ConfigField(default=10, ge=1, writable=True)

    # spaCy NER labels kept by the wiki entity extractor. Anything not
    # in this set (QUANTITY, CARDINAL, DATE, TIME, MONEY, PERCENT,
    # ORDINAL, ...) is dropped before aggregation. Override via
    # LILBEE_CONCEPT_ALLOWED_ENT_TYPES as a comma-separated list.
    concept_allowed_ent_types: frozenset[str] = Field(default=DEFAULT_ALLOWED_NER_LABELS)

    # Strategy used to extract entities for the concept/entity wiki.
    # NER_ENTITIES (default) pulls typed NER entities with spaCy; concept
    # pages are proposed by the LLM inside the per-source batched call,
    # not by the extractor. NER_CONCEPTS_PLUS_LLM_TYPES layers an
    # LLM-proposed domain schema on top. LLM_TAGGED asks the LLM to tag
    # every chunk (most expensive). Unimplemented modes fall back to
    # NER_ENTITIES.
    wiki_entity_mode: WikiEntityMode = ConfigField(
        default=WikiEntityMode.NER_ENTITIES, writable=True
    )

    # Minimum distinct chunk mentions before an entity or concept earns
    # its own wiki page. Filters one-off noise.
    wiki_entity_min_mentions: int = ConfigField(default=3, ge=1, writable=True)

    # Maximum chunks passed into each concept or entity page generation
    # call. Caps context size so one page does not blow the context
    # window on a prolific topic.
    wiki_concept_max_chunks_per_page: int = ConfigField(default=25, ge=1, writable=True)

    # Maximum number of related concepts the model is asked to list in
    # the `## Related` section of each page.
    wiki_related_max: int = ConfigField(default=8, ge=0, writable=True)

    # Auto-update cap: if a single sync touches more than this many
    # concept or entity pages, skip the per-slug regeneration and tell
    # the user to run `lilbee wiki update` explicitly. Keeps a surprise
    # bulk import from firing hundreds of LLM calls.
    wiki_ingest_update_cap: int = ConfigField(default=20, ge=1, writable=True)

    # Whether the per-source batched call asks the LLM to curate
    # concept pages alongside the pre-extracted entity list. False →
    # entity sections only, no concept curation (incremental ingest
    # path uses this to avoid churning concept slugs per source-touch).
    wiki_extract_concepts: bool = ConfigField(default=True, writable=True)

    # Minimum chunk count a source must contribute before it is eligible
    # for concept curation. Sources below the floor still get a batched
    # call when they have entities (the prompt writes entity-only
    # sections); sources below the floor with zero entities are skipped
    # entirely. Prevents boilerplate / TOC / appendix documents from
    # burning an LLM call to invent "concepts".
    wiki_batch_min_chunks: int = ConfigField(default=3, ge=1, writable=True)
    # Prompt template for the per-source batched call. Placeholders:
    # {source}, {entity_list}, {chunks_text}, {concept_instruction}.
    # {concept_instruction} is filled with a concept-curation paragraph
    # when concepts are requested, or the empty string otherwise.
    # Doubled braces ({{...}}) are literal-brace escapes for .format().
    wiki_entity_batch_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are writing wiki sections based on these chunks from {source}.\n\n"
            "{concept_instruction}"
            "Write a wiki section for each of these NER ENTITIES: {entity_list}\n\n"
            "Format each section exactly as:\n"
            "## Name\n"
            "{{content with [^src1]-style citations}}\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For interpretations or connections not directly stated, mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. End the response with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {{source_name}}, excerpt: "exact quoted text"\n'
            '[^src2]: {{source_name}}, excerpt: "exact quoted text"\n\n'
            "Source chunks:\n{chunks_text}\n"
        ),
    )
788 # Class variable — not a settings field
789 _toml_cache: ClassVar[dict[str, Any]] = {}
791 @field_validator(
792 "temperature",
793 "top_p",
794 "repeat_penalty",
795 "top_k_sampling",
796 "num_ctx",
797 "seed",
798 mode="before",
799 )
800 @classmethod
801 def _empty_string_to_none(cls, v: Any) -> Any:
802 if isinstance(v, str) and v.strip() == "":
803 return None
804 return v
806 @field_validator("enable_ocr", mode="before")
807 @classmethod
808 def _parse_enable_ocr(cls, v: Any) -> bool | None:
809 """Parse enable_ocr from env var string or direct value.
811 Accepts: true/false/1/0/yes/no (case-insensitive), empty string
812 or None for auto-detect.
813 """
814 if v is None:
815 return None
816 if isinstance(v, bool):
817 return v
818 if isinstance(v, str):
819 if v.strip().lower() in ("", "auto", "none"):
820 return None
821 try:
822 return _parse_bool(v)
823 except ValueError:
824 pass
825 return bool(v)
827 @field_validator("flash_attention", mode="before")
828 @classmethod
829 def _parse_flash_attention(cls, v: Any) -> bool | None:
830 """Auto/on/off tri-state: empty/auto/none -> None, else parse bool."""
831 if v is None:
832 return None
833 if isinstance(v, bool):
834 return v
835 if isinstance(v, str):
836 if v.strip().lower() in ("", "auto", "none"):
837 return None
838 try:
839 return _parse_bool(v)
840 except ValueError:
841 return None
842 return bool(v)
844 @field_validator("n_gpu_layers", mode="before")
845 @classmethod
846 def _parse_n_gpu_layers(cls, v: Any) -> int | None:
847 """Auto -> None, ``cpu`` alias -> 0, integers parsed verbatim."""
848 if v is None:
849 return None
850 if isinstance(v, str):
851 label = v.strip().lower()
852 if label in ("", "auto", "none"):
853 return None
854 if label == "cpu":
855 return 0
856 try:
857 return int(label)
858 except ValueError:
859 log.warning("Invalid LILBEE_N_GPU_LAYERS=%r, using auto", v)
860 return None
861 return int(v)
863 @field_validator("semantic_chunking", mode="before")
864 @classmethod
865 def _parse_semantic_chunking(cls, v: Any) -> bool:
866 """Parse from env string; invalid values warn and fall back to False."""
867 if isinstance(v, bool):
868 return v
869 if isinstance(v, str):
870 try:
871 return _parse_bool(v)
872 except ValueError:
873 log.warning("Invalid LILBEE_SEMANTIC_CHUNKING=%r, using default False", v)
874 return False
875 return bool(v)
877 @field_validator(
878 "chat_model", "embedding_model", "vision_model", "reranker_model", mode="after"
879 )
880 @classmethod
881 def _normalize_model_tag(cls, v: str, info: ValidationInfo) -> str:
882 """Validate and canonicalize a model ref; blank clears optional roles."""
883 if not v or not v.strip():
884 if info.field_name in {"chat_model", "embedding_model"}:
885 raise ValueError(f"{info.field_name} must not be blank")
886 return ""
887 from lilbee.providers.model_ref import parse_model_ref
889 return parse_model_ref(v).for_openai_prefix()
891 @field_validator("cors_origins", mode="before")
892 @classmethod
893 def _split_cors_origins(cls, v: Any) -> Any:
894 if isinstance(v, str):
895 return [o.strip() for o in v.split(",") if o.strip()]
896 return v
898 @field_validator("crawl_exclude_patterns", mode="before")
899 @classmethod
900 def _split_crawl_exclude_patterns(cls, v: Any) -> Any:
901 """Accept newline-separated strings from env vars / plain-text config.
903 Regex commonly uses commas (e.g. `{2,4}`) and pipes (alternation), so
904 newline is the only separator safe to use for this field. TOML lists
905 and JSON arrays pass through unchanged.
906 """
907 if isinstance(v, str):
908 return [p.strip() for p in v.splitlines() if p.strip()]
909 return v
911 @field_validator("crawl_exclude_patterns", mode="after")
912 @classmethod
913 def _validate_crawl_exclude_patterns(cls, v: list[str]) -> list[str]:
914 """Reject any entry that isn't a valid Python regex.
916 These patterns are compiled at crawl time. An invalid pattern there
917 surfaces as an opaque mid-crawl error; catching it at PATCH time gives
918 the user a 400 with a pointer to the bad entry.
919 """
920 import re
922 bad: list[str] = []
923 for i, pattern in enumerate(v):
924 try:
925 re.compile(pattern)
926 except re.error as exc:
927 bad.append(f"[{i}] {pattern!r}: {exc}")
928 if bad:
929 raise ValueError("invalid regex in crawl_exclude_patterns:\n " + "\n ".join(bad))
930 return v
932 @field_validator("ignore_dirs", mode="before")
933 @classmethod
934 def _merge_ignore_dirs(cls, v: Any) -> frozenset[str]:
935 if isinstance(v, str):
936 extra = frozenset(name.strip() for name in v.split(",") if name.strip())
937 return DEFAULT_IGNORE_DIRS | extra
938 if isinstance(v, (set, frozenset, list)):
939 return DEFAULT_IGNORE_DIRS | frozenset(v)
940 return DEFAULT_IGNORE_DIRS
942 @field_validator("concept_allowed_ent_types", mode="before")
943 @classmethod
944 def _parse_ent_types(cls, v: Any) -> frozenset[str]:
945 """Replace-semantics override: a narrowed set is used as-is,
946 not unioned with defaults. A user asking for ``PERSON,ORG``
947 wants exactly those kinds. Accepts comma-separated strings
948 from env and list / set / frozenset from code. Empty input
949 falls back to :data:`DEFAULT_ALLOWED_NER_LABELS` so an empty
950 env var does not silently disable the gate.
951 """
952 if isinstance(v, str):
953 parts = frozenset(name.strip().upper() for name in v.split(",") if name.strip())
954 return parts or DEFAULT_ALLOWED_NER_LABELS
955 if isinstance(v, (set, frozenset, list)):
956 parts = frozenset(str(x).upper() for x in v)
957 return parts or DEFAULT_ALLOWED_NER_LABELS
958 return DEFAULT_ALLOWED_NER_LABELS
960 @model_validator(mode="before")
961 @classmethod
962 def _resolve_defaults(cls, data: Any) -> Any:
963 from lilbee.platform import canonical_models_dir, default_data_dir, find_local_root
965 if not isinstance(data, dict): # pragma: no cover
966 return data
968 _UNSET = Path()
970 if data.get("data_root") in (None, _UNSET):
971 data_env = os.environ.get("LILBEE_DATA", "").strip()
972 if data_env:
973 data["data_root"] = Path(data_env)
974 else:
975 local = find_local_root()
976 data["data_root"] = local if local is not None else default_data_dir()
977 root = data["data_root"]
978 if data.get("documents_dir") in (None, _UNSET):
979 data["documents_dir"] = root / "documents"
980 if data.get("data_dir") in (None, _UNSET):
981 data["data_dir"] = root / "data"
982 if data.get("lancedb_dir") in (None, _UNSET):
983 data["lancedb_dir"] = root / "data" / "lancedb"
984 if data.get("models_dir") in (None, _UNSET):
985 data["models_dir"] = canonical_models_dir()
987 return data
989 @classmethod
990 def settings_customise_sources(
991 cls,
992 settings_cls: type[BaseSettings],
993 init_settings: Any,
994 env_settings: Any,
995 dotenv_settings: Any,
996 file_secret_settings: Any,
997 ) -> tuple[Any, ...]:
998 from lilbee.platform import default_data_dir, find_local_root
1000 data_env = os.environ.get("LILBEE_DATA", "")
1001 if data_env:
1002 toml_dir = Path(data_env)
1003 else:
1004 local = find_local_root()
1005 toml_dir = local if local else default_data_dir()
1006 toml_path = toml_dir / "config.toml"
1008 plain_env = _PlainEnvSource(settings_cls, env_prefix="LILBEE_", env_ignore_empty=True)
1009 sources: list[Any] = [init_settings, plain_env]
1010 if toml_path.exists() and os.environ.get("LILBEE_SKIP_TOML_CONFIG") != "1":
1011 sources.append(_TomlSource(settings_cls, toml_path))
1012 return tuple(sources)
1014 @property
1015 def model_defaults(self) -> Any:
1016 """Per-model generation defaults (read-only). Set via apply_model_defaults()."""
1017 return self._model_defaults
1019 def apply_model_defaults(self, defaults: Any) -> None:
1020 """Store per-model generation defaults for 3-layer merge."""
1021 object.__setattr__(self, "_model_defaults", defaults)
1023 def clear_model_defaults(self) -> None:
1024 """Reset per-model defaults to None."""
1025 object.__setattr__(self, "_model_defaults", None)
1027 def generation_options(self, **overrides: Any) -> dict[str, Any]:
1028 """Merge model defaults, user config, and per-call overrides, dropping None."""
1029 result = _model_defaults_dict(self._model_defaults)
1030 user_fields: dict[str, Any] = {
1031 "temperature": self.temperature,
1032 "top_p": self.top_p,
1033 "top_k": self.top_k_sampling,
1034 "repeat_penalty": self.repeat_penalty,
1035 "num_ctx": self.num_ctx,
1036 "seed": self.seed,
1037 "max_tokens": self.max_tokens,
1038 }
1039 for k, v in user_fields.items():
1040 if v is not None:
1041 result[k] = v
1042 for k, v in overrides.items():
1043 if v is not None:
1044 result[k] = v
1045 return result
def _model_defaults_dict(defaults: Any) -> dict[str, Any]:
    """Return the non-None fields of a ModelDefaults instance as a dict.

    None input (no defaults applied) yields an empty dict.
    """
    if defaults is None:
        return {}
    from dataclasses import fields as dc_fields

    result: dict[str, Any] = {}
    for field in dc_fields(defaults):
        value = getattr(defaults, field.name)
        if value is not None:
            result[field.name] = value
    return result
class _PlainEnvSource:
    """Reads LILBEE_* env vars as plain strings so field validators handle parsing."""

    def __init__(
        self,
        settings_cls: type[BaseSettings],
        env_prefix: str,
        env_ignore_empty: bool = True,
    ) -> None:
        # Only names that are actual model fields are ever looked up.
        self._prefix = env_prefix
        self._ignore_empty = env_ignore_empty
        self._fields = set(settings_cls.model_fields)

    def __call__(self) -> dict[str, Any]:
        collected: dict[str, Any] = {}
        for name in self._fields:
            raw = os.environ.get(f"{self._prefix}{name.upper()}")
            if raw is None:
                continue
            if self._ignore_empty and raw == "":
                continue
            collected[name] = raw
        return collected
class _TomlSource:
    """Custom pydantic-settings source that reads config.toml.

    Scalar values are stringified so they take the same before-validator
    parsing path as env vars. Lists and tables are passed through
    unchanged: the crawl_exclude_patterns validator documents that "TOML
    lists and JSON arrays pass through unchanged", and str([...]) would
    collapse a list into a single bogus entry like "['a', 'b']".

    Read failures (missing/corrupt file) are non-fatal: warn and return
    an empty mapping so the remaining sources still apply.
    """

    def __init__(self, settings_cls: type[BaseSettings], path: Path) -> None:
        self._path = path

    def __call__(self) -> dict[str, Any]:
        import tomllib

        try:
            with self._path.open("rb") as f:
                data = tomllib.load(f)
        except (ValueError, OSError):
            # tomllib.TOMLDecodeError subclasses ValueError.
            log.warning("Failed to read %s, ignoring", self._path)
            return {}
        return {
            k: v if isinstance(v, (list, dict)) else str(v)
            for k, v in data.items()
        }
def _build_cfg() -> tuple[Config, Exception | None]:
    """Build cfg; on stale-config validation failure, fall back to defaults.

    A persisted ``config.toml`` from before a breaking schema change can
    contain values the new validators reject. Crashing at module import
    means every command (``lilbee --help`` included) emits a Python
    traceback. Falling back to env+defaults lets the package load; the
    CLI / TUI surfaces the original error before doing real work.

    Returns:
        (config, error) — error is None on a clean load, otherwise the
        exception from the first attempt while config is built with the
        TOML source disabled.
    """
    try:
        return Config(), None
    except Exception as exc:
        # Retry with the TOML source disabled. Preserve any value the
        # user had already set for LILBEE_SKIP_TOML_CONFIG instead of
        # unconditionally popping it afterwards.
        prior = os.environ.get("LILBEE_SKIP_TOML_CONFIG")
        os.environ["LILBEE_SKIP_TOML_CONFIG"] = "1"
        try:
            return Config(), exc
        finally:
            if prior is None:
                os.environ.pop("LILBEE_SKIP_TOML_CONFIG", None)
            else:
                os.environ["LILBEE_SKIP_TOML_CONFIG"] = prior
# Module-level singletons: the process-wide Config instance and the deferred
# load error (None on a clean load), surfaced later by the CLI / TUI.
cfg, config_load_error = _build_cfg()