Coverage for src / lilbee / config.py: 100%
428 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Application configuration for lilbee.
3All settings can be overridden via environment variables prefixed with LILBEE_.
4Uses pydantic-settings for automatic env var loading with TOML config file support.
5"""
7import logging
8import os
9import sys
10from enum import StrEnum
11from pathlib import Path
12from typing import Any, ClassVar
14from pydantic import Field, ValidationInfo, field_validator, model_validator
15from pydantic_settings import BaseSettings, SettingsConfigDict
17from lilbee.providers.model_ref import PROVIDER_PREFIXES
class ClustererBackend(StrEnum):
    """Known wiki clusterer backends.

    Selected via ``Config.wiki_clusterer``; CONCEPTS requires the
    [graph] extra and falls back to EMBEDDING when unavailable.
    """

    EMBEDDING = "embedding"
    CONCEPTS = "concepts"
class KvCacheType(StrEnum):
    """KV cache element type. q8_0 / q4_0 require flash attention."""

    F16 = "f16"    # 2 bytes/element (default on Config.kv_cache_type)
    F32 = "f32"    # 4 bytes/element, full precision
    Q8_0 = "q8_0"  # budgeted at 1 byte/element (see KV_CACHE_TYPE_BYTES)
    Q4_0 = "q4_0"  # budgeted at 1 byte/element (see KV_CACHE_TYPE_BYTES)
# Bytes per KV element for memory budgeting. q* shapes are 1 byte of data
# plus shared scales, close enough for budgeting.
KV_CACHE_TYPE_BYTES: dict[KvCacheType, int] = {
    KvCacheType.F16: 2,
    KvCacheType.F32: 4,
    KvCacheType.Q8_0: 1,
    # 4-bit data is rounded up to 1 byte so the budget stays conservative.
    KvCacheType.Q4_0: 1,
}
class WikiEntityMode(StrEnum):
    """Strategy used to extract entities for the wiki.

    Phase D: the extractor no longer emits concepts — concept pages
    are proposed by the LLM inside the per-source batched call in
    ``wiki.gen``. The enum values reflect the extractor's current
    responsibility (typed NER entities only).
    """

    # Default: typed spaCy NER entities only (see Config.wiki_entity_mode).
    NER_ENTITIES = "ner_entities"
    # Layers an LLM-proposed domain schema on top of NER output.
    NER_CONCEPTS_PLUS_LLM_TYPES = "ner_concepts_plus_llm_types"
    # LLM tags every chunk — most expensive mode.
    LLM_TAGGED = "llm_tagged"
def ConfigField(
    *args: Any,
    writable: bool = False,
    reindex: bool = False,
    write_only: bool = False,
    public: bool = True,
    **kwargs: Any,
) -> Any:
    """Wrap pydantic ``Field`` and attach metadata via ``json_schema_extra``.

    Only non-default flags are recorded so plain fields carry no extra:
    ``writable`` / ``reindex`` / ``write_only`` are stored when True,
    and ``public`` is stored only when False.
    """
    metadata: dict[str, bool] = {
        flag: True
        for flag, enabled in (
            ("writable", writable),
            ("reindex", reindex),
            ("write_only", write_only),
        )
        if enabled
    }
    if not public:
        metadata["public"] = False
    if metadata:
        kwargs["json_schema_extra"] = metadata
    return Field(*args, **kwargs)
83log = logging.getLogger(__name__)
85# Test-only bypass. Both the env var and pytest must be present so a
86# leaked env var cannot disable validation in production.
87_SKIP_MODEL_TASK_VALIDATION_ENV = "LILBEE_SKIP_MODEL_TASK_VALIDATION"
90def _model_task_validation_bypassed() -> bool:
91 if not os.environ.get(_SKIP_MODEL_TASK_VALIDATION_ENV):
92 return False
93 return sys.modules.get("pytest") is not None
# Maps each Config model field to the catalog task string its assignment
# must match (consumed by _enforce_role_match).
_MODEL_FIELD_TO_TASK: dict[str, str] = {
    "chat_model": "chat",
    "embedding_model": "embedding",
    "vision_model": "vision",
    "reranker_model": "rerank",
}
def _find_model_catalog_entry(ref: str) -> Any:
    """Look up *ref* in the featured catalog.

    NOTE(review): callers treat a missing entry as ``None`` — confirm
    against ``find_catalog_entry``'s contract.
    """
    # Deferred import — circular import: catalog imports cfg.
    from lilbee.catalog import find_catalog_entry

    return find_catalog_entry(ref)
def _enforce_role_match(ref: str, entry: Any, field_name: str) -> None:
    """Raise ValueError when *entry*'s task differs from *field_name*'s role.

    No-op when the catalog entry's task matches the task mapped from
    *field_name* via _MODEL_FIELD_TO_TASK.
    """
    # Deferred import — presumably avoids an import cycle, matching the
    # pattern in _find_model_catalog_entry; confirm if hoisting.
    from lilbee.models import ModelTask

    want = ModelTask(_MODEL_FIELD_TO_TASK[field_name])
    if entry.task == want:
        return
    # Error text is produced by the server handlers' formatter so CLI and
    # HTTP surfaces report the mismatch identically.
    from lilbee.server.handlers import format_task_mismatch

    raise ValueError(format_task_mismatch(ref, ModelTask(entry.task), want))
def _skips_catalog_check(ref: str, *, allow_bypass: bool) -> bool:
    """True when *ref* should bypass the featured-catalog assignment check."""
    # Empty / whitespace-only refs have nothing to validate.
    if not ref or not ref.strip():
        return True
    # Test-only escape hatch (env var + pytest both required).
    if allow_bypass and _model_task_validation_bypassed():
        return True
    # Provider-prefixed refs skip the catalog; routing enforces taxonomy.
    prefix = ref.partition("/")[0]
    return prefix in PROVIDER_PREFIXES
def validate_model_task_assignment(field_name: str, ref: str, *, allow_bypass: bool = True) -> str:
    """Check *ref* is a featured-catalog entry whose task matches *field_name*.

    Provider-prefixed refs (``ollama/``, ``openai/`` ...) skip the catalog
    check; routing enforces task taxonomy for them. ``allow_bypass=True``
    honors ``LILBEE_SKIP_MODEL_TASK_VALIDATION`` for tests; explicit user
    actions pass ``allow_bypass=False`` to force the check.
    """
    if _skips_catalog_check(ref, allow_bypass=allow_bypass):
        return ref

    entry = _find_model_catalog_entry(ref)
    if entry is None:
        raise ValueError(
            f"Model '{ref}' is not in the featured catalog. "
            "Pick a featured model for this role, or install one via "
            "POST /api/models/pull with a known catalog ref."
        )
    _enforce_role_match(ref, entry, field_name)

    # Keep a full ``<repo>/<file>.gguf`` so resolve_model_path lands on
    # the exact installed quant; fall back to the catalog ref otherwise.
    names_exact_quant = ref.endswith(".gguf") and ref.count("/") >= 2
    canonical: str = entry.ref
    return ref if names_exact_quant else canonical
157_BOOL_TRUE = frozenset({"true", "1", "yes"})
158_BOOL_FALSE = frozenset({"false", "0", "no"})
161def _parse_bool(raw: str) -> bool:
162 """Parse true/1/yes or false/0/no; raises ValueError on anything else."""
163 normalized = raw.strip().lower()
164 if normalized in _BOOL_TRUE:
165 return True
166 if normalized in _BOOL_FALSE:
167 return False
168 raise ValueError(f"Invalid boolean: {raw!r}")
# Default for ``Config.ignore_dirs`` — dependency trees and build /
# coverage artifacts that should never be indexed.
DEFAULT_IGNORE_DIRS = frozenset(
    {
        "node_modules",
        "__pycache__",
        "venv",
        "build",
        "dist",
        "target",
        "vendor",
        "_build",
        "coverage",
        "htmlcov",
    }
)

# spaCy NER labels that map onto something wiki-shaped. Excludes
# QUANTITY / ORDINAL / CARDINAL / DATE / TIME / MONEY / PERCENT /
# LANGUAGE / LAW because pages for "42" or "2021" are never useful.
# FAC (buildings / airports) and NORP (nationalities / political /
# religious groups) are included because corpora routinely surface
# them as wiki-worthy topics.
DEFAULT_ALLOWED_NER_LABELS = frozenset(
    {"PERSON", "ORG", "GPE", "LOC", "EVENT", "WORK_OF_ART", "PRODUCT", "FAC", "NORP"}
)
# Timeout for backend catalog / management HTTP calls.
DEFAULT_HTTP_TIMEOUT = 30.0

# Safe default + cap for chat-mode n_ctx; full 128K+ training contexts OOM laptops.
DEFAULT_NUM_CTX = 8192

# Index table names. Underscore-prefixed names appear to be internal
# bookkeeping tables (e.g. _citations is described as auto-generated in
# the wiki prompts below).
CHUNKS_TABLE = "chunks"
SOURCES_TABLE = "_sources"
CITATIONS_TABLE = "_citations"
META_TABLE = "_meta"
CONCEPT_NODES_TABLE = "concept_nodes"
CONCEPT_EDGES_TABLE = "concept_edges"
CHUNK_CONCEPTS_TABLE = "chunk_concepts"
# Default URL-exclusion regexes for recursive crawls. Grouped by source
# CMS / category. User overrides come from LILBEE_CRAWL_EXCLUDE_PATTERNS
# (newline-separated) or config.toml.

# WordPress scaffolding: admin UIs, APIs, RPC, numeric permalinks, Elementor.
_WP_EXCLUDE: tuple[str, ...] = (
    r"/wp-admin/",
    r"/wp-login(\.php)?",
    r"/wp-json/",
    r"/xmlrpc\.php",
    r"/wp-cron\.php",
    r"/wp-includes/",
    r"/wp-content/uploads/",
    r"\?p=\d+",
    r"\?page_id=\d+",
    r"\?cat=\d+",
    r"/elementor-\d+",
    r"\?elementor_library",
)

# Pagination and archive permalinks (WP + other CMSes share this shape).
_ARCHIVE_EXCLUDE: tuple[str, ...] = (
    r"/page/\d+/?$",
    r"\?paged?=\d+",
    # Date archives: /2024, /2024/05, /2024/05/01.
    r"/20\d{2}(/\d{2}(/\d{2})?)?/?$",
    r"/tag/",
    r"/category/",
    r"/author/",
    r"/archives?/?$",
    r"/comment-page-\d+",
)

# Syndication feeds (content-duplicated in HTML pages).
_FEED_EXCLUDE: tuple[str, ...] = (
    r"/feed/?$",
    r"/feed/atom/?$",
    r"/feed/rdf/?$",
    r"/comments/feed/?$",
    r"/rss/?$",
)

# Duplicate views of the same canonical page (AMP, print, preview).
_DUPLICATE_VIEW_EXCLUDE: tuple[str, ...] = (
    r"/amp/?$",
    r"\?amp=",
    r"\?print=",
    r"/print/?$",
    r"\?preview=",
)

# WP attachment URLs (point at media, not content pages).
_ATTACHMENT_EXCLUDE: tuple[str, ...] = (
    r"/attachment/",
    r"\?attachment_id=",
)

# Auth and account flows (generic across CMSes and e-commerce platforms).
_AUTH_EXCLUDE: tuple[str, ...] = (
    r"/login",
    r"/logout",
    r"/register",
    r"/signup",
    r"/signin",
    r"/account",
    r"/my-account/",
    r"/profile",
    r"/password-reset",
    r"/forgot-password",
)

# E-commerce transactional flows (cart / checkout / compare / etc.).
_ECOMMERCE_EXCLUDE: tuple[str, ...] = (
    r"/cart",
    r"/checkout",
    r"/wishlist",
    r"/orders?",
    r"/compare",
    r"/products\.json",
    r"/collections/.+/products/.+\?page=",
)

# Marketing / tracking query parameters (utm_*, fbclid, gclid, etc.).
# Single alternation so one compiled regex covers every tracker.
_TRACKING_EXCLUDE: tuple[str, ...] = (
    (
        r"[?&]("
        r"utm_[a-z_]+"
        r"|fbclid|gclid|msclkid|yclid"
        r"|mc_cid|mc_eid"
        r"|_hsenc|_hsmi|hsCtaTracking"
        r"|mkt_tok|mkt_[a-z_]+"
        r"|trk|trkInfo"
        r"|dm_i"
        r"|vero_id|vero_conv"
        r"|oly_anon_id|oly_enc_id"
        r"|igshid"
        r"|pk_campaign|pk_source|pk_medium|pk_[a-z_]+"
        r"|_ga"
        r"|ref|referrer"
        r"|affiliate|aff_id|aff_ref|aff|partner"
        r"|srsltid"
        r"|share|replytocom"
        r")="
    ),
)

# Site-meta URLs and non-HTML resources; skipped before fetch.
_META_EXCLUDE: tuple[str, ...] = (
    r"/sitemap[^/]*\.xml",
    r"/robots\.txt",
    r"/humans\.txt",
    r"/favicon\.ico",
    r"/\.well-known/",
    r"\.(jpe?g|png|gif|webp|avif|svg|ico|pdf|docx?|xlsx?|pptx?|zip|tar|gz|mp3|mp4|webm|ogg|ttf|woff2?|css|js|map|json|xml)(\?.*)?$",
)

# Mediawiki/Wikipedia navlinks that dominate BFS before the article body.
_MEDIAWIKI_EXCLUDE: tuple[str, ...] = (
    r"/wiki/Main_Page$",
    r"/wiki/Wikipedia:",
    r"/wiki/Portal:",
    r"/wiki/Help:",
    r"/wiki/Special:",
    r"/wiki/Category:",
    r"/wiki/Template:",
    r"/wiki/Template_talk:",
    r"/wiki/Talk:",
    r"/wiki/File:",
    r"/wiki/File_talk:",
    r"/wiki/User:",
    r"/wiki/User_talk:",
    r"/w/index\.php",
)

# Flattened union of every category tuple above; the effective default
# for Config.crawl_exclude_patterns.
DEFAULT_CRAWL_EXCLUDE_PATTERNS: tuple[str, ...] = (
    *_WP_EXCLUDE,
    *_ARCHIVE_EXCLUDE,
    *_FEED_EXCLUDE,
    *_DUPLICATE_VIEW_EXCLUDE,
    *_ATTACHMENT_EXCLUDE,
    *_AUTH_EXCLUDE,
    *_ECOMMERCE_EXCLUDE,
    *_TRACKING_EXCLUDE,
    *_META_EXCLUDE,
    *_MEDIAWIKI_EXCLUDE,
)
# Default for ``Config.system_prompt`` (writable; user-overridable).
_DEFAULT_SYSTEM_PROMPT = (
    "You are a precise, direct assistant grounded in the provided context. "
    "Answer using only the context — if it doesn't contain enough information, "
    "say so rather than guessing. Be specific: quote relevant passages and "
    "reference context by number (e.g. [1], [2]) inline. Prefer exact values "
    "over approximations. For code, prefer working examples over abstract "
    "explanations. Keep responses concise unless asked to elaborate."
)

# CORS allow-origin regex: Obsidian (desktop + iOS) and localhost loopback.
# Mutating endpoints still require auth regardless of origin.
_DEFAULT_CORS_ORIGIN_REGEX = (
    r"^(app://obsidian\.md"
    r"|capacitor://localhost"
    r"|https?://localhost(:\d+)?"
    r"|https?://127\.0\.0\.1(:\d+)?"
    r"|https?://\[::1\](:\d+)?)$"
)
class Config(BaseSettings):
    """Runtime configuration — one singleton instance, mutated by CLI overrides."""

    model_config = SettingsConfigDict(
        env_prefix="LILBEE_",       # every field reads LILBEE_<NAME> from the env
        validate_assignment=True,   # re-run validators when attributes are mutated
        arbitrary_types_allowed=True,
        extra="ignore",             # unknown env vars / config keys are dropped silently
    )
    # Paths — resolved from env/defaults in model_validator(mode='before').
    # The Path() placeholders are replaced before use.
    data_root: Path = Field(default=Path())
    # Writable so plugin-managed servers can pivot storage to a vault path on
    # first boot; rebuild the index after migrating.
    documents_dir: Path = ConfigField(default=Path(), writable=True)
    data_dir: Path = Field(default=Path())
    lancedb_dir: Path = Field(default=Path())
    models_dir: Path = Field(default=Path())
    # Obsidian vault root; when set, search results carry a vault-relative
    # ``vault_path`` for native-UI deep-links.
    vault_base: Path | None = ConfigField(default=None, writable=True)

    # Model refs are ``<repo>/<file>.gguf`` catalog paths or provider-prefixed
    # refs; validated by validate_model_task_assignment.
    chat_model: str = Field(default="Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q4_K_M.gguf", min_length=1)
    embedding_model: str = Field(
        default="nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q4_K_M.gguf",
        min_length=1,
    )
    # Vision OCR model for scanned PDFs and image-only pages. Empty = disabled;
    # there is no cross-role fallback onto the chat model even if multimodal.
    vision_model: str = ConfigField(default="", public=True)
    # Embedding dimension; must match the embedding model's output width.
    embedding_dim: int = Field(default=768, ge=1)
    chunk_size: int = ConfigField(default=512, ge=64, writable=True, reindex=True)
    chunk_overlap: int = ConfigField(default=100, ge=0, writable=True, reindex=True)
    # Hard character cap on text sent per embedding call.
    max_embed_chars: int = Field(default=2000, ge=1)
    top_k: int = ConfigField(default=10, ge=1, writable=True)
    # Cosine-distance ceiling for accepted search hits.
    max_distance: float = ConfigField(default=0.9, ge=0.0, writable=True)
    # Minimum RRF relevance score for hybrid search results (0.0 = no filtering).
    min_relevance_score: float = ConfigField(default=0.0, ge=0.0, writable=True)
    # Enables the adaptive-widening retry (see adaptive_threshold_step below).
    adaptive_threshold: bool = Field(default=False)
    system_prompt: str = ConfigField(default=_DEFAULT_SYSTEM_PROMPT, min_length=1, writable=True)
    ignore_dirs: frozenset[str] = Field(default=DEFAULT_IGNORE_DIRS)
    # OCR for scanned PDFs via vision-capable chat model.
    # None = auto-detect (use OCR if chat model is vision-capable).
    # True = force OCR regardless of detection.
    # False = disable OCR entirely.
    enable_ocr: bool | None = ConfigField(default=None, writable=True)
    # Per-page timeout in seconds for vision OCR (0 = no limit).
    ocr_timeout: float = ConfigField(default=120.0, ge=0.0, writable=True)
    # Max concurrent vision-OCR requests per PDF. Default 1 (serial) — raise
    # only when the vision model is network-hosted with meaningful latency
    # (remote API, separate Ollama host). Local GPU models contend on a
    # single device and get slower with concurrency > 1.
    vision_concurrency: int = ConfigField(default=1, ge=1, writable=True)

    # Tesseract fallback wall-clock timeout per file, seconds. 0 = no cap.
    tesseract_timeout: float = ConfigField(default=60.0, ge=0.0, writable=True)
    semantic_chunking: bool = ConfigField(default=False, writable=True)
    # Similarity threshold for the semantic-chunking topic splitter.
    topic_threshold: float = ConfigField(default=0.75, ge=0.0, le=1.0, writable=True)
    server_host: str = "127.0.0.1"
    # 0 presumably means "pick an ephemeral port" — confirm at bind site.
    server_port: int = Field(default=0, ge=0, le=65535)
    cors_origins: list[str] = Field(default_factory=list)
    cors_origin_regex: str = Field(default=_DEFAULT_CORS_ORIGIN_REGEX)
    # Seconds between SSE heartbeat events when the producer queue is idle.
    # Must stay well below the plugin's STREAM_IDLE_TIMEOUT_MS (120s) so a
    # single long-running vision OCR page can't starve the client into aborting.
    sse_heartbeat_interval: float = ConfigField(default=30.0, ge=0.0, writable=True)
    json_mode: bool = False
    # Sampling knobs. None = provider default (empty env strings coerce to
    # None via _empty_string_to_none).
    temperature: float | None = ConfigField(default=None, ge=0.0, writable=True)
    top_p: float | None = ConfigField(default=None, ge=0.0, le=1.0, writable=True)
    top_k_sampling: int | None = ConfigField(default=None, ge=1, writable=True)
    # 1.1 is llama.cpp's default. Leaving this at None caused n-gram loops
    # ("tire tire tire...") on some open-weights models.
    repeat_penalty: float | None = ConfigField(default=1.1, ge=0.0, writable=True)
    num_ctx: int | None = ConfigField(default=None, ge=1, writable=True)
    max_tokens: int | None = ConfigField(default=4096, ge=1, writable=True)
    seed: int | None = ConfigField(default=None, writable=True)
    llm_provider: str = ConfigField(default="auto", writable=True)
    remote_base_url: str = ConfigField(default="http://localhost:11434", writable=True)
    # API keys are write-only: never echoed back through public config reads.
    llm_api_key: str = ConfigField(default="", writable=True, write_only=True)
    openai_api_key: str = ConfigField(default="", writable=True, write_only=True)
    anthropic_api_key: str = ConfigField(default="", writable=True, write_only=True)
    gemini_api_key: str = ConfigField(default="", writable=True, write_only=True)
    # Retrieval quality knobs.

    # Max chunks per source in top-k; prevents one large file monopolizing results.
    diversity_max_per_source: int = ConfigField(default=3, ge=1, writable=True)

    # MMR relevance/diversity tradeoff; 0 = max diversity, 1 = pure relevance
    # (Carbonell & Goldstein 1998).
    mmr_lambda: float = ConfigField(default=0.5, ge=0.0, le=1.0, writable=True)

    # Extra candidates retrieved for MMR reranking (multiplies top_k).
    candidate_multiplier: int = ConfigField(default=3, ge=1, writable=True)

    # LLM-generated alternative queries for expansion. 0 disables.
    query_expansion_count: int = ConfigField(default=3, ge=0, writable=True)

    # Skip LLM expansion when tokenized query length ≤ this. The LLM round-trip
    # dominates latency on small local models; short queries already have strong
    # BM25/vector signal. Concept-graph expansion still runs. 0 disables the skip.
    expansion_short_query_tokens: int = ConfigField(default=2, ge=0, writable=True)

    # Cosine-distance step when adaptive-widening retry kicks in.
    adaptive_threshold_step: float = ConfigField(default=0.2, gt=0.0, writable=True)

    # Reject expansion variants below expansion_similarity_threshold.
    expansion_guardrails: bool = ConfigField(default=True, writable=True)

    # Min cosine similarity between question and variant embeddings.
    expansion_similarity_threshold: float = ConfigField(default=0.5, ge=0.0, le=1.0, writable=True)

    # Sigmoid-normalized BM25 score above which query expansion is skipped.
    expansion_skip_threshold: float = Field(default=0.8, ge=0.0, le=1.0)

    # Min BM25 top-1 vs top-2 gap to skip expansion.
    expansion_skip_gap: float = Field(default=0.15, ge=0.0, le=1.0)

    # Chunks included in LLM context after adaptive selection.
    max_context_sources: int = ConfigField(default=5, ge=1, writable=True)

    # HyDE (Gao et al. 2022): hypothetical-answer embedding search. +~500ms.
    hyde: bool = ConfigField(default=False, writable=True)

    # HyDE result weight relative to real-doc search (0.0-1.0).
    hyde_weight: float = ConfigField(default=0.7, ge=0.0, le=1.0, writable=True)

    # HyDE prompt template. Must contain {question} placeholder.
    hyde_prompt: str = (
        "Write a 50-100 word passage that directly answers this question as if "
        "it were an excerpt from a real document. Do not include any preamble, "
        "just write the passage.\n\nQuestion: {question}"
    )

    # Reranker model ref. Empty disables reranking. Native GGUFs use
    # llama-cpp rank pooling; hosted refs (cohere/voyage/jina/together/hf-tei)
    # need the backend extra.
    reranker_model: str = ConfigField(default="", public=True)

    # Candidate count sent to the reranker.
    rerank_candidates: int = ConfigField(default=20, ge=1, writable=True, public=True)

    # Date-range filter; only fires when a temporal keyword is detected.
    temporal_filtering: bool = ConfigField(default=True, writable=True)

    # If True, emit <think>…</think> content as separate SSE reasoning events;
    # if False, strip it silently.
    show_reasoning: bool = ConfigField(default=False, writable=True)
    # Web crawling.

    # Optional global ceilings. None = no ceiling.
    crawl_max_depth: int | None = ConfigField(default=None, ge=0, writable=True)
    crawl_max_pages: int | None = ConfigField(default=None, ge=1, writable=True)

    # Per-URL fetch timeout, seconds.
    crawl_timeout: int = ConfigField(default=30, ge=1, writable=True)

    # 0 = unlimited, default = CPU count.
    # NOTE(review): the field default is 0 ("unlimited") while the comment
    # says "default = CPU count" — presumably the CPU-count default is
    # applied downstream; confirm at the consumer.
    crawl_max_concurrent: int = Field(default=0, ge=0)

    # Seconds between periodic syncs during crawl. 0 = sync only at end.
    crawl_sync_interval: int = ConfigField(default=30, ge=0, writable=True)

    # Per-request delay + jitter (defaults chosen to be gentler than crawl4ai's).
    crawl_mean_delay: float = ConfigField(default=0.5, ge=0.0, writable=True)
    crawl_max_delay_range: float = ConfigField(default=0.5, ge=0.0, writable=True)

    # In-flight requests per crawl.
    crawl_concurrent_requests: int = ConfigField(default=3, ge=1, writable=True)

    # Per-domain rate-limiter that backs off on HTTP 429/503 and retries.
    crawl_retry_on_rate_limit: bool = ConfigField(default=True, writable=True)
    crawl_retry_base_delay_min: float = ConfigField(default=1.0, ge=0.0, writable=True)
    crawl_retry_base_delay_max: float = ConfigField(default=3.0, ge=0.0, writable=True)
    crawl_retry_max_backoff: float = ConfigField(default=30.0, ge=0.0, writable=True)
    crawl_retry_max_attempts: int = ConfigField(default=3, ge=0, writable=True)

    # Regex patterns dropped at link-discovery time. Defaults block CMS
    # scaffolding (WordPress admin, archives, tracking params, etc.).
    crawl_exclude_patterns: list[str] = ConfigField(
        default_factory=lambda: list(DEFAULT_CRAWL_EXCLUDE_PATTERNS),
        writable=True,
    )
    # Fraction of GPU/unified memory reserved for loaded models.
    gpu_memory_fraction: float = ConfigField(default=0.75, ge=0.1, le=1.0, writable=True)

    # Seconds a model stays loaded after last use. 0 = unload immediately.
    model_keep_alive: int = ConfigField(default=300, ge=0, writable=True)

    # Run embedding and vision inference in a subprocess (llama-cpp only).
    subprocess_embed: bool = ConfigField(default=False, writable=True)

    # Upper bound for the dynamic n_ctx picker. The picker chooses the
    # largest 256-multiple ctx that fits in available memory and the
    # model's training window; this caps it at a sane ceiling.
    num_ctx_max: int = ConfigField(default=16384, ge=512, writable=True)

    # Flash attention. None (default) = on with TypeError fallback for
    # older llama-cpp-python builds, True = force on, False = off.
    # Resolves the 'padding V cache to 1024' warning on models with
    # uneven per-layer V dims (e.g. Gemma3) and saves ~25% KV memory.
    flash_attention: bool | None = ConfigField(default=None, writable=True)

    # KV cache element type. q8_0 / q4_0 halve or quarter cache memory
    # but require flash attention to be enabled.
    kv_cache_type: KvCacheType = ConfigField(default=KvCacheType.F16, writable=True)

    # Number of model layers to offload to GPU. None (default) = all
    # layers, 0 = CPU only, positive int = partial offload. Useful when a
    # discrete GPU has less VRAM than the model needs.
    n_gpu_layers: int | None = ConfigField(default=None, writable=True)

    # True = Markdown widget for chat; False = plain Static (faster).
    markdown_rendering: bool = True

    # TUI theme name; persists the last Ctrl+T pick across sessions.
    theme: str = ConfigField(default="gruvbox", writable=True)

    # Per-model generation defaults set via apply_model_defaults().
    _model_defaults: Any = None
    # Wiki layer. LLM-maintained synthesis pages with citation provenance.
    # Off by default; flip to True (or set LILBEE_WIKI=1) to enable. When off,
    # the Wiki view tab and the chat ModelBar's scope picker are both hidden.
    wiki: bool = ConfigField(default=False, writable=True)
    # Subdirectory name for generated wiki pages.
    wiki_dir: str = "wiki"
    wiki_prune_raw: bool = ConfigField(default=False, writable=True)

    # Minimum cosine similarity between a page body and the mean of its
    # source chunk vectors before a page is published (below → drafts).
    # Replaces the old LLM-based faithfulness score: mean-of-chunks is a
    # deterministic, zero-LLM-call signal that routes topic-drifted
    # pages to drafts without the 0.0 to 1.0 ambiguity of a model-emitted
    # number. Tuning knob: swap to per-chunk max or top-K-mean if the
    # default 0.5 produces false drafts.
    wiki_embedding_faithfulness_threshold: float = ConfigField(
        default=0.5, ge=0.0, le=1.0, writable=True
    )

    # Per-call output token cap for wiki generation. Without this a
    # reasoning model (Qwen3, DeepSeek-R1) can burn the full context
    # window emitting <think> tokens before the actual answer, taking
    # minutes per page. Default leaves headroom for a typical reasoning
    # budget plus a real response (~1000 output + ~1000 slack).
    wiki_summary_max_tokens: int = ConfigField(default=2048, ge=256, writable=True)

    # Wiki generation is a structured-output task: the model must emit the
    # block separators, the citation footnotes, and verbatim quotes. The
    # usual chat default (~0.8) is too creative for that. Lowering the
    # sampling temperature makes the model stick to the template and quote
    # more faithfully. 0.1 leaves just enough slack to avoid hard loops.
    wiki_temperature: float = ConfigField(default=0.1, ge=0.0, le=2.0, writable=True)

    # Fraction of citations that must be stale before a wiki page is flagged.
    wiki_stale_citation_threshold: float = Field(default=0.5, ge=0.0, le=1.0)

    # Fraction of content changed that triggers human-review drift guard.
    wiki_drift_threshold: float = Field(default=0.3, ge=0.0, le=1.0)
    # LLM prompt templates for wiki page generation. Writable so advanced
    # users can override them from /settings, config.toml, or
    # ``LILBEE_WIKI_*_PROMPT`` env vars. Templates must keep the expected
    # ``{placeholders}``. If you remove one the generator will crash on
    # first use. The defaults below are the only reason the pipeline
    # works out of the box.
    # Placeholders: {source_name}, {chunks_text}.
    wiki_summary_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are a knowledge compiler. Given the source chunks below from a single "
            "document, write a concise wiki summary page in markdown.\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For interpretations or connections not directly stated in the source, "
            "mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. End with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {source_name}, excerpt: "exact quoted text"\n'
            '[^src2]: {source_name}, excerpt: "exact quoted text"\n\n'
            "Source document: {source_name}\n\n"
            "Chunks:\n{chunks_text}\n\n"
            "Write the wiki summary page now. Start with a heading."
        ),
    )
    # Multi-document synthesis template. Placeholders: {topic},
    # {source_list}, {chunks_text}. The doubled ``{{source_name}}`` is a
    # literal-brace escape so the example lines survive .format().
    wiki_synthesis_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are a knowledge compiler. Given source chunks from MULTIPLE documents "
            "about related concepts, write a synthesis wiki page in markdown that connects "
            "ideas across sources.\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For connections, interpretations, or patterns you identify across sources, "
            "mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. Reference each source by its filename when drawing connections.\n"
            "6. End with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {{source_name}}, excerpt: "exact quoted text"\n'
            '[^src2]: {{source_name}}, excerpt: "exact quoted text"\n\n'
            "Topic: {topic}\n\n"
            "Sources:\n{source_list}\n\n"
            "Chunks:\n{chunks_text}\n\n"
            "Write the synthesis page now. Start with a heading."
        ),
    )
    # Wiki synthesis clusterer backend. CONCEPTS requires the [graph] extra
    # and falls back to EMBEDDING when unavailable.
    wiki_clusterer: ClustererBackend = ConfigField(
        default=ClustererBackend.EMBEDDING, writable=True
    )

    # Neighborhood size for the mutual-kNN graph. 0 = auto-scale from corpus size.
    wiki_clusterer_k: int = ConfigField(default=0, ge=0, writable=True)

    # LazyGraphRAG-style concept graph. Requires the [graph] extra.
    concept_graph: bool = ConfigField(default=True, writable=True)

    # Weight of concept overlap boost relative to vector similarity.
    concept_boost_weight: float = ConfigField(default=0.3, ge=0.0, le=1.0, writable=True)

    # Floor on post-boost distance to stop weak boosts from promoting marginal hits.
    concept_boost_floor: float = ConfigField(default=0.05, ge=0.0, writable=True)

    # Max noun-phrase concepts extracted per chunk.
    concept_max_per_chunk: int = ConfigField(default=10, ge=1, writable=True)

    # spaCy NER labels kept by the wiki entity extractor. Anything not
    # in this set (QUANTITY, CARDINAL, DATE, TIME, MONEY, PERCENT,
    # ORDINAL, ...) is dropped before aggregation. Override via
    # LILBEE_CONCEPT_ALLOWED_ENT_TYPES as a comma-separated list.
    concept_allowed_ent_types: frozenset[str] = Field(default=DEFAULT_ALLOWED_NER_LABELS)

    # Strategy used to extract entities for the concept/entity wiki.
    # NER_ENTITIES (default) pulls typed NER entities with spaCy; concept
    # pages are proposed by the LLM inside the per-source batched call,
    # not by the extractor. NER_CONCEPTS_PLUS_LLM_TYPES layers an
    # LLM-proposed domain schema on top. LLM_TAGGED asks the LLM to tag
    # every chunk (most expensive). Unimplemented modes fall back to
    # NER_ENTITIES.
    wiki_entity_mode: WikiEntityMode = ConfigField(
        default=WikiEntityMode.NER_ENTITIES, writable=True
    )

    # Minimum distinct chunk mentions before an entity or concept earns
    # its own wiki page. Filters one-off noise.
    wiki_entity_min_mentions: int = ConfigField(default=3, ge=1, writable=True)

    # Maximum chunks passed into each concept or entity page generation
    # call. Caps context size so one page does not blow the context
    # window on a prolific topic.
    wiki_concept_max_chunks_per_page: int = ConfigField(default=25, ge=1, writable=True)

    # Maximum number of related concepts the model is asked to list in
    # the `## Related` section of each page.
    wiki_related_max: int = ConfigField(default=8, ge=0, writable=True)

    # Auto-update cap: if a single sync touches more than this many
    # concept or entity pages, skip the per-slug regeneration and tell
    # the user to run `lilbee wiki update` explicitly. Keeps a surprise
    # bulk import from firing hundreds of LLM calls.
    wiki_ingest_update_cap: int = ConfigField(default=20, ge=1, writable=True)

    # Whether the per-source batched call asks the LLM to curate
    # concept pages alongside the pre-extracted entity list. False →
    # entity sections only, no concept curation (incremental ingest
    # path uses this to avoid churning concept slugs per source-touch).
    wiki_extract_concepts: bool = ConfigField(default=True, writable=True)

    # Minimum chunk count a source must contribute before it is eligible
    # for concept curation. Sources below the floor still get a batched
    # call when they have entities (the prompt writes entity-only
    # sections); sources below the floor with zero entities are skipped
    # entirely. Prevents boilerplate / TOC / appendix documents from
    # burning an LLM call to invent "concepts".
    wiki_batch_min_chunks: int = ConfigField(default=3, ge=1, writable=True)
    # Prompt template for the per-source batched call. Placeholders:
    # {source}, {entity_list}, {chunks_text}, {concept_instruction}.
    # {concept_instruction} is filled with a concept-curation paragraph
    # when concepts are requested, or the empty string otherwise.
    # Doubled braces ({{...}}) are literal-brace escapes for .format().
    wiki_entity_batch_prompt: str = ConfigField(
        writable=True,
        default=(
            "You are writing wiki sections based on these chunks from {source}.\n\n"
            "{concept_instruction}"
            "Write a wiki section for each of these NER ENTITIES: {entity_list}\n\n"
            "Format each section exactly as:\n"
            "## Name\n"
            "{{content with [^src1]-style citations}}\n\n"
            "Rules:\n"
            "1. Every factual claim MUST have an inline citation [^src1], [^src2], etc.\n"
            "2. Cite the EXACT text from the source that supports each claim by quoting it.\n"
            "3. For interpretations or connections not directly stated, mark with [*inference*].\n"
            "4. Use blockquotes (>) for directly cited facts.\n"
            "5. End the response with a citation block in this format:\n\n"
            "---\n"
            "<!-- citations (auto-generated from _citations table -- do not edit) -->\n"
            '[^src1]: {{source_name}}, excerpt: "exact quoted text"\n'
            '[^src2]: {{source_name}}, excerpt: "exact quoted text"\n\n'
            "Source chunks:\n{chunks_text}\n"
        ),
    )
788 # Class variable — not a settings field
789 _toml_cache: ClassVar[dict[str, Any]] = {}
791 @field_validator(
792 "temperature",
793 "top_p",
794 "repeat_penalty",
795 "top_k_sampling",
796 "num_ctx",
797 "seed",
798 mode="before",
799 )
800 @classmethod
801 def _empty_string_to_none(cls, v: Any) -> Any:
802 if isinstance(v, str) and v.strip() == "":
803 return None
804 return v
806 @field_validator("enable_ocr", mode="before")
807 @classmethod
808 def _parse_enable_ocr(cls, v: Any) -> bool | None:
809 """Parse enable_ocr from env var string or direct value.
811 Accepts: true/false/1/0/yes/no (case-insensitive), empty string
812 or None for auto-detect.
813 """
814 if v is None:
815 return None
816 if isinstance(v, bool):
817 return v
818 if isinstance(v, str):
819 if v.strip().lower() in ("", "auto", "none"):
820 return None
821 try:
822 return _parse_bool(v)
823 except ValueError:
824 pass
825 return bool(v)
827 @field_validator("flash_attention", mode="before")
828 @classmethod
829 def _parse_flash_attention(cls, v: Any) -> bool | None:
830 """Auto/on/off tri-state: empty/auto/none -> None, else parse bool."""
831 if v is None:
832 return None
833 if isinstance(v, bool):
834 return v
835 if isinstance(v, str):
836 if v.strip().lower() in ("", "auto", "none"):
837 return None
838 try:
839 return _parse_bool(v)
840 except ValueError:
841 return None
842 return bool(v)
844 @field_validator("n_gpu_layers", mode="before")
845 @classmethod
846 def _parse_n_gpu_layers(cls, v: Any) -> int | None:
847 """Auto -> None, ``cpu`` alias -> 0, integers parsed verbatim."""
848 if v is None:
849 return None
850 if isinstance(v, str):
851 label = v.strip().lower()
852 if label in ("", "auto", "none"):
853 return None
854 if label == "cpu":
855 return 0
856 try:
857 return int(label)
858 except ValueError:
859 log.warning("Invalid LILBEE_N_GPU_LAYERS=%r, using auto", v)
860 return None
861 return int(v)
863 @field_validator("semantic_chunking", mode="before")
864 @classmethod
865 def _parse_semantic_chunking(cls, v: Any) -> bool:
866 """Parse from env string; invalid values warn and fall back to False."""
867 if isinstance(v, bool):
868 return v
869 if isinstance(v, str):
870 try:
871 return _parse_bool(v)
872 except ValueError:
873 log.warning("Invalid LILBEE_SEMANTIC_CHUNKING=%r, using default False", v)
874 return False
875 return bool(v)
877 @field_validator(
878 "chat_model", "embedding_model", "vision_model", "reranker_model", mode="after"
879 )
880 @classmethod
881 def _normalize_model_tag(cls, v: str, info: ValidationInfo) -> str:
882 """Validate and canonicalize a model ref; blank clears optional roles."""
883 if not v or not v.strip():
884 if info.field_name in {"chat_model", "embedding_model"}:
885 raise ValueError(f"{info.field_name} must not be blank")
886 return ""
887 from lilbee.providers.model_ref import parse_model_ref
889 return parse_model_ref(v).for_openai_prefix()
891 @field_validator("cors_origins", mode="before")
892 @classmethod
893 def _split_cors_origins(cls, v: Any) -> Any:
894 if isinstance(v, str):
895 return [o.strip() for o in v.split(",") if o.strip()]
896 return v
898 @field_validator("crawl_exclude_patterns", mode="before")
899 @classmethod
900 def _split_crawl_exclude_patterns(cls, v: Any) -> Any:
901 """Accept newline-separated strings from env vars / plain-text config.
903 Regex commonly uses commas (e.g. `{2,4}`) and pipes (alternation), so
904 newline is the only separator safe to use for this field. TOML lists
905 and JSON arrays pass through unchanged.
906 """
907 if isinstance(v, str):
908 return [p.strip() for p in v.splitlines() if p.strip()]
909 return v
911 @field_validator("crawl_exclude_patterns", mode="after")
912 @classmethod
913 def _validate_crawl_exclude_patterns(cls, v: list[str]) -> list[str]:
914 """Reject any entry that isn't a valid Python regex.
916 These patterns are compiled at crawl time. An invalid pattern there
917 surfaces as an opaque mid-crawl error; catching it at PATCH time gives
918 the user a 400 with a pointer to the bad entry.
919 """
920 import re
922 bad: list[str] = []
923 for i, pattern in enumerate(v):
924 try:
925 re.compile(pattern)
926 except re.error as exc:
927 bad.append(f"[{i}] {pattern!r}: {exc}")
928 if bad:
929 raise ValueError("invalid regex in crawl_exclude_patterns:\n " + "\n ".join(bad))
930 return v
932 @field_validator("ignore_dirs", mode="before")
933 @classmethod
934 def _merge_ignore_dirs(cls, v: Any) -> frozenset[str]:
935 if isinstance(v, str):
936 extra = frozenset(name.strip() for name in v.split(",") if name.strip())
937 return DEFAULT_IGNORE_DIRS | extra
938 if isinstance(v, (set, frozenset, list)):
939 return DEFAULT_IGNORE_DIRS | frozenset(v)
940 return DEFAULT_IGNORE_DIRS
942 @field_validator("concept_allowed_ent_types", mode="before")
943 @classmethod
944 def _parse_ent_types(cls, v: Any) -> frozenset[str]:
945 """Replace-semantics override: a narrowed set is used as-is,
946 not unioned with defaults. A user asking for ``PERSON,ORG``
947 wants exactly those kinds. Accepts comma-separated strings
948 from env and list / set / frozenset from code. Empty input
949 falls back to :data:`DEFAULT_ALLOWED_NER_LABELS` so an empty
950 env var does not silently disable the gate.
951 """
952 if isinstance(v, str):
953 parts = frozenset(name.strip().upper() for name in v.split(",") if name.strip())
954 return parts or DEFAULT_ALLOWED_NER_LABELS
955 if isinstance(v, (set, frozenset, list)):
956 parts = frozenset(str(x).upper() for x in v)
957 return parts or DEFAULT_ALLOWED_NER_LABELS
958 return DEFAULT_ALLOWED_NER_LABELS
960 @model_validator(mode="before")
961 @classmethod
962 def _resolve_defaults(cls, data: Any) -> Any:
963 from lilbee.platform import canonical_models_dir, default_data_dir, find_local_root
965 if not isinstance(data, dict): # pragma: no cover
966 return data
968 _UNSET = Path()
970 if data.get("data_root") in (None, _UNSET):
971 data_env = os.environ.get("LILBEE_DATA", "").strip()
972 if data_env:
973 data["data_root"] = Path(data_env)
974 else:
975 local = find_local_root()
976 data["data_root"] = local if local is not None else default_data_dir()
977 root = data["data_root"]
978 if data.get("documents_dir") in (None, _UNSET):
979 data["documents_dir"] = root / "documents"
980 if data.get("data_dir") in (None, _UNSET):
981 data["data_dir"] = root / "data"
982 if data.get("lancedb_dir") in (None, _UNSET):
983 data["lancedb_dir"] = root / "data" / "lancedb"
984 if data.get("models_dir") in (None, _UNSET):
985 data["models_dir"] = canonical_models_dir()
987 return data
989 @classmethod
990 def settings_customise_sources(
991 cls,
992 settings_cls: type[BaseSettings],
993 init_settings: Any,
994 env_settings: Any,
995 dotenv_settings: Any,
996 file_secret_settings: Any,
997 ) -> tuple[Any, ...]:
998 from lilbee.platform import default_data_dir, find_local_root
1000 data_env = os.environ.get("LILBEE_DATA", "")
1001 if data_env:
1002 toml_dir = Path(data_env)
1003 else:
1004 local = find_local_root()
1005 toml_dir = local if local else default_data_dir()
1006 toml_path = toml_dir / "config.toml"
1008 plain_env = _PlainEnvSource(settings_cls, env_prefix="LILBEE_", env_ignore_empty=True)
1009 sources: list[Any] = [init_settings, plain_env]
1010 if toml_path.exists() and os.environ.get("LILBEE_SKIP_TOML_CONFIG") != "1":
1011 sources.append(_TomlSource(settings_cls, toml_path))
1012 return tuple(sources)
1014 @property
1015 def model_defaults(self) -> Any:
1016 """Per-model generation defaults (read-only). Set via apply_model_defaults()."""
1017 return self._model_defaults
1019 def apply_model_defaults(self, defaults: Any) -> None:
1020 """Store per-model generation defaults for 3-layer merge."""
1021 object.__setattr__(self, "_model_defaults", defaults)
1023 def clear_model_defaults(self) -> None:
1024 """Reset per-model defaults to None."""
1025 object.__setattr__(self, "_model_defaults", None)
1027 def generation_options(self, **overrides: Any) -> dict[str, Any]:
1028 """Merge model defaults, user config, and per-call overrides, dropping None."""
1029 result = _model_defaults_dict(self._model_defaults)
1030 user_fields: dict[str, Any] = {
1031 "temperature": self.temperature,
1032 "top_p": self.top_p,
1033 "top_k": self.top_k_sampling,
1034 "repeat_penalty": self.repeat_penalty,
1035 "num_ctx": self.num_ctx,
1036 "seed": self.seed,
1037 "max_tokens": self.max_tokens,
1038 }
1039 for k, v in user_fields.items():
1040 if v is not None:
1041 result[k] = v
1042 for k, v in overrides.items():
1043 if v is not None:
1044 result[k] = v
1045 return result
def _model_defaults_dict(defaults: Any) -> dict[str, Any]:
    """Return the non-None fields of a ModelDefaults instance as a dict.

    None input (no defaults applied) yields an empty dict.
    """
    if defaults is None:
        return {}
    from dataclasses import fields as dc_fields

    result: dict[str, Any] = {}
    for field in dc_fields(defaults):
        value = getattr(defaults, field.name)
        if value is not None:
            result[field.name] = value
    return result
class _PlainEnvSource:
    """Reads LILBEE_* env vars as plain strings so field validators handle parsing."""

    def __init__(
        self,
        settings_cls: type[BaseSettings],
        env_prefix: str,
        env_ignore_empty: bool = True,
    ) -> None:
        # Only names that are actual model fields are ever looked up.
        self._prefix = env_prefix
        self._ignore_empty = env_ignore_empty
        self._fields = set(settings_cls.model_fields)

    def __call__(self) -> dict[str, Any]:
        collected: dict[str, Any] = {}
        for name in self._fields:
            raw = os.environ.get(f"{self._prefix}{name.upper()}")
            if raw is None:
                continue
            if self._ignore_empty and raw == "":
                continue
            collected[name] = raw
        return collected
class _TomlSource:
    """Custom pydantic-settings source that reads config.toml.

    Scalar values are stringified so they take the same before-validator
    parsing path as env vars. Lists and tables are passed through
    unchanged: the crawl_exclude_patterns validator documents that "TOML
    lists and JSON arrays pass through unchanged", and str([...]) would
    collapse a list into a single bogus entry like "['a', 'b']".

    Read failures (missing/corrupt file) are non-fatal: warn and return
    an empty mapping so the remaining sources still apply.
    """

    def __init__(self, settings_cls: type[BaseSettings], path: Path) -> None:
        self._path = path

    def __call__(self) -> dict[str, Any]:
        import tomllib

        try:
            with self._path.open("rb") as f:
                data = tomllib.load(f)
        except (ValueError, OSError):
            # tomllib.TOMLDecodeError subclasses ValueError.
            log.warning("Failed to read %s, ignoring", self._path)
            return {}
        return {
            k: v if isinstance(v, (list, dict)) else str(v)
            for k, v in data.items()
        }
def _build_cfg() -> tuple[Config, Exception | None]:
    """Build cfg; on stale-config validation failure, fall back to defaults.

    A persisted ``config.toml`` from before a breaking schema change can
    contain values the new validators reject. Crashing at module import
    means every command (``lilbee --help`` included) emits a Python
    traceback. Falling back to env+defaults lets the package load; the
    CLI / TUI surfaces the original error before doing real work.

    Returns:
        (config, error) — error is None on a clean load, otherwise the
        exception from the first attempt while config is built with the
        TOML source disabled.
    """
    try:
        return Config(), None
    except Exception as exc:
        # Retry with the TOML source disabled. Preserve any value the
        # user had already set for LILBEE_SKIP_TOML_CONFIG instead of
        # unconditionally popping it afterwards.
        prior = os.environ.get("LILBEE_SKIP_TOML_CONFIG")
        os.environ["LILBEE_SKIP_TOML_CONFIG"] = "1"
        try:
            return Config(), exc
        finally:
            if prior is None:
                os.environ.pop("LILBEE_SKIP_TOML_CONFIG", None)
            else:
                os.environ["LILBEE_SKIP_TOML_CONFIG"] = prior
# Module-level singletons: the process-wide Config instance and the deferred
# load error (None on a clean load), surfaced later by the CLI / TUI.
cfg, config_load_error = _build_cfg()