Coverage for src/lilbee/wiki/gen.py: 100%
706 statements
coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Wiki page generation — LLM-driven synthesis with citation provenance.
3Generates summary pages (1:1 with sources) and synthesis pages (cross-source,
4concept-graph-driven) from raw chunks. Each page carries inline citations
5([^srcN]) for facts and [*inference*] markers for LLM synthesis. The
6_citations table is the source of truth; markdown footnotes are rendered from it.
7"""
9from __future__ import annotations
11import difflib
12import functools
13import hashlib
14import logging
15import re
16from collections.abc import Callable
17from datetime import UTC, datetime
18from pathlib import Path
19from typing import TypedDict, cast
21import numpy as np
22import yaml
24from lilbee.chunk import chunk_text
25from lilbee.clustering import SourceClusterer
26from lilbee.config import CHUNKS_TABLE, DEFAULT_NUM_CTX, Config, cfg
27from lilbee.ingest import file_hash
28from lilbee.providers.base import LLMProvider
29from lilbee.reasoning import strip_reasoning
30from lilbee.services import get_services
31from lilbee.store import (
32 CHUNK_TYPE_WIKI,
33 CitationRecord,
34 SearchChunk,
35 Store,
36 escape_sql_string,
37)
38from lilbee.wiki.citation import (
39 ParsedCitation,
40 extract_body,
41 parse_wiki_citations,
42 render_citation_block,
43 strip_citation_block,
44)
45from lilbee.wiki.entity_extractor import EntityKind, ExtractedEntity
46from lilbee.wiki.index import append_wiki_log, update_wiki_index
47from lilbee.wiki.links import apply_rewriter, compile_rewriter
48from lilbee.wiki.shared import (
49 ARCHIVE_SUBDIR,
50 CONCEPTS_SUBDIR,
51 DRAFTS_SUBDIR,
52 ENTITIES_SUBDIR,
53 MIN_CLUSTER_SOURCES,
54 PENDING_KIND_PARSE,
55 PENDING_MARKER_KEYWORD_COLLISION,
56 PENDING_MARKER_KEYWORD_PARSE,
57 SUMMARIES_SUBDIR,
58 SYNTHESIS_SUBDIR,
59 WIKI_CONTENT_SUBDIRS,
60 WIKI_LOG_ACTION_GENERATED,
61 PageTarget,
62 clean_label_for_display,
63 is_valid_label,
64 make_slug,
65 parse_frontmatter,
66)
68log = logging.getLogger(__name__)
70WikiProgressCallback = Callable[[str, dict[str, object]], None]
71"""Callback for wiki generation progress: (stage, data) -> None."""
73_MAX_DIFF_PREVIEW_LINES = 20 # lines of unified diff shown in drift warnings
76# Fraction of context window reserved for chunks. The remainder leaves
77# room for the system/user prompt template and generation output.
78_CONTEXT_BUDGET_FRACTION = 0.75
80# Approximate characters per token for budget estimation. 4 chars/token
81# is a widely used heuristic for English text.
82_CHARS_PER_TOKEN = 4
84# Directive recognized by chat templates that support a reasoning mode
85# (Qwen3, DeepSeek-R1, etc.). Wiki generation is a summarization task
86# where chain-of-thought adds wall-clock cost without improving output,
87# so we suppress it whenever the provider reports the capability.
88_NO_THINK_DIRECTIVE = "/no_think"
90# Capability string returned by llama-cpp providers for reasoning models
91# (Qwen3, DeepSeek-R1). Defined locally so gen.py doesn't depend on a
92# specific provider-layer constant name.
93_CAPABILITY_THINKING = "thinking"
95# JSON-style escape sequences that may appear inside quoted excerpts the
96# model emits. Any backslash-prefixed character not in this map stays
97# verbatim (e.g. ``\\x`` passes through unchanged).
98_EXCERPT_ESCAPES: dict[str, str] = {"n": "\n", "t": "\t", '"': '"', "\\": "\\"}
101def _build_wiki_messages(
102 prompt: str, provider: LLMProvider, config: Config
103) -> list[dict[str, str]]:
104 """Build the chat messages list for a wiki-gen call.
106 When the provider reports the ``thinking`` capability for the active
107 chat model, prepends ``/no_think`` so the chat template disables the
108 reasoning mode. Otherwise the prompt passes through unchanged.
109 """
110 capabilities = provider.get_capabilities(config.chat_model)
111 if _CAPABILITY_THINKING in capabilities:
112 prompt = f"{_NO_THINK_DIRECTIVE}\n\n{prompt}"
113 return [{"role": "user", "content": prompt}]
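# Illustrative call (hypothetical prompt text; behaviour follows the code
# above): when the provider reports the "thinking" capability for the chat
# model, the returned message list is
#   [{"role": "user", "content": "/no_think\n\nSummarize the brake chapter."}]
# and otherwise the prompt is passed through verbatim.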
116def _truncate_chunks_to_budget(
117 chunks: list[SearchChunk],
118 config: Config,
119) -> list[SearchChunk]:
120 """Drop trailing chunks so the total text fits within the model's context budget.
122 Uses a chars/4 heuristic for token estimation. Returns the original list
123 unchanged when all chunks fit.
124 """
125 context_window = config.num_ctx or DEFAULT_NUM_CTX
126 budget_tokens = int(context_window * _CONTEXT_BUDGET_FRACTION)
127 budget_chars = budget_tokens * _CHARS_PER_TOKEN
129 total_chars = 0
130 kept: list[SearchChunk] = []
131 for chunk in chunks:
132 chunk_chars = len(chunk.chunk)
133 if total_chars + chunk_chars > budget_chars and kept:
134 break
135 kept.append(chunk)
136 total_chars += chunk_chars
138 if len(kept) < len(chunks):
139 log.warning(
140 "Truncated chunks from %d to %d to fit context window (%d tokens)",
141 len(chunks),
142 len(kept),
143 context_window,
144 )
145 return kept
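# Worked budget example (hypothetical context size): with num_ctx=8192 the
# budget is int(8192 * 0.75) = 6144 tokens, i.e. 6144 * 4 = 24576 characters
# of chunk text. Chunks past that point are dropped, but the first chunk is
# always kept even if it alone exceeds the budget (the "and kept" guard above).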
148def _group_chunks_by_page(
149 chunks: list[SearchChunk],
150) -> list[tuple[int, list[SearchChunk]]]:
151 """Group chunks by ``page_start``, preserving in-document order within a page.
153 Returns ``(page_start, chunks)`` tuples sorted ascending by page number.
154 Chunks with ``page_start=0`` (non-paginated sources) collapse to a single
155 entry keyed at 0, so a markdown or code source still emits exactly one
156 summary file until structure detection arrives in a later stage.
157 """
158 grouped: dict[int, list[SearchChunk]] = {}
159 for chunk in chunks:
160 grouped.setdefault(chunk.page_start, []).append(chunk)
161 return sorted(grouped.items())
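# Example grouping (hypothetical chunks a-d with page_start 3, 3, 1, 0):
#   [(0, [d]), (1, [c]), (3, [a, b])]
# ascending by page number, in-document order kept within each page, and
# every page_start=0 chunk collapsed into the single key-0 entry.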
164def _leaf_hash(chunks: list[SearchChunk]) -> str:
165 """SHA-256 over concatenated chunk content (null-separated, in given order).
167 Acts as the cache key for incremental rebuild: an existing page whose
168 frontmatter ``leaf_hash`` matches this value has already summarized the
169 exact same input and can be reused without a new LLM call.
170 """
171 h = hashlib.sha256()
172 for chunk in chunks:
173 h.update(chunk.chunk.encode("utf-8"))
174 h.update(b"\0")
175 return h.hexdigest()
178def _find_cached_leaf(wiki_root: Path, slug: str, leaf_hash: str) -> Path | None:
179 """Return an existing page whose ``leaf_hash`` frontmatter matches, or ``None``.
181 Checks both ``summaries/`` and ``drafts/`` so an unchanged draft stays in
182 drafts rather than triggering a speculative regeneration.
183 """
184 for subdir in (SUMMARIES_SUBDIR, DRAFTS_SUBDIR):
185 candidate = wiki_root / subdir / f"{slug}.md"
186 if not candidate.is_file():
187 continue
188 fm = parse_frontmatter(candidate.read_text(encoding="utf-8"))
189 if fm.get("leaf_hash") == leaf_hash:
190 return candidate
191 return None
194def _chunks_to_text(chunks: list[SearchChunk]) -> str:
195 """Format chunks as numbered text blocks for the LLM prompt."""
196 parts: list[str] = []
197 for i, chunk in enumerate(chunks):
198 location = ""
199 if chunk.page_start:
200 location = f" (page {chunk.page_start})"
201 elif chunk.line_start:
202 location = f" (lines {chunk.line_start}-{chunk.line_end})"
203 parts.append(f"[Chunk {i + 1}]{location}:\n{chunk.chunk}")
204 return "\n\n".join(parts)
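# Example prompt block (hypothetical chunk text), one paginated and one
# line-addressed chunk:
#   [Chunk 1] (page 4):
#   Brake fluid must be replaced every two years.
#
#   [Chunk 2] (lines 10-18):
#   def bleed_brakes(): ...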
207def _extract_excerpt(source_ref: str) -> str:
208 """Extract the quoted excerpt from a citation source_ref string.
209 e.g. 'doc.md, excerpt: "Python supports typing."' → 'Python supports typing.'
211 Common JSON-style escape sequences inside the quoted span (``\\n``,
212 ``\\t``, ``\\"``, ``\\\\``) are decoded to their literal characters so
213 they round-trip against the source text. Some models "helpfully"
214 encode real newlines as ``\\n`` when emitting a quoted excerpt; the
215 source chunk they came from has real newlines, so skipping this
216 step leaves otherwise-faithful citations unverifiable.
217 """
218 marker = 'excerpt: "'
219 idx = source_ref.find(marker)
220 if idx == -1:
221 return ""
222 start = idx + len(marker)
223 end = source_ref.find('"', start)
224 raw = source_ref[start:].strip() if end == -1 else source_ref[start:end].strip()
225 return _decode_excerpt_escapes(raw)
228def _decode_excerpt_escapes(raw: str) -> str:
229 """Decode the JSON-style escapes models commonly emit inside quoted strings."""
230 if "\\" not in raw:
231 return raw
232 result: list[str] = []
233 i = 0
234 while i < len(raw):
235 ch = raw[i]
236 mapped = _EXCERPT_ESCAPES.get(raw[i + 1]) if ch == "\\" and i + 1 < len(raw) else None
237 if mapped is not None:
238 result.append(mapped)
239 i += 2
240 else:
241 result.append(ch)
242 i += 1
243 return "".join(result)
246def _find_excerpt_location(
247 excerpt: str,
248 chunks: list[SearchChunk],
249) -> tuple[int, int, int, int]:
250 """Find page/line location of an excerpt within chunks."""
251 if excerpt:
252 for chunk in chunks:
253 if excerpt in chunk.chunk:
254 return chunk.page_start, chunk.page_end, chunk.line_start, chunk.line_end
255 return 0, 0, 0, 0
258def _build_citation_record(
259 citation_key: str,
260 excerpt: str,
261 source_filename: str,
262 source_hash: str,
263 page_start: int,
264 page_end: int,
265 line_start: int,
266 line_end: int,
267 created_at: str,
268) -> CitationRecord:
269 """Build a single CitationRecord with consistent defaults."""
270 return CitationRecord(
271 wiki_source="", # filled by caller
272 wiki_chunk_index=0,
273 citation_key=citation_key,
274 claim_type="fact" if excerpt else "inference",
275 source_filename=source_filename,
276 source_hash=source_hash,
277 page_start=page_start,
278 page_end=page_end,
279 line_start=line_start,
280 line_end=line_end,
281 excerpt=excerpt,
282 created_at=created_at,
283 )
286def _resolve_citations(
287 parsed_citations: list[ParsedCitation],
288 source_name: str,
289 source_hash: str,
290 chunks: list[SearchChunk],
291) -> list[CitationRecord]:
292 """Resolve parsed citation refs to CitationRecord objects.
293 Searches for each citation's excerpt in the source chunks to find
294 the best matching location (page/line numbers).
295 """
296 records: list[CitationRecord] = []
297 now = datetime.now(UTC).isoformat()
299 for parsed in parsed_citations:
300 excerpt = _extract_excerpt(parsed.source_ref)
301 page_start, page_end, line_start, line_end = _find_excerpt_location(excerpt, chunks)
302 records.append(
303 _build_citation_record(
304 parsed.citation_key,
305 excerpt,
306 source_name,
307 source_hash,
308 page_start,
309 page_end,
310 line_start,
311 line_end,
312 now,
313 )
314 )
315 return records
318def _content_change_ratio(old_text: str, new_text: str) -> float:
319 """Fraction of lines that changed between two texts (0.0 = identical, 1.0 = total rewrite)."""
320 old_lines = old_text.splitlines()
321 new_lines = new_text.splitlines()
322 if not old_lines and not new_lines:
323 return 0.0
324 total = max(len(old_lines), len(new_lines))
325 matcher = difflib.SequenceMatcher(None, old_lines, new_lines)
326 changed = total - sum(block.size for block in matcher.get_matching_blocks())
327 return changed / total
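# Worked example (hypothetical four-line page): old lines [a, b, c, d] and
# new lines [a, x, c, d] share three matching lines, so the change ratio is
# (4 - 3) / 4 = 0.25; identical texts score 0.0 and a full rewrite
# approaches 1.0.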
330def _diff_summary(old_text: str, new_text: str) -> str:
331 """Human-readable unified diff summary (first 20 diff lines)."""
332 diff = difflib.unified_diff(
333 old_text.splitlines(),
334 new_text.splitlines(),
335 lineterm="",
336 fromfile="old",
337 tofile="new",
338 )
339 lines = list(diff)
340 if len(lines) > _MAX_DIFF_PREVIEW_LINES:
341 extra = len(lines) - _MAX_DIFF_PREVIEW_LINES
342 return "\n".join(lines[:_MAX_DIFF_PREVIEW_LINES]) + f"\n... ({extra} more lines)"
343 return "\n".join(lines)
346def _divert_to_drafts(
347 new_content: str,
348 drafts_dir: Path,
349 slug: str,
350 change_ratio: float,
351 diff_text: str,
352) -> Path:
353 """Write new content to wiki/drafts/ with a drift note instead of overwriting."""
354 draft_path = drafts_dir / f"{slug}.md"
355 draft_path.parent.mkdir(parents=True, exist_ok=True)
356 note = f"<!-- DRIFT: {change_ratio:.0%} content changed - flagged for human review -->\n\n"
357 draft_path.write_text(note + new_content, encoding="utf-8")
358 log.warning(
359 "Drift detected for %s (%.0f%% changed), diverted to drafts. Diff:\n%s",
360 slug,
361 change_ratio * 100,
362 diff_text,
363 )
364 return draft_path
367_WHITESPACE_RE = re.compile(r"\s+")
370def _normalize_whitespace(text: str) -> str:
371 """Collapse runs of whitespace to a single space and strip the edges.
373 PDF extractors preserve line breaks mid-sentence (``vehicle,\\nthe greater``)
374 while LLMs paraphrase the same quote as a single-spaced string
375 (``vehicle, the greater``). A strict substring check rejects a faithful
376 citation on whitespace alone, so both sides are normalized before
377 comparison.
378 """
379 return _WHITESPACE_RE.sub(" ", text).strip()
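# Example (hypothetical PDF extract): "vehicle,\nthe   greater" from the
# extractor and the LLM's "vehicle, the greater" both normalize to
# "vehicle, the greater", so the substring check in _verify_citations
# compares like with like.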
382def _verify_citations(
383 citation_records: list[CitationRecord],
384 chunks: list[SearchChunk],
385 label: str,
386 config: Config,
387) -> list[CitationRecord]:
388 """Filter citation records, keeping only those whose excerpts are in the chunks."""
389 wiki_prefix = config.wiki_dir + "/"
390 all_chunk_text = _normalize_whitespace(" ".join(c.chunk for c in chunks))
391 verified: list[CitationRecord] = []
392 for rec in citation_records:
393 if rec["source_filename"].startswith(wiki_prefix):
394 log.debug("Skipping wiki-sourced citation %s", rec["citation_key"])
395 continue
396 if rec["claim_type"] == "inference" or not rec["excerpt"]:
397 verified.append(rec)
398 continue
399 if _normalize_whitespace(rec["excerpt"]) in all_chunk_text:
400 verified.append(rec)
401 else:
402 log.debug("Citation %s excerpt not found in %s, dropping", rec["citation_key"], label)
403 return verified
406def _title_content_coherence(wiki_text: str, label: str) -> bool:
407 """Deterministic pre-check: title and body must reference the concept.
409 The LLM faithfulness score evaluates whether the prose reflects
410 the source chunks but does not penalize structural noise in the
411 title (bb-8b7s: ``| | designer`` passed at 0.90 because the body
412 was coherent). This pre-check asserts three invariants:
414 1. The first ``# `` heading must be a structurally valid label per
415 :func:`is_valid_label`. A heading like ``| | designer`` fails
416 the structural-char gate even though it contains the cleaned
417 display name as a substring.
418 2. The cleaned display name must appear in the heading as a
419 case-insensitive substring. Covers LLM drift where the
420 heading names a different concept than requested.
421 3. The body must mention the display name at least once outside
422 the heading. Covers the "LLM talked about something adjacent
423 but never named the concept" regression.
425 Returns True when all three hold, False otherwise.
426 """
427 display = clean_label_for_display(label).lower()
428 if not display:
429 return False
430 heading: str | None = None
431 body_parts: list[str] = []
432 for line in wiki_text.splitlines():
433 if heading is None and line.startswith("# "):
434 heading = line[2:].strip()
435 continue
436 body_parts.append(line)
437 if heading is None:
438 return False
439 if not is_valid_label(heading):
440 return False
441 if display not in heading.lower():
442 return False
443 body = "\n".join(body_parts).lower()
444 return display in body
447def _mean_vector(vectors: list[list[float]]) -> list[float]:
448 """Compute the element-wise mean of a non-empty vector list.
450 Empty input returns an empty list; callers must check before any
451 downstream dot-product so we do not leak a shape mismatch.
453 Routes through numpy so the inner loop runs in C: for the typical
454 ``D=768``, ``N=10`` case this cuts per-call cost from ~8k Python
455 ops to a single SIMD-backed reduction.
456 """
457 if not vectors:
458 return []
459 result: list[float] = np.asarray(vectors, dtype=np.float32).mean(axis=0).tolist()
460 return result
463def _embedding_faithfulness_score(
464 body_vec: list[float],
465 source_vectors: list[list[float]],
466) -> float:
467 """Cosine-similarity score between the body and the mean source vector.
469 Assumes L2-normalized vectors (both the embedder and the store
470 return normalized vectors); cosine reduces to a dot product.
471 Falls through to :func:`cosine_sim` so a non-normalized vector
472 does not silently produce an out-of-range value. Result is
473 clamped at zero because a negative cosine means the body vector
474 points the other way from the mean of the sources — treat that
475 the same as uncorrelated for threshold purposes.
477 Returns 0.0 on a dimension mismatch between the body vector and
478 the source-vector mean. That is not expected in production (the
479 embedder and the chunk vectors come from the same model), but a
480 stub-driven test may hand in off-shape vectors and crashing the
481 whole pipeline on the shape-check hides the real assertion.
482 """
483 from lilbee.store import cosine_sim
485 mean_vec = _mean_vector(source_vectors)
486 if not mean_vec or not body_vec:
487 return 0.0
488 if len(mean_vec) != len(body_vec):
489 log.warning(
490 "Body vector dim %d does not match source vector dim %d; scoring 0.0",
491 len(body_vec),
492 len(mean_vec),
493 )
494 return 0.0
495 return max(0.0, cosine_sim(body_vec, mean_vec))
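# Toy example (hypothetical 2-d vectors; production vectors are ~768-d and
# cosine_sim is assumed to return a true cosine, as noted above): with
# source_vectors [[1.0, 0.0], [0.0, 1.0]] the mean is [0.5, 0.5]; a body
# vector of [1.0, 0.0] scores 0.5 / 0.7071 ~= 0.71, while a body vector
# pointing away from the mean is clamped to 0.0.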
498def _check_faithfulness(
499 chunks: list[SearchChunk],
500 wiki_text: str,
501 label: str,
502 config: Config | None = None,
503) -> float:
504 """Score the wiki body's similarity to its source chunks, 0.0 on failure.
506 Phase D: replaces the LLM-based faithfulness call with a
507 deterministic cosine-similarity score between the page body and
508 the mean of its source chunk vectors. The B3 title/body coherence
509 pre-check still runs first as a hard gate: a garbage H1 returns
510 0.0 regardless of embedding similarity, so structurally broken
511 pages route to drafts even when the prose happens to be coherent.
513 ``chunks`` carries ``.vector`` populated by LanceDB (see
514 ``SearchChunk`` in ``store.py``), so no extra embedder call is
515 needed for the source side. The body is embedded once via the
516 shared services embedder. Any exception in the embedder (model
517 missing, network issue, invalid config) is caught and reported as
518 0.0 so a single faulty page drops to drafts instead of aborting
519 the whole build.
520 """
521 if not _title_content_coherence(wiki_text, label):
522 log.info(
523 "Faithfulness title/body coherence failed for %r; scoring 0.0",
524 label,
525 )
526 return 0.0
527 source_vectors = [c.vector for c in chunks if c.vector]
528 if not source_vectors:
529 log.warning("No source vectors for %s; scoring 0.0", label)
530 return 0.0
532 # Strip the frontmatter + citation block so we embed only the body
533 # prose. render_citation_block may not have run yet when the score
534 # is computed (it is appended later), but strip_citation_block is
535 # idempotent on missing trailers.
536 body_text = strip_citation_block(wiki_text).strip()
537 if not body_text:
538 log.warning("Empty body for %s; scoring 0.0", label)
539 return 0.0
541 try:
542 body_vectors = get_services().embedder.embed_batch([body_text])
543 except Exception as exc:
544 log.warning("Body embedding failed for %s: %s", label, exc)
545 return 0.0
546 if not body_vectors:
547 return 0.0
548 return _embedding_faithfulness_score(body_vectors[0], source_vectors)
551def _build_frontmatter(
552 config: Config,
553 source_names: list[str],
554 score: float,
555 leaf_hash: str = "",
556 chunks: list[SearchChunk] | None = None,
557) -> str:
558 """Build YAML frontmatter for a wiki page.
560 When ``leaf_hash`` is non-empty it is written so incremental rebuild
561 can skip regeneration on a subsequent sync whose chunks produce the
562 same hash. When ``chunks`` is provided the frontmatter carries a
563 ``provenance`` block naming the source/chunk-index pairs that fed
564 the generator and the extraction method from config, so a bad page
565 is auditable without re-running the pipeline.
566 """
567 sources_yaml = ", ".join(f'"{s}"' for s in sorted(source_names))
568 hash_line = f"leaf_hash: {leaf_hash}\n" if leaf_hash else ""
569 provenance_block = _render_provenance(config, chunks) if chunks is not None else ""
570 return (
571 f"---\n"
572 f"generated_by: {config.chat_model}\n"
573 f"generated_at: {datetime.now(UTC).isoformat()}\n"
574 f"sources: [{sources_yaml}]\n"
575 f"faithfulness_score: {score:.2f}\n"
576 f"{hash_line}"
577 f"{provenance_block}"
578 f"---\n\n"
579 )
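# Example frontmatter (hypothetical model name, extraction mode, hash, and
# score; layout follows the f-string above plus _render_provenance below):
#   ---
#   generated_by: qwen3-8b
#   generated_at: 2026-04-29T19:16:00+00:00
#   sources: ["manual.pdf"]
#   faithfulness_score: 0.87
#   leaf_hash: 3f2a9c...
#   provenance:
#     extraction_method: ner
#     chunks:
#     - source: manual.pdf
#       chunk_index: 0
#   ---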
582def _render_provenance(config: Config, chunks: list[SearchChunk]) -> str:
583 """Render the provenance block: chunk references + extraction method.
585 Routes through ``yaml.safe_dump`` rather than hand-rolled string
586 formatting so a chunk source containing a quote, backslash,
587 colon, or newline does not produce invalid YAML that
588 ``parse_frontmatter`` would silently drop on read.
589 """
590 block = {
591 "provenance": {
592 "extraction_method": config.wiki_entity_mode.value,
593 "chunks": [{"source": c.source, "chunk_index": c.chunk_index} for c in chunks],
594 }
595 }
596 return yaml.safe_dump(block, sort_keys=False)
599def _write_page(
600 wiki_root: Path,
601 subdir: str,
602 slug: str,
603 full_content: str,
604 drift_threshold: float,
605) -> Path:
606 """Write page to disk with drift detection. Returns path written to.
608 ``slug`` may contain forward slashes (e.g. ``cv-manual/page-0042``);
609 any intermediate directories are created before writing.
610 """
611 page_path = wiki_root / subdir / f"{slug}.md"
612 page_path.parent.mkdir(parents=True, exist_ok=True)
614 if page_path.exists():
615 old_content = page_path.read_text(encoding="utf-8")
616 ratio = _content_change_ratio(old_content, full_content)
617 if ratio > drift_threshold:
618 drafts_dir = wiki_root / DRAFTS_SUBDIR
619 diff_text = _diff_summary(old_content, full_content)
620 return _divert_to_drafts(full_content, drafts_dir, slug, ratio, diff_text)
622 page_path.write_text(full_content, encoding="utf-8")
623 return page_path
626def _assemble_content(
627 frontmatter: str,
628 wiki_text: str,
629 citation_block: str,
630) -> str:
631 """Combine frontmatter, body, and citations into the full page content."""
632 full = frontmatter + wiki_text
633 if citation_block:
634 full += "\n\n" + citation_block
635 return full
638def index_wiki_page(content: str, wiki_source: str, store: Store) -> int:
639 """Chunk a wiki page body, embed it, and write rows with ``chunk_type="wiki"``.
641 ``wiki_source`` must follow the ``<wiki_dir>/<subdir>/<slug>.md``
642 shape (see :attr:`PageTarget.wiki_source`). Three branches:
644 - subdir in :data:`WIKI_CONTENT_SUBDIRS`: clear stale rows, chunk,
645 embed, write. Returns the row count.
646 - subdir is ``drafts/`` or ``archive/``: skip without touching the
647 store. Returns 0.
648 - malformed ``wiki_source`` (no subdir component): log.warning and
649 return 0. Does not raise because the caller set is narrow (only
650 internal wiki paths reach here) and surfacing the bad input in
651 the log is sufficient triage.
653 Record shape matches the markdown-ingest convention in
654 ``ingest.py``: ``content_type="text"``, all four page/line
655 positions ``0`` (wiki pages are not paginated).
656 """
657 subdir = _subdir_from_wiki_source(wiki_source)
658 if subdir is None:
659 log.warning("index_wiki_page: malformed wiki_source %r (no subdir)", wiki_source)
660 return 0
661 if subdir not in WIKI_CONTENT_SUBDIRS:
662 return 0
664 body = extract_body(content).strip()
665 store.clear_table(
666 CHUNKS_TABLE,
667 f"source = '{escape_sql_string(wiki_source)}' AND chunk_type = '{CHUNK_TYPE_WIKI}'",
668 )
669 if not body:
670 return 0
672 chunks = chunk_text(body, mime_type="text/markdown", use_semantic=True)
673 if not chunks:
674 return 0
676 vectors = get_services().embedder.embed_batch(chunks)
677 records = [
678 {
679 "source": wiki_source,
680 "content_type": "text",
681 "chunk_type": CHUNK_TYPE_WIKI,
682 "page_start": 0,
683 "page_end": 0,
684 "line_start": 0,
685 "line_end": 0,
686 "chunk": text,
687 "chunk_index": idx,
688 "vector": vector,
689 }
690 for idx, (text, vector) in enumerate(zip(chunks, vectors, strict=True))
691 ]
692 store.add_chunks(records)
693 return len(records)
696def _subdir_from_wiki_source(wiki_source: str) -> str | None:
697 """Return the subdir component (``summaries``, ``concepts``, ...) of *wiki_source*.
699 ``wiki_source`` is the ``<wiki_dir>/<subdir>/<slug>.md`` path
700 stored in citations and chunks. Returns None when the path has
701 fewer than two components.
702 """
703 parts = wiki_source.split("/")
704 return parts[1] if len(parts) >= 2 else None
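# Examples: "wiki/summaries/manual.md" -> "summaries",
# "wiki/drafts/foo.md" -> "drafts", and a bare "foo.md" -> None (malformed,
# logged and skipped by index_wiki_page above).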
707def _persist_and_finalize(
708 content: str,
709 target: PageTarget,
710 verified: list[CitationRecord],
711 source_names: list[str],
712 store: Store,
713 config: Config,
714) -> Path:
715 """Write page to disk, persist citations, index body chunks, update index and log."""
716 page_path = _write_page(
717 target.wiki_root, target.subdir, target.slug, content, config.wiki_drift_threshold
718 )
719 for rec in verified:
720 rec["wiki_source"] = target.wiki_source
721 store.delete_citations_for_wiki(target.wiki_source)
722 store.add_citations(verified)
724 index_wiki_page(content, target.wiki_source, store)
726 if config.wiki_prune_raw:
727 for name in source_names:
728 store.delete_by_source(name)
730 update_wiki_index(config)
731 append_wiki_log(
732 WIKI_LOG_ACTION_GENERATED,
733 f"{target.page_type} page for {target.label} -> {target.subdir}/{target.slug}.md",
734 config,
735 )
736 return page_path
739def _generate_page(
740 label: str,
741 prompt: str,
742 chunks: list[SearchChunk],
743 citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]],
744 page_type: str,
745 slug: str,
746 source_names: list[str],
747 provider: LLMProvider,
748 store: Store,
749 config: Config,
750 on_progress: WikiProgressCallback | None = None,
751 leaf_hash: str = "",
752) -> Path | None:
753 """Core generation pipeline shared by summary and synthesis pages."""
755 def _emit(stage: str, **data: object) -> None:
756 if on_progress is not None:
757 on_progress(stage, data)
759 _emit("preparing", chunks=len(chunks), source=label)
761 messages = _build_wiki_messages(prompt, provider, config)
762 _emit("generating", source=label)
763 options = config.generation_options(
764 temperature=config.wiki_temperature,
765 max_tokens=config.wiki_summary_max_tokens,
766 )
767 try:
768 response = provider.chat(messages, stream=False, options=options)
769 wiki_text = strip_reasoning(cast(str, response)).strip()
770 except Exception as exc:
771 log.warning("LLM failed to generate wiki page for %s: %s", label, exc)
772 _emit("failed", error=str(exc))
773 return None
775 if not wiki_text:
776 log.warning("LLM returned empty response for wiki page %s", label)
777 _emit("failed", error="Model returned empty response")
778 return None
780 parsed_citations = parse_wiki_citations(wiki_text)
781 verified = _verify_citations(citation_resolver(parsed_citations), chunks, label, config)
782 if not verified:
783 log.warning("No valid citations for %s, skipping", label)
784 _emit("failed", error="No valid citations found")
785 return None
787 _emit("faithfulness_check")
788 score = _check_faithfulness(chunks, wiki_text, label, config)
789 threshold = config.wiki_embedding_faithfulness_threshold
790 subdir = page_type if score >= threshold else DRAFTS_SUBDIR
791 if subdir == DRAFTS_SUBDIR:
792 log.info("Wiki page %s scored %.2f (< %.2f), sending to drafts", label, score, threshold)
794 wiki_text = strip_citation_block(wiki_text)
795 frontmatter = _build_frontmatter(config, source_names, score, leaf_hash, chunks=chunks)
796 citation_block = render_citation_block(verified)
797 full_content = _assemble_content(frontmatter, wiki_text, citation_block)
799 wiki_root = config.data_root / config.wiki_dir
800 target = PageTarget(
801 wiki_root=wiki_root,
802 subdir=subdir,
803 slug=slug,
804 wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
805 page_type=page_type,
806 label=label,
807 )
808 page_path = _persist_and_finalize(full_content, target, verified, source_names, store, config)
810 log.info(
811 "Generated wiki page for %s -> %s (score=%.2f, citations=%d)",
812 label,
813 target.subdir,
814 score,
815 len(verified),
816 )
817 return page_path
820def _resolve_multi_source_citations(
821 parsed_citations: list[ParsedCitation],
822 source_names: list[str],
823 source_hashes: dict[str, str],
824 chunks_by_source: dict[str, list[SearchChunk]],
825) -> list[CitationRecord]:
826 """Resolve citations from a synthesis page that cites multiple sources.
827 Each citation's source_ref is matched against the source list to
828 determine which source document it references.
829 """
830 records: list[CitationRecord] = []
831 now = datetime.now(UTC).isoformat()
833 all_chunks = [c for cs in chunks_by_source.values() for c in cs]
835 for parsed in parsed_citations:
836 excerpt = _extract_excerpt(parsed.source_ref)
838 matched_source = _match_citation_source(parsed.source_ref, source_names)
839 if not matched_source:
840 matched_source = _find_excerpt_source(excerpt, chunks_by_source)
841 if not matched_source and source_names:
842 # No citation match found; default to first listed source
843 log.warning(
844 "No citation match for chunk — defaulting to first source: %s",
845 source_names[0],
846 )
847 matched_source = source_names[0]
849 search_chunks = chunks_by_source.get(matched_source, all_chunks)
850 page_start, page_end, line_start, line_end = _find_excerpt_location(excerpt, search_chunks)
851 records.append(
852 _build_citation_record(
853 parsed.citation_key,
854 excerpt,
855 matched_source,
856 source_hashes.get(matched_source, ""),
857 page_start,
858 page_end,
859 line_start,
860 line_end,
861 now,
862 )
863 )
864 return records
867def _match_citation_source(source_ref: str, source_names: list[str]) -> str:
868 """Find which source a citation references by matching filenames in the ref."""
869 for name in source_names:
870 if name in source_ref:
871 return name
872 return ""
875def _find_excerpt_source(excerpt: str, chunks_by_source: dict[str, list[SearchChunk]]) -> str:
876 """Find which source contains a given excerpt by searching chunks."""
877 if not excerpt:
878 return ""
879 for source, chunks in chunks_by_source.items():
880 for chunk in chunks:
881 if excerpt in chunk.chunk:
882 return source
883 return ""
886def _generate_synthesis_page(
887 topic: str,
888 source_names: list[str],
889 chunks_by_source: dict[str, list[SearchChunk]],
890 provider: LLMProvider,
891 store: Store,
892 config: Config,
893) -> Path | None:
894 """Generate a single synthesis page for a concept cluster.
895 Returns the path to the generated page, or None on failure.
896 """
897 all_chunks = [c for cs in chunks_by_source.values() for c in cs]
898 if not all_chunks:
899 log.warning("No chunks for synthesis topic %r, skipping", topic)
900 return None
902 all_chunks = _truncate_chunks_to_budget(all_chunks, config)
903 chunks_text = _chunks_to_text(all_chunks)
904 source_list = "\n".join(f"- {name}" for name in sorted(source_names))
905 template = config.wiki_synthesis_prompt
906 display_topic = clean_label_for_display(topic)
907 prompt = template.format(topic=display_topic, source_list=source_list, chunks_text=chunks_text)
908 slug = make_slug(topic)
910 source_hashes: dict[str, str] = {}
911 for name in source_names:
912 source_path = config.documents_dir / name
913 if source_path.exists():
914 source_hashes[name] = file_hash(source_path)
916 def resolver(parsed: list[ParsedCitation]) -> list[CitationRecord]:
917 return _resolve_multi_source_citations(
918 parsed, source_names, source_hashes, chunks_by_source
919 )
921 return _generate_page(
922 label=topic,
923 prompt=prompt,
924 chunks=all_chunks,
925 citation_resolver=resolver,
926 page_type=SYNTHESIS_SUBDIR,
927 slug=slug,
928 source_names=source_names,
929 provider=provider,
930 store=store,
931 config=config,
932 )
935def _generate_for_cluster(
936 label: str,
937 sources: frozenset[str],
938 provider: LLMProvider,
939 store: Store,
940 config: Config,
941) -> Path | None:
942 """Gather chunks for a cluster and generate a synthesis page."""
943 source_names = sorted(sources)
944 chunks_by_source: dict[str, list[SearchChunk]] = {}
945 for name in source_names:
946 chunks = store.get_chunks_by_source(name)
947 if chunks:
948 chunks_by_source[name] = chunks
950 if len(chunks_by_source) < MIN_CLUSTER_SOURCES:
951 return None
953 return _generate_synthesis_page(label, source_names, chunks_by_source, provider, store, config)
956def generate_synthesis_pages(
957 provider: LLMProvider,
958 store: Store,
959 clusterer: SourceClusterer,
960 config: Config | None = None,
961) -> list[Path]:
962 """Generate synthesis pages for source clusters spanning 3+ documents."""
963 if config is None:
964 config = cfg
966 clusters = clusterer.get_clusters(min_sources=MIN_CLUSTER_SOURCES)
967 if not clusters:
968 log.info("No source clusters span %d+ sources, skipping synthesis", MIN_CLUSTER_SOURCES)
969 return []
971 pages: list[Path] = []
972 for cluster in clusters:
973 page = _generate_for_cluster(cluster.label, cluster.sources, provider, store, config)
974 if page is not None:
975 pages.append(page)
977 log.info("Generated %d synthesis pages", len(pages))
978 return pages
981def _hash_existing_sources(source_names: list[str], documents_dir: Path) -> dict[str, str]:
982 """Hash each source file that still exists on disk (used for citation staleness)."""
983 out: dict[str, str] = {}
984 for name in source_names:
985 source_path = documents_dir / name
986 if source_path.exists():
987 out[name] = file_hash(source_path)
988 return out
991# Phase D: archive-migration sentinel and helpers. The sentinel lives
992# under data_dir (NOT inside wiki/) so Obsidian sync and wiki
993# tree-walkers never surface it.
994_PHASE_D_SENTINEL_NAME = ".phase-d-migrated"
996# Pre-Phase-D wiki concepts that we move to archive/ as part of the
997# one-time migration. Matches wiki/<CONCEPTS_SUBDIR>/*.md recursively.
998_ARCHIVE_CONCEPTS_SUBPATH = Path(ARCHIVE_SUBDIR) / CONCEPTS_SUBDIR
1001def _maybe_run_phase_d_migration(wiki_root: Path, data_dir: Path) -> None:
1002 """One-time migration: archive pre-Phase-D concept pages.
1004 Runs idempotently, gated by ``{data_dir}/.phase-d-migrated``:
1006 1. Move every ``wiki/concepts/*.md`` to ``wiki/archive/concepts/``
1007 preserving relative subpaths. Older concept pages stay
1008 readable but drop out of the active wiki browse surface.
1009 2. Unwrap stale ``[[archived-slug]]`` references across the
1010 remaining pages so a reader clicking a link does not hit a
1011 404. Archived slugs become plain text.
1012 3. Write the sentinel so future builds skip this path.
1014 D3's freshly LLM-curated concept pages written AFTER the sentinel
1015 exists are never touched.
1016 """
1017 sentinel = data_dir / _PHASE_D_SENTINEL_NAME
1018 if sentinel.exists():
1019 return
1020 concepts_dir = wiki_root / CONCEPTS_SUBDIR
1021 archive_dir = wiki_root / _ARCHIVE_CONCEPTS_SUBPATH
1022 archived_slugs: list[str] = []
1023 if concepts_dir.is_dir():
1024 for src in sorted(concepts_dir.rglob("*.md")):
1025 rel = src.relative_to(concepts_dir)
1026 dest = archive_dir / rel
1027 dest.parent.mkdir(parents=True, exist_ok=True)
1028 src.replace(dest)
1029 archived_slugs.append(str(rel.with_suffix("")).replace("\\", "/"))
1031 if archived_slugs:
1032 _unwrap_archived_links(wiki_root, archived_slugs)
1034 data_dir.mkdir(parents=True, exist_ok=True)
1035 sentinel.write_text(datetime.now(UTC).isoformat(), encoding="utf-8")
1036 if archived_slugs:
1037 log.info(
1038 "Phase D migration: archived %d concept pages, sentinel written at %s",
1039 len(archived_slugs),
1040 sentinel,
1041 )
1044def _unwrap_archived_links(wiki_root: Path, archived_slugs: list[str]) -> None:
1045 """Rewrite ``[[slug]]`` → ``slug`` (plain text) across remaining wiki pages.
1047 The existing ``_rewrite_links_across_wiki`` path is the wrong
1048 tool here: it compiles an *additive* surface map, not a
1049 removal pass. A single walk over the active content subdirs,
1050 applying one compiled pattern per archived slug, is acceptable because
1051 the archive count is bounded (concepts that existed pre-migration). Pages whose body
1052 did not change are not rewritten.
1053 """
1054 if not archived_slugs:
1055 return
1056 patterns = [(re.compile(r"\[\[" + re.escape(slug) + r"\]\]"), slug) for slug in archived_slugs]
1057 for subdir in WIKI_CONTENT_SUBDIRS:
1058 subdir_path = wiki_root / subdir
1059 if not subdir_path.is_dir():
1060 continue
1061 for md_path in subdir_path.rglob("*.md"):
1062 original = md_path.read_text(encoding="utf-8")
1063 rewritten = original
1064 for pattern, replacement in patterns:
1065 rewritten = pattern.sub(replacement, rewritten)
1066 if rewritten != original:
1067 md_path.write_text(rewritten, encoding="utf-8")
1070# Pending-marker conventions: the drafts listing surface
1071# (``lilbee.wiki.drafts``) scans for these prefixes to classify a
1072# draft as PARSE or COLLISION instead of a drift-routed regen. The
1073# keyword phrases live in ``wiki.shared`` so writer (gen) and reader
1074# (drafts) stay in sync on the exact wording.
1075_PENDING_PARSE_MARKER_PREFIX = f"<!-- {PENDING_MARKER_KEYWORD_PARSE}"
1076_PENDING_COLLISION_MARKER_PREFIX = f"<!-- {PENDING_MARKER_KEYWORD_COLLISION}"
1079def _write_pending_marker(
1080 drafts_dir: Path,
1081 slug: str,
1082 marker_line: str,
1083 frontmatter: str = "",
1084) -> Path:
1085 """Write a PENDING marker page under ``drafts/<slug>.md``.
1087 ``marker_line`` is the leading HTML comment that both identifies
1088 the marker kind and carries the context (source, label). The
1089 optional ``frontmatter`` preserves minimal metadata for the
1090 drafts surface to round-trip (e.g. ``bad_title``-style fields).
1091 """
1092 drafts_dir.mkdir(parents=True, exist_ok=True)
1093 draft_path = drafts_dir / f"{slug}.md"
1094 body = marker_line + "\n"
1095 if frontmatter:
1096 body += "\n" + frontmatter
1097 draft_path.write_text(body, encoding="utf-8")
1098 return draft_path
1101def _delete_pending_marker_if_present(drafts_dir: Path, slug: str) -> bool:
1102 """Delete an existing PENDING marker for *slug*; return whether one was removed.
1104 Match is slug-equality (not fuzzy): an LLM that rephrases a
1105 label on retry (``brake system`` → ``braking system``) leaves
1106 the old marker behind for the user to drain via ``wiki drafts
1107 reject``. Documented limitation; follow-up if the pattern
1108 matters.
1109 """
1110 draft_path = drafts_dir / f"{slug}.md"
1111 if not draft_path.is_file():
1112 return False
1113 try:
1114 body = draft_path.read_text(encoding="utf-8")
1115 except OSError:
1116 return False
1117 first_line = body.splitlines()[0] if body else ""
1118 is_pending = first_line.startswith(_PENDING_PARSE_MARKER_PREFIX) or first_line.startswith(
1119 _PENDING_COLLISION_MARKER_PREFIX
1120 )
1121 if not is_pending:
1122 return False
1123 draft_path.unlink()
1124 return True
1127def _group_entities_by_primary_source(
1128 entities: list[ExtractedEntity],
1129) -> dict[str, list[ExtractedEntity]]:
1130 """Group entities under the source that mentions them most.
1132 Primary source = source with the highest chunk-ref count;
1133 lexicographic tiebreak. An entity with no refs is dropped
1134 silently (defensive: extractor always attaches refs, but a
1135 future extractor might not).
1136 """
1137 grouped: dict[str, list[ExtractedEntity]] = {}
1138 for entity in entities:
1139 if not entity.chunk_refs:
1140 continue
1141 counts: dict[str, int] = {}
1142 for ref in entity.chunk_refs:
1143 counts[ref.source] = counts.get(ref.source, 0) + 1
1144 primary = min(counts.items(), key=lambda kv: (-kv[1], kv[0]))[0]
1145 grouped.setdefault(primary, []).append(entity)
1146 return grouped
1149# Regex that matches section headers the batch parser recognizes:
1150# H1 (``# Name``), H2 (``## Name``), or a bold-line heading
1151# (``**Name**``) at line start. The name capture is anchored to the
1152# rest of the line (stripped of trailing whitespace) so labels like
1153# ``## Brake System (hydraulic)`` still parse.
1154_SECTION_HEADER_RE = re.compile(
1155 r"^(?:(?:##?)\s+(?P<hashname>[^\n]+)|\*\*(?P<boldname>[^\*\n]+)\*\*)\s*$",
1156 re.MULTILINE,
1157)
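# Headers this pattern recognizes (hypothetical labels for illustration):
#   "# Brake System"              -> hashname = "Brake System"
#   "## Brake System (hydraulic)" -> hashname = "Brake System (hydraulic)"
#   "**Henry Ford**"              -> boldname = "Henry Ford"
# "### Subsection" (H3) and bold text appearing mid-line are not matched.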
1159# In-body ``[^keyN]`` footnote-marker pattern. Module-scope so the
1160# batched-generation hot path (`_finalize_section`) does not recompile
1161# it on every recovered section.
1162_FOOTNOTE_MARKER_RE = re.compile(r"\[\^([a-zA-Z0-9_\-]+)\]")
1165def _split_batched_output(
1166 text: str,
1167 expected_entity_labels: set[str],
1168 expected_concept_labels: set[str] | None = None,
1169) -> dict[str, tuple[EntityKind, str]]:
1170 """Best-effort parse of the batched LLM response into per-label bodies.
1172 Splits on H1/H2/bold-line headers, then matches each header
1173 against the expected entity and concept label sets via
1174 case-insensitive substring. Known labels are tagged with the
1175 right ``EntityKind``; unknown headers are dropped. Labels whose
1176 section could not be recovered at all are surfaced to the caller
1177 (they show up as *missing from the return dict* rather than a
1178 separate list — caller loops over the expected sets to write
1179 PENDING markers).
1180 """
1181 concepts = expected_concept_labels or set()
1182 recovered: dict[str, tuple[EntityKind, str]] = {}
1183 matches = list(_SECTION_HEADER_RE.finditer(text))
1184 if not matches:
1185 return recovered
1186 for i, match in enumerate(matches):
1187 name = match.group("hashname") or match.group("boldname") or ""
1188 name = name.strip()
1189 start = match.end()
1190 end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
1191 body = text[start:end].strip()
1192 if not body:
1193 continue
1194 lowered = name.lower()
1195 kind_label = _match_label(lowered, expected_entity_labels, EntityKind.ENTITY)
1196 if kind_label is None:
1197 kind_label = _match_label(lowered, concepts, EntityKind.CONCEPT)
1198 if kind_label is None:
1199 # Concept labels come from the LLM itself — tag any
1200 # unmatched section as CONCEPT only when the caller is
1201 # expecting concept curation; otherwise drop it as
1202 # noise.
1203 if expected_concept_labels is not None:
1204 recovered.setdefault(name, (EntityKind.CONCEPT, _prefix_heading(name, body)))
1205 continue
1206 kind, label = kind_label
1207 recovered[label] = (kind, _prefix_heading(name, body))
1208 return recovered
1211def _match_label(
1212 lowered_name: str,
1213 expected: set[str],
1214 kind: EntityKind,
1215) -> tuple[EntityKind, str] | None:
1216 """Case-insensitive substring match of *lowered_name* against *expected*.
1218 Returns ``(kind, original_label)`` on hit, ``None`` otherwise.
1219 A substring match (not equality) accommodates the LLM adding
1220 qualifiers ("Brake System (hydraulic)" vs "brake system").
1221 """
1222 for label in expected:
1223 low = label.lower()
1224 if low and (low in lowered_name or lowered_name in low):
1225 return (kind, label)
1226 return None
1229def _prefix_heading(name: str, body: str) -> str:
1230 """Ensure the extracted body starts with a ``# Name`` H1.
1232 The batched prompt instructs the model to emit ``## Name`` per
1233 section. After splitting, the per-section body has lost its
1234 header. Rebuild an H1 so the B3 title/body coherence gate still
1235 has a heading to match.
1236 """
1237 stripped = body.lstrip()
1238 if stripped.startswith("# "):
1239 return body
1240 return f"# {name}\n\n{body}"
1243def _chunks_for_source(chunks: list[SearchChunk], source: str) -> list[SearchChunk]:
1244 """Return the subset of *chunks* whose ``source`` matches, preserving order."""
1245 return [c for c in chunks if c.source == source]
1248def _build_batch_prompt(
1249 source: str,
1250 entities: list[ExtractedEntity],
1251 chunks_text: str,
1252 extract_concepts: bool,
1253 config: Config,
1254) -> str:
1255 """Render :attr:`Config.wiki_entity_batch_prompt` for one source call.
1257 ``extract_concepts`` controls whether the concept-curation
1258 paragraph is injected: True adds a "identify 3-5 concepts" block;
1259 False leaves ``{concept_instruction}`` empty so the LLM writes
1260 entity sections only. Keeps the per-source batched call the
1261 single entry point whether or not concepts are requested.
1262 """
1263 entity_labels = ", ".join(clean_label_for_display(e.label) for e in entities) or "(none)"
1264 if extract_concepts:
1265 concept_instruction = (
1266 "First, identify 3-5 CONCEPTS — abstract topics or domain terms "
1267 "from the source that deserve a standalone wiki page. Do NOT include "
1268 "pronouns, articles, or generic nouns.\n\n"
1269 "Then write a wiki section for each of the concepts you identified, "
1270 "PLUS one section for each NER ENTITY listed below.\n\n"
1271 )
1272 else:
1273 concept_instruction = ""
1274 return config.wiki_entity_batch_prompt.format(
1275 source=source,
1276 entity_list=entity_labels,
1277 chunks_text=chunks_text,
1278 concept_instruction=concept_instruction,
1279 )
1282def _short_source_hash(source: str) -> str:
1283 """8-char sha256 digest of *source* (stable collision-marker suffix)."""
1284 return hashlib.sha256(source.encode("utf-8")).hexdigest()[:8]
1287def _generate_source_batch(
1288 source: str,
1289 entities: list[ExtractedEntity],
1290 chunks: list[SearchChunk],
1291 provider: LLMProvider,
1292 store: Store,
1293 config: Config,
1294 *,
1295 extract_concepts: bool,
1296 written_concept_slugs: dict[str, str],
1297) -> list[Path]:
1298 """Issue one LLM call for *source* and finalize every recovered section.
1300 Returns the list of page paths written (entities + concepts
1301 combined). Labels not recovered by the parser become PENDING
1302 markers under ``wiki/drafts/`` so the next build can retry.
1303 Concept slugs already written by an earlier source produce a
1304 PENDING-COLLISION marker on the losing side (see
1305 :func:`_handle_concept_write`).
1307 ``written_concept_slugs`` is the per-build ledger of
1308 slug → first_source. Callers share one dict across the per-source
1309 loop. The second source to propose a slug is the one that gets
1310 diverted to a collision marker.
1311 """
1312 if not chunks:
1313 return []
1314 budgeted = _truncate_chunks_to_budget(chunks, config)
1315 chunks_text = _chunks_to_text(budgeted)
1316 prompt = _build_batch_prompt(source, entities, chunks_text, extract_concepts, config)
1317 messages = _build_wiki_messages(prompt, provider, config)
1318 options = config.generation_options(
1319 temperature=config.wiki_temperature,
1320 max_tokens=config.wiki_summary_max_tokens,
1321 )
1322 try:
1323 response = provider.chat(messages, stream=False, options=options)
1324 text = strip_reasoning(cast(str, response)).strip()
1325 except Exception as exc:
1326 log.warning("Batched LLM call failed for source %s: %s", source, exc)
1327 return []
1329 if not text:
1330 log.warning("Batched LLM call returned empty response for source %s", source)
1331 return []
1333 expected_entity_labels = {e.label for e in entities}
1334 expected_concepts: set[str] | None = set() if extract_concepts else None
1335 parsed = _split_batched_output(text, expected_entity_labels, expected_concepts)
1337 wiki_root = config.data_root / config.wiki_dir
1338 drafts_dir = wiki_root / DRAFTS_SUBDIR
1339 source_names = [source]
1340 source_hashes = _hash_existing_sources(source_names, config.documents_dir)
1341 chunks_by_source = {source: budgeted}
1343 # Citation definitions live in the trailing block of the WHOLE
1344 # response, not inside any one section body. Parse once over the
1345 # full text and replay the same list for every section, so each
1346 # page sees its own citations even when only the last section
1347 # carries the definition trailer.
1348 shared_parsed_citations = parse_wiki_citations(text)
1350 pages: list[Path] = []
1351 seen_labels: set[str] = set()
1352 for header_label, (kind, body) in parsed.items():
1353 seen_labels.add(header_label)
1354 resolver = functools.partial(
1355 _resolve_multi_source_citations,
1356 source_names=source_names,
1357 source_hashes=source_hashes,
1358 chunks_by_source=chunks_by_source,
1359 )
1360 page = _finalize_section(
1361 header_label=header_label,
1362 kind=kind,
1363 body=body,
1364 chunks=budgeted,
1365 citation_resolver=resolver,
1366 source_names=source_names,
1367 store=store,
1368 config=config,
1369 source=source,
1370 written_concept_slugs=written_concept_slugs,
1371 drafts_dir=drafts_dir,
1372 shared_parsed_citations=shared_parsed_citations,
1373 )
1374 if page is not None:
1375 pages.append(page)
1377 for entity in entities:
1378 if entity.label not in seen_labels:
1379 marker = (
1380 f"{_PENDING_PARSE_MARKER_PREFIX} for source {source}, "
1381 f"entity/concept {entity.label} - "
1382 "run wiki build again or manually accept via wiki drafts accept -->"
1383 )
1384 # Route through ``yaml.safe_dump`` so a label or source
1385 # containing a colon, quote, or newline does not produce a
1386 # frontmatter block that ``parse_frontmatter`` silently drops.
1387 frontmatter_body = yaml.safe_dump(
1388 {
1389 "pending_source": source,
1390 "pending_label": entity.label,
1391 "pending_kind": PENDING_KIND_PARSE,
1392 },
1393 sort_keys=False,
1394 )
1395 frontmatter = f"---\n{frontmatter_body}---\n"
1396 path = _write_pending_marker(drafts_dir, entity.slug, marker, frontmatter)
1397 log.info("Wrote PENDING-PARSE marker for %s -> %s", entity.slug, path)
1399 return pages
1402def _finalize_section(
1403 *,
1404 header_label: str,
1405 kind: EntityKind,
1406 body: str,
1407 chunks: list[SearchChunk],
1408 citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]],
1409 source_names: list[str],
1410 store: Store,
1411 config: Config,
1412 source: str,
1413 written_concept_slugs: dict[str, str],
1414 drafts_dir: Path,
1415 shared_parsed_citations: list[ParsedCitation],
1416) -> Path | None:
1417 """Citation-check, faithfulness-check, write one batched section.
1419 Shared by entity and concept sections from the per-source batched
1420 call. Returns the written page path, or ``None`` if the section
1421 failed any gate (no citations, empty body, slug collision marker
1422 handled via side channel). ``shared_parsed_citations`` is the
1423 definition list parsed once over the whole response — every
1424 section replays it so pages other than the last one still have
1425 their footnotes resolved.
1426 """
1427 slug = make_slug(header_label)
1428 if not slug:
1429 log.info("Empty slug for batched section %r; skipping", header_label)
1430 return None
1432 # Only replay citation keys that this section actually references
1433 # in the body; otherwise every section would claim every citation.
1434 section_keys = {ref.citation_key for ref in parse_wiki_citations(body)}
1435 # Fall back to in-body ``[^keyN]`` references when no definitions
1436 # live inside the section: collect the in-body footnote-marker keys
1437 # and match them against the shared definition set.
1438 section_keys.update(_FOOTNOTE_MARKER_RE.findall(body))
1439 relevant = [c for c in shared_parsed_citations if c.citation_key in section_keys]
1440 verified = _verify_citations(citation_resolver(relevant), chunks, header_label, config)
1441 if not verified:
1442 log.info("No valid citations for batched section %s, skipping", header_label)
1443 return None
1445 score = _check_faithfulness(chunks, body, header_label, config)
1446 threshold = config.wiki_embedding_faithfulness_threshold
1447 page_type = CONCEPTS_SUBDIR if kind is EntityKind.CONCEPT else ENTITIES_SUBDIR
1448 subdir = page_type if score >= threshold else DRAFTS_SUBDIR
1449 if subdir == DRAFTS_SUBDIR:
1450 log.info(
1451 "Batched section %s scored %.2f (< %.2f), sending to drafts",
1452 header_label,
1453 score,
1454 threshold,
1455 )
1457 clean_body = strip_citation_block(body)
1458 frontmatter = _build_frontmatter(config, source_names, score, chunks=chunks)
1459 citation_block = render_citation_block(verified)
1460 full_content = _assemble_content(frontmatter, clean_body, citation_block)
1462 # Concept collision: the second source proposing a slug loses
1463 # and writes to a drafts collision marker; the winning source's
1464 # page stays untouched.
1465 if kind is EntityKind.CONCEPT and subdir == CONCEPTS_SUBDIR:
1466 first_source = written_concept_slugs.get(slug)
1467 if first_source is not None and first_source != source:
1468 return _divert_concept_collision(
1469 slug=slug,
1470 source=source,
1471 first_source=first_source,
1472 content=full_content,
1473 drafts_dir=drafts_dir,
1474 )
1475 written_concept_slugs.setdefault(slug, source)
1477 # Successful regen of a previously-PENDING slug: remove the old
1478 # marker so the drafts surface no longer lists it.
1479 _delete_pending_marker_if_present(drafts_dir, slug)
1481 wiki_root = config.data_root / config.wiki_dir
1482 target = PageTarget(
1483 wiki_root=wiki_root,
1484 subdir=subdir,
1485 slug=slug,
1486 wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
1487 page_type=page_type,
1488 label=header_label,
1489 )
1490 page_path = _persist_and_finalize(full_content, target, verified, source_names, store, config)
1491 log.info(
1492 "Generated batched page for %s -> %s (score=%.2f, citations=%d)",
1493 header_label,
1494 target.subdir,
1495 score,
1496 len(verified),
1497 )
1498 return page_path
1501def _divert_concept_collision(
1502 *,
1503 slug: str,
1504 source: str,
1505 first_source: str,
1506 content: str,
1507 drafts_dir: Path,
1508) -> Path:
1509 """Write the losing concept to ``drafts/<slug>-collision-<hash>.md``.
1511 The winning source's page is unchanged on disk. Hash is the
1512 first 8 hex of sha256(source_filename); stable per source so a
1513 retry on the same two sources lands at the same draft path,
1514 letting the user iterate without marker sprawl.
1515 """
1516 short = _short_source_hash(source)
1517 collision_slug = f"{slug}-collision-{short}"
1518 marker = (
1519 f"{_PENDING_COLLISION_MARKER_PREFIX} with source {first_source}, "
1520 f"content from {source} held for review -->\n\n"
1521 )
1522 drafts_dir.mkdir(parents=True, exist_ok=True)
1523 path = drafts_dir / f"{collision_slug}.md"
1524 path.write_text(marker + content, encoding="utf-8")
1525 log.warning(
1526 "Concept slug collision: %s already written by %s; diverted %s's version to %s",
1527 slug,
1528 first_source,
1529 source,
1530 path,
1531 )
1532 return path
1535def build_wiki(
1536 entities: list[ExtractedEntity],
1537 provider: LLMProvider,
1538 store: Store,
1539 config: Config | None = None,
1540 *,
1541 extract_concepts: bool = True,
1542) -> list[Path]:
1543 """Produce entity and LLM-curated concept pages per source.
1545 Phase D replaces the per-entity / per-concept fan-out with a
1546 per-source batched call: for each source in ``entities``' chunk
1547 refs, one LLM call identifies 3-5 concepts AND writes a wiki
1548 section for every pre-extracted entity belonging to that source.
1549 Output sections are split, citation-verified, embedding-scored,
1550 and landed under ``wiki/entities/`` or ``wiki/concepts/``
1551 depending on kind.
1553 ``extract_concepts=False`` (used by the incremental-ingest hook)
1554 drops the concept-curation paragraph from the prompt so a
1555 touched source does not churn concept slugs.
1557 A one-time archive migration runs first (idempotently, gated by
1558 ``{data_dir}/.phase-d-migrated``), moving pre-Phase-D concept
1559 pages under ``wiki/archive/concepts/`` and unwrapping stale
1560 ``[[archived-slug]]`` links across the remaining pages.
1561 """
1562 if config is None:
1563 config = cfg
1564 wiki_root = config.data_root / config.wiki_dir
1565 _maybe_run_phase_d_migration(wiki_root, config.data_dir)
1567 grouped = _group_entities_by_primary_source(entities)
1568 all_sources = _all_sources_in_scope(entities, grouped, store, config, extract_concepts)
1569 written_concept_slugs: dict[str, str] = {}
1570 pages: list[Path] = []
1572 for source in sorted(all_sources):
1573 source_entities = grouped.get(source, [])
1574 chunks = store.get_chunks_by_source(source)
1575 chunk_count = len(chunks)
1576 source_extract = extract_concepts and chunk_count >= config.wiki_batch_min_chunks
1577 if not source_entities and not source_extract:
1578 log.info(
1579 "Skipping source %s: %d entities, %d chunks, min=%d, extract=%s",
1580 source,
1581 len(source_entities),
1582 chunk_count,
1583 config.wiki_batch_min_chunks,
1584 source_extract,
1585 )
1586 continue
1587 source_pages = _generate_source_batch(
1588 source=source,
1589 entities=source_entities,
1590 chunks=chunks,
1591 provider=provider,
1592 store=store,
1593 config=config,
1594 extract_concepts=source_extract,
1595 written_concept_slugs=written_concept_slugs,
1596 )
1597 pages.extend(source_pages)
1599 _rewrite_links_across_wiki(entities, config)
1600 log.info("Generated %d batched wiki pages", len(pages))
1601 return pages
1604def _all_sources_in_scope(
1605 entities: list[ExtractedEntity],
1606 grouped: dict[str, list[ExtractedEntity]],
1607 store: Store,
1608 config: Config,
1609 extract_concepts: bool,
1610) -> set[str]:
1611 """Union of entity-bearing sources and (when enabled) sources eligible for concept curation.
1613 Seed the union with every entity's primary source. When
1614 ``extract_concepts`` is True, also add any store source whose
1615 chunk count meets the ``wiki_batch_min_chunks`` floor.
1616 This gives concept-only sources (no extracted entities) their
1617 chance at curation while keeping zero-entity short sources
1618 skipped entirely.
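
Illustrative outcome (filenames, chunk counts, and the
``wiki_batch_min_chunks = 12`` floor are hypothetical)::

    grouped = {"ford.md": [...]}  # only ford.md has extracted entities
    # store also holds assembly.md (40 chunks) and note.md (2 chunks)
    _all_sources_in_scope(entities, grouped, store, config, True)
    # -> {"ford.md", "assembly.md"}; note.md stays below the floor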
1619 """
1620 sources: set[str] = set(grouped)
1621 if not extract_concepts:
1622 return sources
1623 try:
1624 records = store.get_sources()
1625 except Exception as exc:
1626 log.warning("get_sources failed; sticking to entity-grouped sources: %s", exc)
1627 return sources
1628 for record in records:
1629 name = record.get("filename", "") if isinstance(record, dict) else ""
1630 if not name:
1631 continue
1632 if name in sources:
1633 continue
1634 chunk_count = record.get("chunk_count", 0) if isinstance(record, dict) else 0
1635 if chunk_count >= config.wiki_batch_min_chunks:
1636 sources.add(name)
1637 _ = entities  # unused here; parameter kept so the call site reads clearly, and this silences unused-argument linters
1638 return sources
1641def _entity_surface_map(entities: list[ExtractedEntity]) -> dict[str, str]:
1642 """Build the surface-form -> slug map for the ``[[link]]`` rewriter.
1644 Includes both the entity's human label (e.g. *"Henry Ford"*) and
1645 the slug-with-hyphens-as-spaces variant (*"henry ford"*) so the
1646 rewriter catches either form in body text.
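
Illustrative mapping (labels and slugs hypothetical)::

    {"Henry Ford": "henry-ford", "henry ford": "henry-ford"}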
1647 """
1648 mapping: dict[str, str] = {}
1649 for entity in entities:
1650 mapping[entity.label] = entity.slug
1651 spaced = entity.slug.replace("-", " ")
1652 if spaced and spaced != entity.label:
1653 mapping[spaced] = entity.slug
1654 return mapping
1657_ENTITY_LIKE_SUBDIRS: tuple[str, ...] = (CONCEPTS_SUBDIR, ENTITIES_SUBDIR)
1660def _augment_surface_map_with_existing_pages(
1661 surface_to_slug: dict[str, str], wiki_root: Path
1662) -> None:
1663 """Add slugs for pages already on disk so an incremental rebuild of
1664 one concept still links to its unchanged neighbors. Mutates
1665 ``surface_to_slug`` in place. Only enriches the map with the
1666 hyphen-to-space surface form because frontmatter labels aren't
1667 read here; body prose typically uses the spaced form so this
1668 covers the common case.
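
Illustrative effect (path hypothetical): an existing
``concepts/assembly-line.md`` contributes::

    surface_to_slug.setdefault("assembly line", "assembly-line")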
1669 """
1670 for subdir in _ENTITY_LIKE_SUBDIRS:
1671 subdir_path = wiki_root / subdir
1672 if not subdir_path.is_dir():
1673 continue
1674 for md_path in subdir_path.rglob("*.md"):
1675 slug = md_path.stem
1676 spaced = slug.replace("-", " ")
1677 surface_to_slug.setdefault(spaced, slug)
1680def _rewrite_links_across_wiki(entities: list[ExtractedEntity], config: Config) -> None:
1681 """Rewrite ``[[slug]]`` links on every page under ``wiki/`` content subdirs.
1683 A page never receives a link to itself: the rewriter takes the
1684 owning slug and drops it inside its match callback, so the
1685 surface map is shared unmodified across every page in the walk
1686 (no O(M) dict rebuild per file). The map is augmented with
1687 slugs from the existing on-disk corpus so a touched page still
1688 links to untouched neighbors. The alternation regex + lookup are
1689 compiled once per build and reused across pages.
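
Illustrative per-page pass (slug and ``page_text`` hypothetical;
mirrors the loop below)::

    rewriter = compile_rewriter({"assembly line": "assembly-line"})
    rewritten = apply_rewriter(page_text, rewriter, skip_slug="assembly-line")
    # the owning page keeps its own surface form plain; every other
    # page gets the ``[[assembly-line]]`` wiki link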
1690 """
1691 surface_to_slug = _entity_surface_map(entities)
1692 wiki_root = config.data_root / config.wiki_dir
1693 _augment_surface_map_with_existing_pages(surface_to_slug, wiki_root)
1694 rewriter = compile_rewriter(surface_to_slug)
1695 if rewriter is None:
1696 return
1698 for subdir in WIKI_CONTENT_SUBDIRS:
1699 subdir_path = wiki_root / subdir
1700 if not subdir_path.is_dir():
1701 continue
1702 is_entity_subdir = subdir in _ENTITY_LIKE_SUBDIRS
1703 for md_path in subdir_path.rglob("*.md"):
1704 owning_slug = md_path.stem if is_entity_subdir else None
1705 original = md_path.read_text(encoding="utf-8")
1706 rewritten = apply_rewriter(original, rewriter, skip_slug=owning_slug)
1707 if rewritten != original:
1708 md_path.write_text(rewritten, encoding="utf-8")
1711class WikiBuildSummary(TypedDict):
1712 """Result of a full wiki build/update."""
1714 paths: list[str]
1715 entities: int
1716 count: int
1719def run_full_build(config: Config | None = None) -> WikiBuildSummary:
1720 """Extract entities + build wiki across every ingested source.
1722 Shared entry point for CLI ``wiki build`` / ``wiki update``, MCP
1723 ``wiki_build`` / ``wiki_update``, and ``POST /api/wiki/build`` /
1724 ``PATCH /api/wiki/update``.
1726 Side effects (in order):
1727 1. Reads every source via ``store.get_sources()``.
1728 2. Reads chunks for each source via ``store.get_chunks_by_source``.
1729 3. Calls the entity extractor (may invoke the LLM provider).
1730 4. Calls :func:`build_wiki` which writes wiki page files.
1731 5. Calls :func:`update_wiki_index` which rewrites ``wiki/index.md``.
1732 6. Calls :func:`append_wiki_log` which appends a build entry.
1734 Concurrency:
1735 Not safe to run concurrently with itself or with another wiki
1736 write path (drafts accept/reject, prune). Callers that share an
1737 event loop or process must serialize via an external lock — the
1738 REST routes do this with a per-process ``asyncio.Lock``; MCP and
1739 CLI run in their own processes and don't need one.
1741 Running concurrently with ``/api/sync`` (an ingest write path
1742 rather than a wiki write path) is permitted but not coherent: a
1743 sync that lands between this function's source-scan and per-source
1744 chunk-fetch may produce a wiki that's missing pages for sources
1745 ingested mid-build. The result is incomplete, not corrupt, and
1746 is repaired by re-running ``run_full_build`` after the sync
1747 finishes.
1749 A crash mid-build leaves a partial wiki on disk; the next successful
1750 build is idempotent and re-emits any pages it would have written, so
1751 recovery is "run it again."
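
Illustrative call (keys are those of :class:`WikiBuildSummary`)::

    summary = run_full_build()
    log.info("%d pages from %d entities", summary["count"], summary["entities"])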
1752 """
1753 if config is None:
1754 config = cfg
1755 from lilbee.wiki.entity_extractor import get_entity_extractor
1756 from lilbee.wiki.shared import WIKI_LOG_ACTION_BUILD
1758 svc = get_services()
1759 chunks: list[SearchChunk] = []
1760 for record in svc.store.get_sources():
1761 chunks.extend(svc.store.get_chunks_by_source(record["filename"]))
1763 extractor = get_entity_extractor(config.wiki_entity_mode, svc.provider, config)
1764 entities = extractor.extract(chunks)
1765 pages = build_wiki(
1766 entities,
1767 svc.provider,
1768 svc.store,
1769 config,
1770 extract_concepts=config.wiki_extract_concepts,
1771 )
1772 update_wiki_index()
1773 append_wiki_log(WIKI_LOG_ACTION_BUILD, f"{len(pages)} pages from {len(entities)} records")
1774 return {
1775 "paths": [str(p) for p in pages],
1776 "entities": len(entities),
1777 "count": len(pages),
1778 }
1781class WikiSynthesizeSummary(TypedDict):
1782 """Result of running synthesis-page generation."""
1784 paths: list[str]
1785 count: int
1788def run_full_synthesize(config: Config | None = None) -> WikiSynthesizeSummary:
1789 """Generate synthesis pages for cross-source clusters of 3+ documents.
1791 Shared entry point for MCP ``wiki_synthesize`` and ``POST
1792 /api/wiki/synthesize``. Concurrency contract matches
1793 :func:`run_full_build`: not safe to run in parallel with itself or
1794 with other wiki write paths; callers serialize via an external lock
1795 on shared event loops.
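
Illustrative call (keys are those of :class:`WikiSynthesizeSummary`)::

    result = run_full_synthesize()
    log.info("%d synthesis pages: %s", result["count"], result["paths"])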
1796 """
1797 if config is None:
1798 config = cfg
1799 svc = get_services()
1800 paths = generate_synthesis_pages(svc.provider, svc.store, svc.clusterer, config)
1801 return {
1802 "paths": [str(p) for p in paths],
1803 "count": len(paths),
1804 }