Coverage for src/lilbee/wiki/gen.py: 100%
706 statements
coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Wiki page generation — LLM-driven synthesis with citation provenance.
3Generates summary pages (1:1 with sources) and synthesis pages (cross-source,
4concept-graph-driven) from raw chunks. Each page carries inline citations
5([^srcN]) for facts and [*inference*] markers for LLM synthesis. The
6_citations table is the source of truth; markdown footnotes are rendered from it.
7"""
9from __future__ import annotations
11import difflib
12import functools
13import hashlib
14import logging
15import re
16from collections.abc import Callable
17from datetime import UTC, datetime
18from pathlib import Path
19from typing import TypedDict, cast
21import numpy as np
22import yaml
24from lilbee.chunk import chunk_text
25from lilbee.clustering import SourceClusterer
26from lilbee.config import CHUNKS_TABLE, DEFAULT_NUM_CTX, Config, cfg
27from lilbee.ingest import file_hash
28from lilbee.providers.base import LLMProvider
29from lilbee.reasoning import strip_reasoning
30from lilbee.services import get_services
31from lilbee.store import (
32 CHUNK_TYPE_WIKI,
33 CitationRecord,
34 SearchChunk,
35 Store,
36 escape_sql_string,
37)
38from lilbee.wiki.citation import (
39 ParsedCitation,
40 extract_body,
41 parse_wiki_citations,
42 render_citation_block,
43 strip_citation_block,
44)
45from lilbee.wiki.entity_extractor import EntityKind, ExtractedEntity
46from lilbee.wiki.index import append_wiki_log, update_wiki_index
47from lilbee.wiki.links import apply_rewriter, compile_rewriter
48from lilbee.wiki.shared import (
49 ARCHIVE_SUBDIR,
50 CONCEPTS_SUBDIR,
51 DRAFTS_SUBDIR,
52 ENTITIES_SUBDIR,
53 MIN_CLUSTER_SOURCES,
54 PENDING_KIND_PARSE,
55 PENDING_MARKER_KEYWORD_COLLISION,
56 PENDING_MARKER_KEYWORD_PARSE,
57 SUMMARIES_SUBDIR,
58 SYNTHESIS_SUBDIR,
59 WIKI_CONTENT_SUBDIRS,
60 WIKI_LOG_ACTION_GENERATED,
61 PageTarget,
62 clean_label_for_display,
63 is_valid_label,
64 make_slug,
65 parse_frontmatter,
66)
68log = logging.getLogger(__name__)
70WikiProgressCallback = Callable[[str, dict[str, object]], None]
71"""Callback for wiki generation progress: (stage, data) -> None."""
73_MAX_DIFF_PREVIEW_LINES = 20 # lines of unified diff shown in drift warnings
76# Fraction of context window reserved for chunks. The remainder leaves
77# room for the system/user prompt template and generation output.
78_CONTEXT_BUDGET_FRACTION = 0.75
80# Approximate characters per token for budget estimation. 4 chars/token
81# is a widely used heuristic for English text.
82_CHARS_PER_TOKEN = 4
84# Directive recognized by chat templates that support a reasoning mode
85# (Qwen3, DeepSeek-R1, etc.). Wiki generation is a summarization task
86# where chain-of-thought adds wall-clock cost without improving output,
87# so we suppress it whenever the provider reports the capability.
88_NO_THINK_DIRECTIVE = "/no_think"
90# Capability string returned by llama-cpp providers for reasoning models
91# (Qwen3, DeepSeek-R1). Defined locally so gen.py doesn't depend on a
92# specific provider-layer constant name.
93_CAPABILITY_THINKING = "thinking"
95# JSON-style escape sequences that may appear inside quoted excerpts the
96# model emits. Any backslash-prefixed character not in this map stays
97# verbatim (e.g. ``\\x`` passes through unchanged).
98_EXCERPT_ESCAPES: dict[str, str] = {"n": "\n", "t": "\t", '"': '"', "\\": "\\"}
101def _build_wiki_messages(
102 prompt: str, provider: LLMProvider, config: Config
103) -> list[dict[str, str]]:
104 """Build the chat messages list for a wiki-gen call.
106 When the provider reports the ``thinking`` capability for the active
107 chat model, prepends ``/no_think`` so the chat template disables the
108 reasoning mode. Otherwise the prompt passes through unchanged.
109 """
110 capabilities = provider.get_capabilities(config.chat_model)
111 if _CAPABILITY_THINKING in capabilities:
112 prompt = f"{_NO_THINK_DIRECTIVE}\n\n{prompt}"
113 return [{"role": "user", "content": prompt}]
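# Illustrative call (hypothetical prompt text; behaviour follows the code
# above): when the provider reports the "thinking" capability for the chat
# model, the returned message list is
#   [{"role": "user", "content": "/no_think\n\nSummarize the brake chapter."}]
# and otherwise the prompt is passed through verbatim.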
116def _truncate_chunks_to_budget(
117 chunks: list[SearchChunk],
118 config: Config,
119) -> list[SearchChunk]:
120 """Drop trailing chunks so the total text fits within the model's context budget.
122 Uses a chars/4 heuristic for token estimation. Returns the original list
123 unchanged when all chunks fit.
124 """
125 context_window = config.num_ctx or DEFAULT_NUM_CTX
126 budget_tokens = int(context_window * _CONTEXT_BUDGET_FRACTION)
127 budget_chars = budget_tokens * _CHARS_PER_TOKEN
129 total_chars = 0
130 kept: list[SearchChunk] = []
131 for chunk in chunks:
132 chunk_chars = len(chunk.chunk)
133 if total_chars + chunk_chars > budget_chars and kept:
134 break
135 kept.append(chunk)
136 total_chars += chunk_chars
138 if len(kept) < len(chunks):
139 log.warning(
140 "Truncated chunks from %d to %d to fit context window (%d tokens)",
141 len(chunks),
142 len(kept),
143 context_window,
144 )
145 return kept
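# Worked budget example (hypothetical context size): with num_ctx=8192 the
# budget is int(8192 * 0.75) = 6144 tokens, i.e. 6144 * 4 = 24576 characters
# of chunk text. Chunks past that point are dropped, but the first chunk is
# always kept even if it alone exceeds the budget (the "and kept" guard above).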
148def _group_chunks_by_page(
149 chunks: list[SearchChunk],
150) -> list[tuple[int, list[SearchChunk]]]:
151 """Group chunks by ``page_start``, preserving in-document order within a page.
153 Returns ``(page_start, chunks)`` tuples sorted ascending by page number.
154 Chunks with ``page_start=0`` (non-paginated sources) collapse to a single
155 entry keyed at 0, so a markdown or code source still emits exactly one
156 summary file until structure detection arrives in a later stage.
157 """
158 grouped: dict[int, list[SearchChunk]] = {}
159 for chunk in chunks:
160 grouped.setdefault(chunk.page_start, []).append(chunk)
161 return sorted(grouped.items())
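# Example grouping (hypothetical chunks a-d with page_start 3, 3, 1, 0):
#   [(0, [d]), (1, [c]), (3, [a, b])]
# ascending by page number, in-document order kept within each page, and
# every page_start=0 chunk collapsed into the single key-0 entry.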
164def _leaf_hash(chunks: list[SearchChunk]) -> str:
165 """SHA-256 over concatenated chunk content (null-separated, in given order).
167 Acts as the cache key for incremental rebuild: an existing page whose
168 frontmatter ``leaf_hash`` matches this value has already summarized the
169 exact same input and can be reused without a new LLM call.
170 """
171 h = hashlib.sha256()
172 for chunk in chunks:
173 h.update(chunk.chunk.encode("utf-8"))
174 h.update(b"\0")
175 return h.hexdigest()
178def _find_cached_leaf(wiki_root: Path, slug: str, leaf_hash: str) -> Path | None:
179 """Return an existing page whose ``leaf_hash`` frontmatter matches, or ``None``.
181 Checks both ``summaries/`` and ``drafts/`` so an unchanged draft stays in
182 drafts rather than triggering a speculative regeneration.
183 """
184 for subdir in (SUMMARIES_SUBDIR, DRAFTS_SUBDIR):
185 candidate = wiki_root / subdir / f"{slug}.md"
186 if not candidate.is_file():
187 continue
188 fm = parse_frontmatter(candidate.read_text(encoding="utf-8"))
189 if fm.get("leaf_hash") == leaf_hash:
190 return candidate
191 return None
194def _chunks_to_text(chunks: list[SearchChunk]) -> str:
195 """Format chunks as numbered text blocks for the LLM prompt."""
196 parts: list[str] = []
197 for i, chunk in enumerate(chunks):
198 location = ""
199 if chunk.page_start:
200 location = f" (page {chunk.page_start})"
201 elif chunk.line_start:
202 location = f" (lines {chunk.line_start}-{chunk.line_end})"
203 parts.append(f"[Chunk {i + 1}]{location}:\n{chunk.chunk}")
204 return "\n\n".join(parts)
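# Example prompt block (hypothetical chunk text), one paginated and one
# line-addressed chunk:
#   [Chunk 1] (page 4):
#   Brake fluid must be replaced every two years.
#
#   [Chunk 2] (lines 10-18):
#   def bleed_brakes(): ...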
207def _extract_excerpt(source_ref: str) -> str:
208 """Extract the quoted excerpt from a citation source_ref string.
209 e.g. 'doc.md, excerpt: "Python supports typing."' → 'Python supports typing.'
211 Common JSON-style escape sequences inside the quoted span (``\\n``,
212 ``\\t``, ``\\"``, ``\\\\``) are decoded to their literal characters so
213 they round-trip against the source text. Some models "helpfully"
214 encode real newlines as ``\\n`` when emitting a quoted excerpt; the
215 source chunk they came from has real newlines, so skipping this
216 step leaves otherwise-faithful citations unverifiable.
217 """
218 marker = 'excerpt: "'
219 idx = source_ref.find(marker)
220 if idx == -1:
221 return ""
222 start = idx + len(marker)
223 end = source_ref.find('"', start)
224 raw = source_ref[start:].strip() if end == -1 else source_ref[start:end].strip()
225 return _decode_excerpt_escapes(raw)
228def _decode_excerpt_escapes(raw: str) -> str:
229 """Decode the JSON-style escapes models commonly emit inside quoted strings."""
230 if "\\" not in raw:
231 return raw
232 result: list[str] = []
233 i = 0
234 while i < len(raw):
235 ch = raw[i]
236 mapped = _EXCERPT_ESCAPES.get(raw[i + 1]) if ch == "\\" and i + 1 < len(raw) else None
237 if mapped is not None:
238 result.append(mapped)
239 i += 2
240 else:
241 result.append(ch)
242 i += 1
243 return "".join(result)
246def _find_excerpt_location(
247 excerpt: str,
248 chunks: list[SearchChunk],
249) -> tuple[int, int, int, int]:
250 """Find page/line location of an excerpt within chunks."""
251 if excerpt:
252 for chunk in chunks:
253 if excerpt in chunk.chunk:
254 return chunk.page_start, chunk.page_end, chunk.line_start, chunk.line_end
255 return 0, 0, 0, 0
258def _build_citation_record(
259 citation_key: str,
260 excerpt: str,
261 source_filename: str,
262 source_hash: str,
263 page_start: int,
264 page_end: int,
265 line_start: int,
266 line_end: int,
267 created_at: str,
268) -> CitationRecord:
269 """Build a single CitationRecord with consistent defaults."""
270 return CitationRecord(
271 wiki_source="", # filled by caller
272 wiki_chunk_index=0,
273 citation_key=citation_key,
274 claim_type="fact" if excerpt else "inference",
275 source_filename=source_filename,
276 source_hash=source_hash,
277 page_start=page_start,
278 page_end=page_end,
279 line_start=line_start,
280 line_end=line_end,
281 excerpt=excerpt,
282 created_at=created_at,
283 )
286def _resolve_citations(
287 parsed_citations: list[ParsedCitation],
288 source_name: str,
289 source_hash: str,
290 chunks: list[SearchChunk],
291) -> list[CitationRecord]:
292 """Resolve parsed citation refs to CitationRecord objects.
293 Searches for each citation's excerpt in the source chunks to find
294 the best matching location (page/line numbers).
295 """
296 records: list[CitationRecord] = []
297 now = datetime.now(UTC).isoformat()
299 for parsed in parsed_citations:
300 excerpt = _extract_excerpt(parsed.source_ref)
301 page_start, page_end, line_start, line_end = _find_excerpt_location(excerpt, chunks)
302 records.append(
303 _build_citation_record(
304 parsed.citation_key,
305 excerpt,
306 source_name,
307 source_hash,
308 page_start,
309 page_end,
310 line_start,
311 line_end,
312 now,
313 )
314 )
315 return records
318def _content_change_ratio(old_text: str, new_text: str) -> float:
319 """Fraction of lines that changed between two texts (0.0 = identical, 1.0 = total rewrite)."""
320 old_lines = old_text.splitlines()
321 new_lines = new_text.splitlines()
322 if not old_lines and not new_lines:
323 return 0.0
324 total = max(len(old_lines), len(new_lines))
325 matcher = difflib.SequenceMatcher(None, old_lines, new_lines)
326 changed = total - sum(block.size for block in matcher.get_matching_blocks())
327 return changed / total
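# Worked example (hypothetical four-line page): old lines [a, b, c, d] and
# new lines [a, x, c, d] share three matching lines, so the change ratio is
# (4 - 3) / 4 = 0.25; identical texts score 0.0 and a full rewrite
# approaches 1.0.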
330def _diff_summary(old_text: str, new_text: str) -> str:
331 """Human-readable unified diff summary (first 20 diff lines)."""
332 diff = difflib.unified_diff(
333 old_text.splitlines(),
334 new_text.splitlines(),
335 lineterm="",
336 fromfile="old",
337 tofile="new",
338 )
339 lines = list(diff)
340 if len(lines) > _MAX_DIFF_PREVIEW_LINES:
341 extra = len(lines) - _MAX_DIFF_PREVIEW_LINES
342 return "\n".join(lines[:_MAX_DIFF_PREVIEW_LINES]) + f"\n... ({extra} more lines)"
343 return "\n".join(lines)
346def _divert_to_drafts(
347 new_content: str,
348 drafts_dir: Path,
349 slug: str,
350 change_ratio: float,
351 diff_text: str,
352) -> Path:
353 """Write new content to wiki/drafts/ with a drift note instead of overwriting."""
354 draft_path = drafts_dir / f"{slug}.md"
355 draft_path.parent.mkdir(parents=True, exist_ok=True)
356 note = f"<!-- DRIFT: {change_ratio:.0%} content changed - flagged for human review -->\n\n"
357 draft_path.write_text(note + new_content, encoding="utf-8")
358 log.warning(
359 "Drift detected for %s (%.0f%% changed), diverted to drafts. Diff:\n%s",
360 slug,
361 change_ratio * 100,
362 diff_text,
363 )
364 return draft_path
367_WHITESPACE_RE = re.compile(r"\s+")
370def _normalize_whitespace(text: str) -> str:
371 """Collapse runs of whitespace to a single space and strip the edges.
373 PDF extractors preserve line breaks mid-sentence (``vehicle,\\nthe greater``)
374 while LLMs paraphrase the same quote as a single-spaced string
375 (``vehicle, the greater``). A strict substring check rejects a faithful
376 citation on whitespace alone, so both sides are normalized before
377 comparison.
378 """
379 return _WHITESPACE_RE.sub(" ", text).strip()
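# Example (hypothetical PDF extract): "vehicle,\nthe   greater" from the
# extractor and the LLM's "vehicle, the greater" both normalize to
# "vehicle, the greater", so the substring check in _verify_citations
# compares like with like.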
382def _verify_citations(
383 citation_records: list[CitationRecord],
384 chunks: list[SearchChunk],
385 label: str,
386 config: Config,
387) -> list[CitationRecord]:
388 """Filter citation records, keeping only those whose excerpts are in the chunks."""
389 wiki_prefix = config.wiki_dir + "/"
390 all_chunk_text = _normalize_whitespace(" ".join(c.chunk for c in chunks))
391 verified: list[CitationRecord] = []
392 for rec in citation_records:
393 if rec["source_filename"].startswith(wiki_prefix):
394 log.debug("Skipping wiki-sourced citation %s", rec["citation_key"])
395 continue
396 if rec["claim_type"] == "inference" or not rec["excerpt"]:
397 verified.append(rec)
398 continue
399 if _normalize_whitespace(rec["excerpt"]) in all_chunk_text:
400 verified.append(rec)
401 else:
402 log.debug("Citation %s excerpt not found in %s, dropping", rec["citation_key"], label)
403 return verified
406def _title_content_coherence(wiki_text: str, label: str) -> bool:
407 """Deterministic pre-check: title and body must reference the concept.
409 The LLM faithfulness score evaluates whether the prose reflects
410 the source chunks but does not penalize structural noise in the
411 title (bb-8b7s: ``| | designer`` passed at 0.90 because the body
412 was coherent). This pre-check asserts three invariants:
414 1. The first ``# `` heading must be a structurally valid label per
415 :func:`is_valid_label`. A heading like ``| | designer`` fails
416 the structural-char gate even though it contains the cleaned
417 display name as a substring.
418 2. The cleaned display name must appear in the heading as a
419 case-insensitive substring. Covers LLM drift where the
420 heading names a different concept than requested.
421 3. The body must mention the display name at least once outside
422 the heading. Covers the "LLM talked about something adjacent
423 but never named the concept" regression.
425 Returns True when all three hold, False otherwise.
426 """
427 display = clean_label_for_display(label).lower()
428 if not display:
429 return False
430 heading: str | None = None
431 body_parts: list[str] = []
432 for line in wiki_text.splitlines():
433 if heading is None and line.startswith("# "):
434 heading = line[2:].strip()
435 continue
436 body_parts.append(line)
437 if heading is None:
438 return False
439 if not is_valid_label(heading):
440 return False
441 if display not in heading.lower():
442 return False
443 body = "\n".join(body_parts).lower()
444 return display in body
447def _mean_vector(vectors: list[list[float]]) -> list[float]:
448 """Compute the element-wise mean of a non-empty vector list.
450 Empty input returns an empty list; callers must check before any
451 downstream dot-product so we do not leak a shape mismatch.
453 Routes through numpy so the inner loop runs in C: for the typical
454 ``D=768``, ``N=10`` case this cuts per-call cost from ~8k Python
455 ops to a single SIMD-backed reduction.
456 """
457 if not vectors:
458 return []
459 result: list[float] = np.asarray(vectors, dtype=np.float32).mean(axis=0).tolist()
460 return result
463def _embedding_faithfulness_score(
464 body_vec: list[float],
465 source_vectors: list[list[float]],
466) -> float:
467 """Cosine-similarity score between the body and the mean source vector.
469 Assumes L2-normalized vectors (both the embedder and the store
470 return normalized vectors); cosine reduces to a dot product.
471 Falls through to :func:`cosine_sim` so a non-normalized vector
472 does not silently produce an out-of-range value. Result is
473 clamped at zero because a negative cosine means the body vector
474 points the other way from the mean of the sources — treat that
475 the same as uncorrelated for threshold purposes.
477 Returns 0.0 on a dimension mismatch between the body vector and
478 the source-vector mean. That is not expected in production (the
479 embedder and the chunk vectors come from the same model), but a
480 stub-driven test may hand in off-shape vectors and crashing the
481 whole pipeline on the shape-check hides the real assertion.
482 """
483 from lilbee.store import cosine_sim
485 mean_vec = _mean_vector(source_vectors)
486 if not mean_vec or not body_vec:
487 return 0.0
488 if len(mean_vec) != len(body_vec):
489 log.warning(
490 "Body vector dim %d does not match source vector dim %d; scoring 0.0",
491 len(body_vec),
492 len(mean_vec),
493 )
494 return 0.0
495 return max(0.0, cosine_sim(body_vec, mean_vec))
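# Toy example (hypothetical 2-d vectors; production vectors are ~768-d and
# cosine_sim is assumed to return a true cosine, as noted above): with
# source_vectors [[1.0, 0.0], [0.0, 1.0]] the mean is [0.5, 0.5]; a body
# vector of [1.0, 0.0] scores 0.5 / 0.7071 ~= 0.71, while a body vector
# pointing away from the mean is clamped to 0.0.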
498def _check_faithfulness(
499 chunks: list[SearchChunk],
500 wiki_text: str,
501 label: str,
502 config: Config | None = None,
503) -> float:
504 """Score the wiki body's similarity to its source chunks, 0.0 on failure.
506 Phase D: replaces the LLM-based faithfulness call with a
507 deterministic cosine-similarity score between the page body and
508 the mean of its source chunk vectors. The B3 title/body coherence
509 pre-check still runs first as a hard gate: a garbage H1 returns
510 0.0 regardless of embedding similarity, so structurally broken
511 pages route to drafts even when the prose happens to be coherent.
513 ``chunks`` carries ``.vector`` populated by LanceDB (see
514 ``SearchChunk`` in ``store.py``), so no extra embedder call is
515 needed for the source side. The body is embedded once via the
516 shared services embedder. Any exception in the embedder (model
517 missing, network issue, invalid config) is caught and reported as
518 0.0 so a single faulty page drops to drafts instead of aborting
519 the whole build.
520 """
521 if not _title_content_coherence(wiki_text, label):
522 log.info(
523 "Faithfulness title/body coherence failed for %r; scoring 0.0",
524 label,
525 )
526 return 0.0
527 source_vectors = [c.vector for c in chunks if c.vector]
528 if not source_vectors:
529 log.warning("No source vectors for %s; scoring 0.0", label)
530 return 0.0
532 # Strip the frontmatter + citation block so we embed only the body
533 # prose. render_citation_block may not have run yet when the score
534 # is computed (it is appended later), but strip_citation_block is
535 # idempotent on missing trailers.
536 body_text = strip_citation_block(wiki_text).strip()
537 if not body_text:
538 log.warning("Empty body for %s; scoring 0.0", label)
539 return 0.0
541 try:
542 body_vectors = get_services().embedder.embed_batch([body_text])
543 except Exception as exc:
544 log.warning("Body embedding failed for %s: %s", label, exc)
545 return 0.0
546 if not body_vectors:
547 return 0.0
548 return _embedding_faithfulness_score(body_vectors[0], source_vectors)
551def _build_frontmatter(
552 config: Config,
553 source_names: list[str],
554 score: float,
555 leaf_hash: str = "",
556 chunks: list[SearchChunk] | None = None,
557) -> str:
558 """Build YAML frontmatter for a wiki page.
560 When ``leaf_hash`` is non-empty it is written so incremental rebuild
561 can skip regeneration on a subsequent sync whose chunks produce the
562 same hash. When ``chunks`` is provided the frontmatter carries a
563 ``provenance`` block naming the source/chunk-index pairs that fed
564 the generator and the extraction method from config, so a bad page
565 is auditable without re-running the pipeline.
566 """
567 sources_yaml = ", ".join(f'"{s}"' for s in sorted(source_names))
568 hash_line = f"leaf_hash: {leaf_hash}\n" if leaf_hash else ""
569 provenance_block = _render_provenance(config, chunks) if chunks is not None else ""
570 return (
571 f"---\n"
572 f"generated_by: {config.chat_model}\n"
573 f"generated_at: {datetime.now(UTC).isoformat()}\n"
574 f"sources: [{sources_yaml}]\n"
575 f"faithfulness_score: {score:.2f}\n"
576 f"{hash_line}"
577 f"{provenance_block}"
578 f"---\n\n"
579 )
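# Example frontmatter (hypothetical model name, extraction mode, hash, and
# score; layout follows the f-string above plus _render_provenance below):
#   ---
#   generated_by: qwen3-8b
#   generated_at: 2026-04-29T19:16:00+00:00
#   sources: ["manual.pdf"]
#   faithfulness_score: 0.87
#   leaf_hash: 3f2a9c...
#   provenance:
#     extraction_method: ner
#     chunks:
#     - source: manual.pdf
#       chunk_index: 0
#   ---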
582def _render_provenance(config: Config, chunks: list[SearchChunk]) -> str:
583 """Render the provenance block: chunk references + extraction method.
585 Routes through ``yaml.safe_dump`` rather than hand-rolled string
586 formatting so a chunk source containing a quote, backslash,
587 colon, or newline does not produce invalid YAML that
588 ``parse_frontmatter`` would silently drop on read.
589 """
590 block = {
591 "provenance": {
592 "extraction_method": config.wiki_entity_mode.value,
593 "chunks": [{"source": c.source, "chunk_index": c.chunk_index} for c in chunks],
594 }
595 }
596 return yaml.safe_dump(block, sort_keys=False)
599def _write_page(
600 wiki_root: Path,
601 subdir: str,
602 slug: str,
603 full_content: str,
604 drift_threshold: float,
605) -> Path:
606 """Write page to disk with drift detection. Returns path written to.
608 ``slug`` may contain forward slashes (e.g. ``cv-manual/page-0042``);
609 any intermediate directories are created before writing.
610 """
611 page_path = wiki_root / subdir / f"{slug}.md"
612 page_path.parent.mkdir(parents=True, exist_ok=True)
614 if page_path.exists():
615 old_content = page_path.read_text(encoding="utf-8")
616 ratio = _content_change_ratio(old_content, full_content)
617 if ratio > drift_threshold:
618 drafts_dir = wiki_root / DRAFTS_SUBDIR
619 diff_text = _diff_summary(old_content, full_content)
620 return _divert_to_drafts(full_content, drafts_dir, slug, ratio, diff_text)
622 page_path.write_text(full_content, encoding="utf-8")
623 return page_path
626def _assemble_content(
627 frontmatter: str,
628 wiki_text: str,
629 citation_block: str,
630) -> str:
631 """Combine frontmatter, body, and citations into the full page content."""
632 full = frontmatter + wiki_text
633 if citation_block:
634 full += "\n\n" + citation_block
635 return full
638def index_wiki_page(content: str, wiki_source: str, store: Store) -> int:
639 """Chunk a wiki page body, embed it, and write rows with ``chunk_type="wiki"``.
641 ``wiki_source`` must follow the ``<wiki_dir>/<subdir>/<slug>.md``
642 shape (see :attr:`PageTarget.wiki_source`). Three branches:
644 - subdir in :data:`WIKI_CONTENT_SUBDIRS`: clear stale rows, chunk,
645 embed, write. Returns the row count.
646 - subdir is ``drafts/`` or ``archive/``: skip without touching the
647 store. Returns 0.
648 - malformed ``wiki_source`` (no subdir component): log.warning and
649 return 0. Does not raise because the caller set is narrow (only
650 internal wiki paths reach here) and surfacing the bad input in
651 the log is sufficient triage.
653 Record shape matches the markdown-ingest convention in
654 ``ingest.py``: ``content_type="text"``, all four page/line
655 positions ``0`` (wiki pages are not paginated).
656 """
657 subdir = _subdir_from_wiki_source(wiki_source)
658 if subdir is None:
659 log.warning("index_wiki_page: malformed wiki_source %r (no subdir)", wiki_source)
660 return 0
661 if subdir not in WIKI_CONTENT_SUBDIRS:
662 return 0
664 body = extract_body(content).strip()
665 store.clear_table(
666 CHUNKS_TABLE,
667 f"source = '{escape_sql_string(wiki_source)}' AND chunk_type = '{CHUNK_TYPE_WIKI}'",
668 )
669 if not body:
670 return 0
672 chunks = chunk_text(body, mime_type="text/markdown", use_semantic=True)
673 if not chunks:
674 return 0
676 vectors = get_services().embedder.embed_batch(chunks)
677 records = [
678 {
679 "source": wiki_source,
680 "content_type": "text",
681 "chunk_type": CHUNK_TYPE_WIKI,
682 "page_start": 0,
683 "page_end": 0,
684 "line_start": 0,
685 "line_end": 0,
686 "chunk": text,
687 "chunk_index": idx,
688 "vector": vector,
689 }
690 for idx, (text, vector) in enumerate(zip(chunks, vectors, strict=True))
691 ]
692 store.add_chunks(records)
693 return len(records)
696def _subdir_from_wiki_source(wiki_source: str) -> str | None:
697 """Return the subdir component (``summaries``, ``concepts``, ...) of *wiki_source*.
699 ``wiki_source`` is the ``<wiki_dir>/<subdir>/<slug>.md`` path
700 stored in citations and chunks. Returns None when the path has
701 fewer than two components.
702 """
703 parts = wiki_source.split("/")
704 return parts[1] if len(parts) >= 2 else None
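# Examples: "wiki/summaries/manual.md" -> "summaries",
# "wiki/drafts/foo.md" -> "drafts", and a bare "foo.md" -> None (malformed,
# logged and skipped by index_wiki_page above).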
707def _persist_and_finalize(
708 content: str,
709 target: PageTarget,
710 verified: list[CitationRecord],
711 source_names: list[str],
712 store: Store,
713 config: Config,
714) -> Path:
715 """Write page to disk, persist citations, index body chunks, update index and log."""
716 page_path = _write_page(
717 target.wiki_root, target.subdir, target.slug, content, config.wiki_drift_threshold
718 )
719 for rec in verified:
720 rec["wiki_source"] = target.wiki_source
721 store.delete_citations_for_wiki(target.wiki_source)
722 store.add_citations(verified)
724 index_wiki_page(content, target.wiki_source, store)
726 if config.wiki_prune_raw:
727 for name in source_names:
728 store.delete_by_source(name)
730 update_wiki_index(config)
731 append_wiki_log(
732 WIKI_LOG_ACTION_GENERATED,
733 f"{target.page_type} page for {target.label} -> {target.subdir}/{target.slug}.md",
734 config,
735 )
736 return page_path
739def _generate_page(
740 label: str,
741 prompt: str,
742 chunks: list[SearchChunk],
743 citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]],
744 page_type: str,
745 slug: str,
746 source_names: list[str],
747 provider: LLMProvider,
748 store: Store,
749 config: Config,
750 on_progress: WikiProgressCallback | None = None,
751 leaf_hash: str = "",
752) -> Path | None:
753 """Core generation pipeline shared by summary and synthesis pages."""
755 def _emit(stage: str, **data: object) -> None:
756 if on_progress is not None:
757 on_progress(stage, data)
759 _emit("preparing", chunks=len(chunks), source=label)
761 messages = _build_wiki_messages(prompt, provider, config)
762 _emit("generating", source=label)
763 options = config.generation_options(
764 temperature=config.wiki_temperature,
765 max_tokens=config.wiki_summary_max_tokens,
766 )
767 try:
768 response = provider.chat(messages, stream=False, options=options)
769 wiki_text = strip_reasoning(cast(str, response)).strip()
770 except Exception as exc:
771 log.warning("LLM failed to generate wiki page for %s: %s", label, exc)
772 _emit("failed", error=str(exc))
773 return None
775 if not wiki_text:
776 log.warning("LLM returned empty response for wiki page %s", label)
777 _emit("failed", error="Model returned empty response")
778 return None
780 parsed_citations = parse_wiki_citations(wiki_text)
781 verified = _verify_citations(citation_resolver(parsed_citations), chunks, label, config)
782 if not verified:
783 log.warning("No valid citations for %s, skipping", label)
784 _emit("failed", error="No valid citations found")
785 return None
787 _emit("faithfulness_check")
788 score = _check_faithfulness(chunks, wiki_text, label, config)
789 threshold = config.wiki_embedding_faithfulness_threshold
790 subdir = page_type if score >= threshold else DRAFTS_SUBDIR
791 if subdir == DRAFTS_SUBDIR:
792 log.info("Wiki page %s scored %.2f (< %.2f), sending to drafts", label, score, threshold)
794 wiki_text = strip_citation_block(wiki_text)
795 frontmatter = _build_frontmatter(config, source_names, score, leaf_hash, chunks=chunks)
796 citation_block = render_citation_block(verified)
797 full_content = _assemble_content(frontmatter, wiki_text, citation_block)
799 wiki_root = config.data_root / config.wiki_dir
800 target = PageTarget(
801 wiki_root=wiki_root,
802 subdir=subdir,
803 slug=slug,
804 wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
805 page_type=page_type,
806 label=label,
807 )
808 page_path = _persist_and_finalize(full_content, target, verified, source_names, store, config)
810 log.info(
811 "Generated wiki page for %s -> %s (score=%.2f, citations=%d)",
812 label,
813 target.subdir,
814 score,
815 len(verified),
816 )
817 return page_path
820def _resolve_multi_source_citations(
821 parsed_citations: list[ParsedCitation],
822 source_names: list[str],
823 source_hashes: dict[str, str],
824 chunks_by_source: dict[str, list[SearchChunk]],
825) -> list[CitationRecord]:
826 """Resolve citations from a synthesis page that cites multiple sources.
827 Each citation's source_ref is matched against the source list to
828 determine which source document it references.
829 """
830 records: list[CitationRecord] = []
831 now = datetime.now(UTC).isoformat()
833 all_chunks = [c for cs in chunks_by_source.values() for c in cs]
835 for parsed in parsed_citations:
836 excerpt = _extract_excerpt(parsed.source_ref)
838 matched_source = _match_citation_source(parsed.source_ref, source_names)
839 if not matched_source:
840 matched_source = _find_excerpt_source(excerpt, chunks_by_source)
841 if not matched_source and source_names:
842 # No citation match found; default to first listed source
843 log.warning(
844 "No citation match for chunk — defaulting to first source: %s",
845 source_names[0],
846 )
847 matched_source = source_names[0]
849 search_chunks = chunks_by_source.get(matched_source, all_chunks)
850 page_start, page_end, line_start, line_end = _find_excerpt_location(excerpt, search_chunks)
851 records.append(
852 _build_citation_record(
853 parsed.citation_key,
854 excerpt,
855 matched_source,
856 source_hashes.get(matched_source, ""),
857 page_start,
858 page_end,
859 line_start,
860 line_end,
861 now,
862 )
863 )
864 return records
867def _match_citation_source(source_ref: str, source_names: list[str]) -> str:
868 """Find which source a citation references by matching filenames in the ref."""
869 for name in source_names:
870 if name in source_ref:
871 return name
872 return ""
875def _find_excerpt_source(excerpt: str, chunks_by_source: dict[str, list[SearchChunk]]) -> str:
876 """Find which source contains a given excerpt by searching chunks."""
877 if not excerpt:
878 return ""
879 for source, chunks in chunks_by_source.items():
880 for chunk in chunks:
881 if excerpt in chunk.chunk:
882 return source
883 return ""
886def _generate_synthesis_page(
887 topic: str,
888 source_names: list[str],
889 chunks_by_source: dict[str, list[SearchChunk]],
890 provider: LLMProvider,
891 store: Store,
892 config: Config,
893) -> Path | None:
894 """Generate a single synthesis page for a concept cluster.
895 Returns the path to the generated page, or None on failure.
896 """
897 all_chunks = [c for cs in chunks_by_source.values() for c in cs]
898 if not all_chunks:
899 log.warning("No chunks for synthesis topic %r, skipping", topic)
900 return None
902 all_chunks = _truncate_chunks_to_budget(all_chunks, config)
903 chunks_text = _chunks_to_text(all_chunks)
904 source_list = "\n".join(f"- {name}" for name in sorted(source_names))
905 template = config.wiki_synthesis_prompt
906 display_topic = clean_label_for_display(topic)
907 prompt = template.format(topic=display_topic, source_list=source_list, chunks_text=chunks_text)
908 slug = make_slug(topic)
910 source_hashes: dict[str, str] = {}
911 for name in source_names:
912 source_path = config.documents_dir / name
913 if source_path.exists():
914 source_hashes[name] = file_hash(source_path)
916 def resolver(parsed: list[ParsedCitation]) -> list[CitationRecord]:
917 return _resolve_multi_source_citations(
918 parsed, source_names, source_hashes, chunks_by_source
919 )
921 return _generate_page(
922 label=topic,
923 prompt=prompt,
924 chunks=all_chunks,
925 citation_resolver=resolver,
926 page_type=SYNTHESIS_SUBDIR,
927 slug=slug,
928 source_names=source_names,
929 provider=provider,
930 store=store,
931 config=config,
932 )
935def _generate_for_cluster(
936 label: str,
937 sources: frozenset[str],
938 provider: LLMProvider,
939 store: Store,
940 config: Config,
941) -> Path | None:
942 """Gather chunks for a cluster and generate a synthesis page."""
943 source_names = sorted(sources)
944 chunks_by_source: dict[str, list[SearchChunk]] = {}
945 for name in source_names:
946 chunks = store.get_chunks_by_source(name)
947 if chunks:
948 chunks_by_source[name] = chunks
950 if len(chunks_by_source) < MIN_CLUSTER_SOURCES:
951 return None
953 return _generate_synthesis_page(label, source_names, chunks_by_source, provider, store, config)
956def generate_synthesis_pages(
957 provider: LLMProvider,
958 store: Store,
959 clusterer: SourceClusterer,
960 config: Config | None = None,
961) -> list[Path]:
962 """Generate synthesis pages for source clusters spanning 3+ documents."""
963 if config is None:
964 config = cfg
966 clusters = clusterer.get_clusters(min_sources=MIN_CLUSTER_SOURCES)
967 if not clusters:
968 log.info("No source clusters span %d+ sources, skipping synthesis", MIN_CLUSTER_SOURCES)
969 return []
971 pages: list[Path] = []
972 for cluster in clusters:
973 page = _generate_for_cluster(cluster.label, cluster.sources, provider, store, config)
974 if page is not None:
975 pages.append(page)
977 log.info("Generated %d synthesis pages", len(pages))
978 return pages
981def _hash_existing_sources(source_names: list[str], documents_dir: Path) -> dict[str, str]:
982 """Hash each source file that still exists on disk (used for citation staleness)."""
983 out: dict[str, str] = {}
984 for name in source_names:
985 source_path = documents_dir / name
986 if source_path.exists():
987 out[name] = file_hash(source_path)
988 return out
991# Phase D: archive-migration sentinel and helpers. The sentinel lives
992# under data_dir (NOT inside wiki/) so Obsidian sync and wiki
993# tree-walkers never surface it.
994_PHASE_D_SENTINEL_NAME = ".phase-d-migrated"
996# Pre-Phase-D wiki concepts that we move to archive/ as part of the
997# one-time migration. Matches wiki/<CONCEPTS_SUBDIR>/*.md recursively.
998_ARCHIVE_CONCEPTS_SUBPATH = Path(ARCHIVE_SUBDIR) / CONCEPTS_SUBDIR
1001def _maybe_run_phase_d_migration(wiki_root: Path, data_dir: Path) -> None:
1002 """One-time migration: archive pre-Phase-D concept pages.
1004 Runs idempotently, gated by ``{data_dir}/.phase-d-migrated``:
1006 1. Move every ``wiki/concepts/*.md`` to ``wiki/archive/concepts/``
1007 preserving relative subpaths. Older concept pages stay
1008 readable but drop out of the active wiki browse surface.
1009 2. Unwrap stale ``[[archived-slug]]`` references across the
1010 remaining pages so a reader clicking a link does not hit a
1011 404. Archived slugs become plain text.
1012 3. Write the sentinel so future builds skip this path.
1014 D3's freshly LLM-curated concept pages written AFTER the sentinel
1015 exists are never touched.
1016 """
1017 sentinel = data_dir / _PHASE_D_SENTINEL_NAME
1018 if sentinel.exists():
1019 return
1020 concepts_dir = wiki_root / CONCEPTS_SUBDIR
1021 archive_dir = wiki_root / _ARCHIVE_CONCEPTS_SUBPATH
1022 archived_slugs: list[str] = []
1023 if concepts_dir.is_dir():
1024 for src in sorted(concepts_dir.rglob("*.md")):
1025 rel = src.relative_to(concepts_dir)
1026 dest = archive_dir / rel
1027 dest.parent.mkdir(parents=True, exist_ok=True)
1028 src.replace(dest)
1029 archived_slugs.append(str(rel.with_suffix("")).replace("\\", "/"))
1031 if archived_slugs:
1032 _unwrap_archived_links(wiki_root, archived_slugs)
1034 data_dir.mkdir(parents=True, exist_ok=True)
1035 sentinel.write_text(datetime.now(UTC).isoformat(), encoding="utf-8")
1036 if archived_slugs:
1037 log.info(
1038 "Phase D migration: archived %d concept pages, sentinel written at %s",
1039 len(archived_slugs),
1040 sentinel,
1041 )
1044def _unwrap_archived_links(wiki_root: Path, archived_slugs: list[str]) -> None:
1045 """Rewrite ``[[slug]]`` → ``slug`` (plain text) across remaining wiki pages.
1047 The existing ``_rewrite_links_across_wiki`` path is the wrong
1048 tool here: it compiles an *additive* surface map, not a
1049 removal pass. A single walk over the active content subdirs,
1050 applying one compiled pattern per archived slug, is acceptable because
1051 the archive count is bounded (concepts that existed pre-migration). Pages whose body
1052 did not change are not rewritten.
1053 """
1054 if not archived_slugs:
1055 return
1056 patterns = [(re.compile(r"\[\[" + re.escape(slug) + r"\]\]"), slug) for slug in archived_slugs]
1057 for subdir in WIKI_CONTENT_SUBDIRS:
1058 subdir_path = wiki_root / subdir
1059 if not subdir_path.is_dir():
1060 continue
1061 for md_path in subdir_path.rglob("*.md"):
1062 original = md_path.read_text(encoding="utf-8")
1063 rewritten = original
1064 for pattern, replacement in patterns:
1065 rewritten = pattern.sub(replacement, rewritten)
1066 if rewritten != original:
1067 md_path.write_text(rewritten, encoding="utf-8")
1070# Pending-marker conventions: the drafts listing surface
1071# (``lilbee.wiki.drafts``) scans for these prefixes to classify a
1072# draft as PARSE or COLLISION instead of a drift-routed regen. The
1073# keyword phrases live in ``wiki.shared`` so writer (gen) and reader
1074# (drafts) stay in sync on the exact wording.
1075_PENDING_PARSE_MARKER_PREFIX = f"<!-- {PENDING_MARKER_KEYWORD_PARSE}"
1076_PENDING_COLLISION_MARKER_PREFIX = f"<!-- {PENDING_MARKER_KEYWORD_COLLISION}"
1079def _write_pending_marker(
1080 drafts_dir: Path,
1081 slug: str,
1082 marker_line: str,
1083 frontmatter: str = "",
1084) -> Path:
1085 """Write a PENDING marker page under ``drafts/<slug>.md``.
1087 ``marker_line`` is the leading HTML comment that both identifies
1088 the marker kind and carries the context (source, label). The
1089 optional ``frontmatter`` preserves minimal metadata for the
1090 drafts surface to round-trip (e.g. ``bad_title``-style fields).
1091 """
1092 drafts_dir.mkdir(parents=True, exist_ok=True)
1093 draft_path = drafts_dir / f"{slug}.md"
1094 body = marker_line + "\n"
1095 if frontmatter:
1096 body += "\n" + frontmatter
1097 draft_path.write_text(body, encoding="utf-8")
1098 return draft_path
1101def _delete_pending_marker_if_present(drafts_dir: Path, slug: str) -> bool:
1102 """Delete an existing PENDING marker for *slug*; return whether one was removed.
1104 Match is slug-equality (not fuzzy): an LLM that rephrases a
1105 label on retry (``brake system`` → ``braking system``) leaves
1106 the old marker behind for the user to drain via ``wiki drafts
1107 reject``. Documented limitation; follow-up if the pattern
1108 matters.
1109 """
1110 draft_path = drafts_dir / f"{slug}.md"
1111 if not draft_path.is_file():
1112 return False
1113 try:
1114 body = draft_path.read_text(encoding="utf-8")
1115 except OSError:
1116 return False
1117 first_line = body.splitlines()[0] if body else ""
1118 is_pending = first_line.startswith(_PENDING_PARSE_MARKER_PREFIX) or first_line.startswith(
1119 _PENDING_COLLISION_MARKER_PREFIX
1120 )
1121 if not is_pending:
1122 return False
1123 draft_path.unlink()
1124 return True
1127def _group_entities_by_primary_source(
1128 entities: list[ExtractedEntity],
1129) -> dict[str, list[ExtractedEntity]]:
1130 """Group entities under the source that mentions them most.
1132 Primary source = source with the highest chunk-ref count;
1133 lexicographic tiebreak. An entity with no refs is dropped
1134 silently (defensive: extractor always attaches refs, but a
1135 future extractor might not).
1136 """
1137 grouped: dict[str, list[ExtractedEntity]] = {}
1138 for entity in entities:
1139 if not entity.chunk_refs:
1140 continue
1141 counts: dict[str, int] = {}
1142 for ref in entity.chunk_refs:
1143 counts[ref.source] = counts.get(ref.source, 0) + 1
1144 primary = min(counts.items(), key=lambda kv: (-kv[1], kv[0]))[0]
1145 grouped.setdefault(primary, []).append(entity)
1146 return grouped
1149# Regex that matches section headers the batch parser recognizes:
1150# H1 (``# Name``), H2 (``## Name``), or a bold-line heading
1151# (``**Name**``) at line start. The name capture is anchored to the
1152# rest of the line (stripped of trailing whitespace) so labels like
1153# ``## Brake System (hydraulic)`` still parse.
1154_SECTION_HEADER_RE = re.compile(
1155 r"^(?:(?:##?)\s+(?P<hashname>[^\n]+)|\*\*(?P<boldname>[^\*\n]+)\*\*)\s*$",
1156 re.MULTILINE,
1157)
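# Headers this pattern recognizes (hypothetical labels for illustration):
#   "# Brake System"              -> hashname = "Brake System"
#   "## Brake System (hydraulic)" -> hashname = "Brake System (hydraulic)"
#   "**Henry Ford**"              -> boldname = "Henry Ford"
# "### Subsection" (H3) and bold text appearing mid-line are not matched.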
1159# In-body ``[^keyN]`` footnote-marker pattern. Module-scope so the
1160# batched-generation hot path (`_finalize_section`) does not recompile
1161# it on every recovered section.
1162_FOOTNOTE_MARKER_RE = re.compile(r"\[\^([a-zA-Z0-9_\-]+)\]")
1165def _split_batched_output(
1166 text: str,
1167 expected_entity_labels: set[str],
1168 expected_concept_labels: set[str] | None = None,
1169) -> dict[str, tuple[EntityKind, str]]:
1170 """Best-effort parse of the batched LLM response into per-label bodies.
1172 Splits on H1/H2/bold-line headers, then matches each header
1173 against the expected entity and concept label sets via
1174 case-insensitive substring. Known labels are tagged with the
1175 right ``EntityKind``; unknown headers are dropped. Labels whose
1176 section could not be recovered at all are surfaced to the caller
1177 (they show up as *missing from the return dict* rather than a
1178 separate list — caller loops over the expected sets to write
1179 PENDING markers).
1180 """
1181 concepts = expected_concept_labels or set()
1182 recovered: dict[str, tuple[EntityKind, str]] = {}
1183 matches = list(_SECTION_HEADER_RE.finditer(text))
1184 if not matches:
1185 return recovered
1186 for i, match in enumerate(matches):
1187 name = match.group("hashname") or match.group("boldname") or ""
1188 name = name.strip()
1189 start = match.end()
1190 end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
1191 body = text[start:end].strip()
1192 if not body:
1193 continue
1194 lowered = name.lower()
1195 kind_label = _match_label(lowered, expected_entity_labels, EntityKind.ENTITY)
1196 if kind_label is None:
1197 kind_label = _match_label(lowered, concepts, EntityKind.CONCEPT)
1198 if kind_label is None:
1199 # Concept labels come from the LLM itself — tag any
1200 # unmatched section as CONCEPT only when the caller is
1201 # expecting concept curation; otherwise drop it as
1202 # noise.
1203 if expected_concept_labels is not None:
1204 recovered.setdefault(name, (EntityKind.CONCEPT, _prefix_heading(name, body)))
1205 continue
1206 kind, label = kind_label
1207 recovered[label] = (kind, _prefix_heading(name, body))
1208 return recovered
1211def _match_label(
1212 lowered_name: str,
1213 expected: set[str],
1214 kind: EntityKind,
1215) -> tuple[EntityKind, str] | None:
1216 """Case-insensitive substring match of *lowered_name* against *expected*.
1218 Returns ``(kind, original_label)`` on hit, ``None`` otherwise.
1219 A substring match (not equality) accommodates the LLM adding
1220 qualifiers ("Brake System (hydraulic)" vs "brake system").
1221 """
1222 for label in expected:
1223 low = label.lower()
1224 if low and (low in lowered_name or lowered_name in low):
1225 return (kind, label)
1226 return None
1229def _prefix_heading(name: str, body: str) -> str:
1230 """Ensure the extracted body starts with a ``# Name`` H1.
1232 The batched prompt instructs the model to emit ``## Name`` per
1233 section. After splitting, the per-section body has lost its
1234 header. Rebuild an H1 so the B3 title/body coherence gate still
1235 has a heading to match.
1236 """
1237 stripped = body.lstrip()
1238 if stripped.startswith("# "):
1239 return body
1240 return f"# {name}\n\n{body}"
1243def _chunks_for_source(chunks: list[SearchChunk], source: str) -> list[SearchChunk]:
1244 """Return the subset of *chunks* whose ``source`` matches, preserving order."""
1245 return [c for c in chunks if c.source == source]
1248def _build_batch_prompt(
1249 source: str,
1250 entities: list[ExtractedEntity],
1251 chunks_text: str,
1252 extract_concepts: bool,
1253 config: Config,
1254) -> str:
1255 """Render :attr:`Config.wiki_entity_batch_prompt` for one source call.
1257 ``extract_concepts`` controls whether the concept-curation
1258 paragraph is injected: True adds a "identify 3-5 concepts" block;
1259 False leaves ``{concept_instruction}`` empty so the LLM writes
1260 entity sections only. Keeps the per-source batched call the
1261 single entry point whether or not concepts are requested.
1262 """
1263 entity_labels = ", ".join(clean_label_for_display(e.label) for e in entities) or "(none)"
1264 if extract_concepts:
1265 concept_instruction = (
1266 "First, identify 3-5 CONCEPTS — abstract topics or domain terms "
1267 "from the source that deserve a standalone wiki page. Do NOT include "
1268 "pronouns, articles, or generic nouns.\n\n"
1269 "Then write a wiki section for each of the concepts you identified, "
1270 "PLUS one section for each NER ENTITY listed below.\n\n"
1271 )
1272 else:
1273 concept_instruction = ""
1274 return config.wiki_entity_batch_prompt.format(
1275 source=source,
1276 entity_list=entity_labels,
1277 chunks_text=chunks_text,
1278 concept_instruction=concept_instruction,
1279 )
1282def _short_source_hash(source: str) -> str:
1283 """8-char sha256 digest of *source* (stable collision-marker suffix)."""
1284 return hashlib.sha256(source.encode("utf-8")).hexdigest()[:8]
1287def _generate_source_batch(
1288 source: str,
1289 entities: list[ExtractedEntity],
1290 chunks: list[SearchChunk],
1291 provider: LLMProvider,
1292 store: Store,
1293 config: Config,
1294 *,
1295 extract_concepts: bool,
1296 written_concept_slugs: dict[str, str],
1297) -> list[Path]:
1298 """Issue one LLM call for *source* and finalize every recovered section.
1300 Returns the list of page paths written (entities + concepts
1301 combined). Labels not recovered by the parser become PENDING
1302 markers under ``wiki/drafts/`` so the next build can retry.
1303 Concept slugs already written by an earlier source produce a
1304 PENDING-COLLISION marker on the losing side (see
1305 :func:`_handle_concept_write`).
1307 ``written_concept_slugs`` is the per-build ledger of
1308 slug → first_source. Callers share one dict across the per-source
1309 loop. The second source to propose a slug is the one that gets
1310 diverted to a collision marker.
1311 """
1312 if not chunks:
1313 return []
1314 budgeted = _truncate_chunks_to_budget(chunks, config)
1315 chunks_text = _chunks_to_text(budgeted)
1316 prompt = _build_batch_prompt(source, entities, chunks_text, extract_concepts, config)
1317 messages = _build_wiki_messages(prompt, provider, config)
1318 options = config.generation_options(
1319 temperature=config.wiki_temperature,
1320 max_tokens=config.wiki_summary_max_tokens,
1321 )
1322 try:
1323 response = provider.chat(messages, stream=False, options=options)
1324 text = strip_reasoning(cast(str, response)).strip()
1325 except Exception as exc:
1326 log.warning("Batched LLM call failed for source %s: %s", source, exc)
1327 return []
1329 if not text:
1330 log.warning("Batched LLM call returned empty response for source %s", source)
1331 return []
1333 expected_entity_labels = {e.label for e in entities}
1334 expected_concepts: set[str] | None = set() if extract_concepts else None
1335 parsed = _split_batched_output(text, expected_entity_labels, expected_concepts)
1337 wiki_root = config.data_root / config.wiki_dir
1338 drafts_dir = wiki_root / DRAFTS_SUBDIR
1339 source_names = [source]
1340 source_hashes = _hash_existing_sources(source_names, config.documents_dir)
1341 chunks_by_source = {source: budgeted}
1343 # Citation definitions live in the trailing block of the WHOLE
1344 # response, not inside any one section body. Parse once over the
1345 # full text and replay the same list for every section, so each
1346 # page sees its own citations even when only the last section
1347 # carries the definition trailer.
1348 shared_parsed_citations = parse_wiki_citations(text)
1350 pages: list[Path] = []
1351 seen_labels: set[str] = set()
1352 for header_label, (kind, body) in parsed.items():
1353 seen_labels.add(header_label)
1354 resolver = functools.partial(
1355 _resolve_multi_source_citations,
1356 source_names=source_names,
1357 source_hashes=source_hashes,
1358 chunks_by_source=chunks_by_source,
1359 )
1360 page = _finalize_section(
1361 header_label=header_label,
1362 kind=kind,
1363 body=body,
1364 chunks=budgeted,
1365 citation_resolver=resolver,
1366 source_names=source_names,
1367 store=store,
1368 config=config,
1369 source=source,
1370 written_concept_slugs=written_concept_slugs,
1371 drafts_dir=drafts_dir,
1372 shared_parsed_citations=shared_parsed_citations,
1373 )
1374 if page is not None:
1375 pages.append(page)
1377 for entity in entities:
1378 if entity.label not in seen_labels:
1379 marker = (
1380 f"{_PENDING_PARSE_MARKER_PREFIX} for source {source}, "
1381 f"entity/concept {entity.label} - "
1382 "run wiki build again or manually accept via wiki drafts accept -->"
1383 )
1384 # Route through ``yaml.safe_dump`` so a label or source
1385 # containing a colon, quote, or newline does not produce a
1386 # frontmatter block that ``parse_frontmatter`` silently drops.
1387 frontmatter_body = yaml.safe_dump(
1388 {
1389 "pending_source": source,
1390 "pending_label": entity.label,
1391 "pending_kind": PENDING_KIND_PARSE,
1392 },
1393 sort_keys=False,
1394 )
1395 frontmatter = f"---\n{frontmatter_body}---\n"
1396 path = _write_pending_marker(drafts_dir, entity.slug, marker, frontmatter)
1397 log.info("Wrote PENDING-PARSE marker for %s -> %s", entity.slug, path)
1399 return pages
1402def _finalize_section(
1403 *,
1404 header_label: str,
1405 kind: EntityKind,
1406 body: str,
1407 chunks: list[SearchChunk],
1408 citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]],
1409 source_names: list[str],
1410 store: Store,
1411 config: Config,
1412 source: str,
1413 written_concept_slugs: dict[str, str],
1414 drafts_dir: Path,
1415 shared_parsed_citations: list[ParsedCitation],
1416) -> Path | None:
1417 """Citation-check, faithfulness-check, write one batched section.
1419 Shared by entity and concept sections from the per-source batched
1420 call. Returns the written page path, or ``None`` if the section
1421 failed any gate (no citations, empty body, slug collision marker
1422 handled via side channel). ``shared_parsed_citations`` is the
1423 definition list parsed once over the whole response — every
1424 section replays it so pages other than the last one still have
1425 their footnotes resolved.
1426 """
1427 slug = make_slug(header_label)
1428 if not slug:
1429 log.info("Empty slug for batched section %r; skipping", header_label)
1430 return None
1432 # Only replay citation keys that this section actually references
1433 # in the body; otherwise every section would claim every citation.
1434 section_keys = {ref.citation_key for ref in parse_wiki_citations(body)}
1435 # Fall back to in-body ``[^keyN]`` references when no definitions
1436 # live inside the section: collect the in-body footnote-marker keys
1437 # and match them against the shared definition set.
1438 section_keys.update(_FOOTNOTE_MARKER_RE.findall(body))
1439 relevant = [c for c in shared_parsed_citations if c.citation_key in section_keys]
1440 verified = _verify_citations(citation_resolver(relevant), chunks, header_label, config)
1441 if not verified:
1442 log.info("No valid citations for batched section %s, skipping", header_label)
1443 return None
1445 score = _check_faithfulness(chunks, body, header_label, config)
1446 threshold = config.wiki_embedding_faithfulness_threshold
1447 page_type = CONCEPTS_SUBDIR if kind is EntityKind.CONCEPT else ENTITIES_SUBDIR
1448 subdir = page_type if score >= threshold else DRAFTS_SUBDIR
1449 if subdir == DRAFTS_SUBDIR:
1450 log.info(
1451 "Batched section %s scored %.2f (< %.2f), sending to drafts",
1452 header_label,
1453 score,
1454 threshold,
1455 )
1457 clean_body = strip_citation_block(body)
1458 frontmatter = _build_frontmatter(config, source_names, score, chunks=chunks)
1459 citation_block = render_citation_block(verified)
1460 full_content = _assemble_content(frontmatter, clean_body, citation_block)
1462 # Concept collision: the second source proposing a slug loses
1463 # and writes to a drafts collision marker; the winning source's
1464 # page stays untouched.
1465 if kind is EntityKind.CONCEPT and subdir == CONCEPTS_SUBDIR:
1466 first_source = written_concept_slugs.get(slug)
1467 if first_source is not None and first_source != source:
1468 return _divert_concept_collision(
1469 slug=slug,
1470 source=source,
1471 first_source=first_source,
1472 content=full_content,
1473 drafts_dir=drafts_dir,
1474 )
1475 written_concept_slugs.setdefault(slug, source)
1477 # Successful regen of a previously-PENDING slug: remove the old
1478 # marker so the drafts surface no longer lists it.
1479 _delete_pending_marker_if_present(drafts_dir, slug)
1481 wiki_root = config.data_root / config.wiki_dir
1482 target = PageTarget(
1483 wiki_root=wiki_root,
1484 subdir=subdir,
1485 slug=slug,
1486 wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
1487 page_type=page_type,
1488 label=header_label,
1489 )
1490 page_path = _persist_and_finalize(full_content, target, verified, source_names, store, config)
1491 log.info(
1492 "Generated batched page for %s -> %s (score=%.2f, citations=%d)",
1493 header_label,
1494 target.subdir,
1495 score,
1496 len(verified),
1497 )
1498 return page_path
1501def _divert_concept_collision(
1502 *,
1503 slug: str,
1504 source: str,
1505 first_source: str,
1506 content: str,
1507 drafts_dir: Path,
1508) -> Path:
1509 """Write the losing concept to ``drafts/<slug>-collision-<hash>.md``.
1511 The winning source's page is unchanged on disk. Hash is the
1512 first 8 hex of sha256(source_filename); stable per source so a
1513 retry on the same two sources lands at the same draft path,
1514 letting the user iterate without marker sprawl.
1515 """
1516 short = _short_source_hash(source)
1517 collision_slug = f"{slug}-collision-{short}"
1518 marker = (
1519 f"{_PENDING_COLLISION_MARKER_PREFIX} with source {first_source}, "
1520 f"content from {source} held for review -->\n\n"
1521 )
1522 drafts_dir.mkdir(parents=True, exist_ok=True)
1523 path = drafts_dir / f"{collision_slug}.md"
1524 path.write_text(marker + content, encoding="utf-8")
1525 log.warning(
1526 "Concept slug collision: %s already written by %s; diverted %s's version to %s",
1527 slug,
1528 first_source,
1529 source,
1530 path,
1531 )
1532 return path
1535def build_wiki(
1536 entities: list[ExtractedEntity],
1537 provider: LLMProvider,
1538 store: Store,
1539 config: Config | None = None,
1540 *,
1541 extract_concepts: bool = True,
1542) -> list[Path]:
1543 """Produce entity and LLM-curated concept pages per source.
1545 Phase D replaces the per-entity / per-concept fan-out with a
1546 per-source batched call: for each source in ``entities``' chunk
1547 refs, one LLM call identifies 3-5 concepts AND writes a wiki
1548 section for every pre-extracted entity belonging to that source.
1549 Output sections are split, citation-verified, embedding-scored,
1550 and landed under ``wiki/entities/`` or ``wiki/concepts/``
1551 depending on kind.
1553 ``extract_concepts=False`` (used by the incremental-ingest hook)
1554 drops the concept-curation paragraph from the prompt so a
1555 touched source does not churn concept slugs.
1557 A one-time archive migration runs first (idempotently, gated by
1558 ``{data_dir}/.phase-d-migrated``), moving pre-Phase-D concept
1559 pages under ``wiki/archive/concepts/`` and unwrapping stale
1560 ``[[archived-slug]]`` links across the remaining pages.
1561 """
1562 if config is None:
1563 config = cfg
1564 wiki_root = config.data_root / config.wiki_dir
1565 _maybe_run_phase_d_migration(wiki_root, config.data_dir)
1567 grouped = _group_entities_by_primary_source(entities)
1568 all_sources = _all_sources_in_scope(entities, grouped, store, config, extract_concepts)
1569 written_concept_slugs: dict[str, str] = {}
1570 pages: list[Path] = []
1572 for source in sorted(all_sources):
1573 source_entities = grouped.get(source, [])
1574 chunks = store.get_chunks_by_source(source)
1575 chunk_count = len(chunks)
1576 source_extract = extract_concepts and chunk_count >= config.wiki_batch_min_chunks
1577 if not source_entities and not source_extract:
1578 log.info(
1579 "Skipping source %s: %d entities, %d chunks, min=%d, extract=%s",
1580 source,
1581 len(source_entities),
1582 chunk_count,
1583 config.wiki_batch_min_chunks,
1584 source_extract,
1585 )
1586 continue
1587 source_pages = _generate_source_batch(
1588 source=source,
1589 entities=source_entities,
1590 chunks=chunks,
1591 provider=provider,
1592 store=store,
1593 config=config,
1594 extract_concepts=source_extract,
1595 written_concept_slugs=written_concept_slugs,
1596 )
1597 pages.extend(source_pages)
1599 _rewrite_links_across_wiki(entities, config)
1600 log.info("Generated %d batched wiki pages", len(pages))
1601 return pages
1604def _all_sources_in_scope(
1605 entities: list[ExtractedEntity],
1606 grouped: dict[str, list[ExtractedEntity]],
1607 store: Store,
1608 config: Config,
1609 extract_concepts: bool,
1610) -> set[str]:
1611 """Union of entity-bearing sources and (when enabled) sources eligible for concept curation.
1613 Seed the union with every entity's primary source. When
1614 ``extract_concepts`` is True, also add any store source whose
1615 chunk count meets the ``wiki_batch_min_chunks`` floor.
1616 This gives concept-only sources (no extracted entities) their
1617 chance at curation while keeping zero-entity short sources
1618 skipped entirely.
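
Illustrative outcome (filenames, chunk counts, and the
``wiki_batch_min_chunks = 12`` floor are hypothetical)::

    grouped = {"ford.md": [...]}  # only ford.md has extracted entities
    # store also holds assembly.md (40 chunks) and note.md (2 chunks)
    _all_sources_in_scope(entities, grouped, store, config, True)
    # -> {"ford.md", "assembly.md"}; note.md stays below the floor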
1619 """
1620 sources: set[str] = set(grouped)
1621 if not extract_concepts:
1622 return sources
1623 try:
1624 records = store.get_sources()
1625 except Exception as exc:
1626 log.warning("get_sources failed; sticking to entity-grouped sources: %s", exc)
1627 return sources
1628 for record in records:
1629 name = record.get("filename", "") if isinstance(record, dict) else ""
1630 if not name:
1631 continue
1632 if name in sources:
1633 continue
1634 chunk_count = record.get("chunk_count", 0) if isinstance(record, dict) else 0
1635 if chunk_count >= config.wiki_batch_min_chunks:
1636 sources.add(name)
1637 _ = entities  # unused here; parameter kept so the call site reads clearly, and this silences unused-argument linters
1638 return sources
1641def _entity_surface_map(entities: list[ExtractedEntity]) -> dict[str, str]:
1642 """Build the surface-form -> slug map for the ``[[link]]`` rewriter.
1644 Includes both the entity's human label (e.g. *"Henry Ford"*) and
1645 the slug-with-hyphens-as-spaces variant (*"henry ford"*) so the
1646 rewriter catches either form in body text.
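
Illustrative mapping (labels and slugs hypothetical)::

    {"Henry Ford": "henry-ford", "henry ford": "henry-ford"}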
1647 """
1648 mapping: dict[str, str] = {}
1649 for entity in entities:
1650 mapping[entity.label] = entity.slug
1651 spaced = entity.slug.replace("-", " ")
1652 if spaced and spaced != entity.label:
1653 mapping[spaced] = entity.slug
1654 return mapping
1657_ENTITY_LIKE_SUBDIRS: tuple[str, ...] = (CONCEPTS_SUBDIR, ENTITIES_SUBDIR)
1660def _augment_surface_map_with_existing_pages(
1661 surface_to_slug: dict[str, str], wiki_root: Path
1662) -> None:
1663 """Add slugs for pages already on disk so an incremental rebuild of
1664 one concept still links to its unchanged neighbors. Mutates
1665 ``surface_to_slug`` in place. Only enriches the map with the
1666 hyphen-to-space surface form because frontmatter labels aren't
1667 read here; body prose typically uses the spaced form so this
1668 covers the common case.
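
Illustrative effect (path hypothetical): an existing
``concepts/assembly-line.md`` contributes::

    surface_to_slug.setdefault("assembly line", "assembly-line")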
1669 """
1670 for subdir in _ENTITY_LIKE_SUBDIRS:
1671 subdir_path = wiki_root / subdir
1672 if not subdir_path.is_dir():
1673 continue
1674 for md_path in subdir_path.rglob("*.md"):
1675 slug = md_path.stem
1676 spaced = slug.replace("-", " ")
1677 surface_to_slug.setdefault(spaced, slug)
1680def _rewrite_links_across_wiki(entities: list[ExtractedEntity], config: Config) -> None:
1681 """Rewrite ``[[slug]]`` links on every page under ``wiki/`` content subdirs.
1683 A page never receives a link to itself: the rewriter takes the
1684 owning slug and drops it inside its match callback, so the
1685 surface map is shared unmodified across every page in the walk
1686 (no O(M) dict rebuild per file). The map is augmented with
1687 slugs from the existing on-disk corpus so a touched page still
1688 links to untouched neighbors. The alternation regex + lookup are
1689 compiled once per build and reused across pages.
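
Illustrative per-page pass (slug and ``page_text`` hypothetical;
mirrors the loop below)::

    rewriter = compile_rewriter({"assembly line": "assembly-line"})
    rewritten = apply_rewriter(page_text, rewriter, skip_slug="assembly-line")
    # the owning page keeps its own surface form plain; every other
    # page gets the ``[[assembly-line]]`` wiki link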
1690 """
1691 surface_to_slug = _entity_surface_map(entities)
1692 wiki_root = config.data_root / config.wiki_dir
1693 _augment_surface_map_with_existing_pages(surface_to_slug, wiki_root)
1694 rewriter = compile_rewriter(surface_to_slug)
1695 if rewriter is None:
1696 return
1698 for subdir in WIKI_CONTENT_SUBDIRS:
1699 subdir_path = wiki_root / subdir
1700 if not subdir_path.is_dir():
1701 continue
1702 is_entity_subdir = subdir in _ENTITY_LIKE_SUBDIRS
1703 for md_path in subdir_path.rglob("*.md"):
1704 owning_slug = md_path.stem if is_entity_subdir else None
1705 original = md_path.read_text(encoding="utf-8")
1706 rewritten = apply_rewriter(original, rewriter, skip_slug=owning_slug)
1707 if rewritten != original:
1708 md_path.write_text(rewritten, encoding="utf-8")
1711class WikiBuildSummary(TypedDict):
1712 """Result of a full wiki build/update."""
1714 paths: list[str]
1715 entities: int
1716 count: int
1719def run_full_build(config: Config | None = None) -> WikiBuildSummary:
1720 """Extract entities + build wiki across every ingested source.
1722 Shared entry point for CLI ``wiki build`` / ``wiki update``, MCP
1723 ``wiki_build`` / ``wiki_update``, and ``POST /api/wiki/build`` /
1724 ``PATCH /api/wiki/update``.
1726 Side effects (in order):
1727 1. Reads every source via ``store.get_sources()``.
1728 2. Reads chunks for each source via ``store.get_chunks_by_source``.
1729 3. Calls the entity extractor (may invoke the LLM provider).
1730 4. Calls :func:`build_wiki` which writes wiki page files.
1731 5. Calls :func:`update_wiki_index` which rewrites ``wiki/index.md``.
1732 6. Calls :func:`append_wiki_log` which appends a build entry.
1734 Concurrency:
1735 Not safe to run concurrently with itself or with another wiki
1736 write path (drafts accept/reject, prune). Callers that share an
1737 event loop or process must serialize via an external lock — the
1738 REST routes do this with a per-process ``asyncio.Lock``; MCP and
1739 CLI run in their own processes and don't need one.
1741 Running concurrently with ``/api/sync`` (an ingest write path
1742 rather than a wiki write path) is permitted but not coherent: a
1743 sync that lands between this function's source-scan and per-source
1744 chunk-fetch may produce a wiki that's missing pages for sources
1745 ingested mid-build. The result is incomplete, not corrupt, and
1746 is repaired by re-running ``run_full_build`` after the sync
1747 finishes.
1749 A crash mid-build leaves a partial wiki on disk; the next successful
1750 build is idempotent and re-emits any pages it would have written, so
1751 recovery is "run it again."
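
Illustrative call (keys are those of :class:`WikiBuildSummary`)::

    summary = run_full_build()
    log.info("%d pages from %d entities", summary["count"], summary["entities"])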
1752 """
1753 if config is None:
1754 config = cfg
1755 from lilbee.wiki.entity_extractor import get_entity_extractor
1756 from lilbee.wiki.shared import WIKI_LOG_ACTION_BUILD
1758 svc = get_services()
1759 chunks: list[SearchChunk] = []
1760 for record in svc.store.get_sources():
1761 chunks.extend(svc.store.get_chunks_by_source(record["filename"]))
1763 extractor = get_entity_extractor(config.wiki_entity_mode, svc.provider, config)
1764 entities = extractor.extract(chunks)
1765 pages = build_wiki(
1766 entities,
1767 svc.provider,
1768 svc.store,
1769 config,
1770 extract_concepts=config.wiki_extract_concepts,
1771 )
1772 update_wiki_index()
1773 append_wiki_log(WIKI_LOG_ACTION_BUILD, f"{len(pages)} pages from {len(entities)} records")
1774 return {
1775 "paths": [str(p) for p in pages],
1776 "entities": len(entities),
1777 "count": len(pages),
1778 }
1781class WikiSynthesizeSummary(TypedDict):
1782 """Result of running synthesis-page generation."""
1784 paths: list[str]
1785 count: int
1788def run_full_synthesize(config: Config | None = None) -> WikiSynthesizeSummary:
1789 """Generate synthesis pages for cross-source clusters of 3+ documents.
1791 Shared entry point for MCP ``wiki_synthesize`` and ``POST
1792 /api/wiki/synthesize``. Concurrency contract matches
1793 :func:`run_full_build`: not safe to run in parallel with itself or
1794 with other wiki write paths; callers serialize via an external lock
1795 on shared event loops.
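
Illustrative call (keys are those of :class:`WikiSynthesizeSummary`)::

    result = run_full_synthesize()
    log.info("%d synthesis pages: %s", result["count"], result["paths"])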
1796 """
1797 if config is None:
1798 config = cfg
1799 svc = get_services()
1800 paths = generate_synthesis_pages(svc.provider, svc.store, svc.clusterer, config)
1801 return {
1802 "paths": [str(p) for p in paths],
1803 "count": len(paths),
1804 }