Coverage for src/lilbee/wiki/gen.py: 100% (706 statements)
coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Wiki page generation — LLM-driven synthesis with citation provenance. 

2 

3Generates summary pages (1:1 with sources) and synthesis pages (cross-source, 

4concept-graph-driven) from raw chunks. Each page carries inline citations 

5([^srcN]) for facts and [*inference*] markers for LLM synthesis. The 

6_citations table is the source of truth; markdown footnotes are rendered from it. 

7""" 

8 

9from __future__ import annotations 

10 

11import difflib 

12import functools 

13import hashlib 

14import logging 

15import re 

16from collections.abc import Callable 

17from datetime import UTC, datetime 

18from pathlib import Path 

19from typing import TypedDict, cast 

20 

21import numpy as np 

22import yaml 

23 

24from lilbee.chunk import chunk_text 

25from lilbee.clustering import SourceClusterer 

26from lilbee.config import CHUNKS_TABLE, DEFAULT_NUM_CTX, Config, cfg 

27from lilbee.ingest import file_hash 

28from lilbee.providers.base import LLMProvider 

29from lilbee.reasoning import strip_reasoning 

30from lilbee.services import get_services 

31from lilbee.store import ( 

32 CHUNK_TYPE_WIKI, 

33 CitationRecord, 

34 SearchChunk, 

35 Store, 

36 escape_sql_string, 

37) 

38from lilbee.wiki.citation import ( 

39 ParsedCitation, 

40 extract_body, 

41 parse_wiki_citations, 

42 render_citation_block, 

43 strip_citation_block, 

44) 

45from lilbee.wiki.entity_extractor import EntityKind, ExtractedEntity 

46from lilbee.wiki.index import append_wiki_log, update_wiki_index 

47from lilbee.wiki.links import apply_rewriter, compile_rewriter 

48from lilbee.wiki.shared import ( 

49 ARCHIVE_SUBDIR, 

50 CONCEPTS_SUBDIR, 

51 DRAFTS_SUBDIR, 

52 ENTITIES_SUBDIR, 

53 MIN_CLUSTER_SOURCES, 

54 PENDING_KIND_PARSE, 

55 PENDING_MARKER_KEYWORD_COLLISION, 

56 PENDING_MARKER_KEYWORD_PARSE, 

57 SUMMARIES_SUBDIR, 

58 SYNTHESIS_SUBDIR, 

59 WIKI_CONTENT_SUBDIRS, 

60 WIKI_LOG_ACTION_GENERATED, 

61 PageTarget, 

62 clean_label_for_display, 

63 is_valid_label, 

64 make_slug, 

65 parse_frontmatter, 

66) 

67 

68log = logging.getLogger(__name__) 

69 

70WikiProgressCallback = Callable[[str, dict[str, object]], None] 

71"""Callback for wiki generation progress: (stage, data) -> None.""" 

72 

73_MAX_DIFF_PREVIEW_LINES = 20 # lines of unified diff shown in drift warnings 

74 

75 

76# Fraction of context window reserved for chunks. The remainder leaves 

77# room for the system/user prompt template and generation output. 

78_CONTEXT_BUDGET_FRACTION = 0.75 

79 

80# Approximate characters per token for budget estimation. 4 chars/token 

81# is a widely used heuristic for English text. 

82_CHARS_PER_TOKEN = 4 

83 

84# Directive recognized by chat templates that support a reasoning mode 

85# (Qwen3, DeepSeek-R1, etc.). Wiki generation is a summarization task 

86# where chain-of-thought adds wall-clock cost without improving output, 

87# so we suppress it whenever the provider reports the capability. 

88_NO_THINK_DIRECTIVE = "/no_think" 

89 

90# Capability string returned by llama-cpp providers for reasoning models 

91# (Qwen3, DeepSeek-R1). Defined locally so gen.py doesn't depend on a 

92# specific provider-layer constant name. 

93_CAPABILITY_THINKING = "thinking" 

94 

95# JSON-style escape sequences that may appear inside quoted excerpts the 

96# model emits. Any backslash-prefixed character not in this map stays 

97# verbatim (e.g. ``\\x`` passes through unchanged). 

98_EXCERPT_ESCAPES: dict[str, str] = {"n": "\n", "t": "\t", '"': '"', "\\": "\\"} 

99 

100 

101def _build_wiki_messages( 

102 prompt: str, provider: LLMProvider, config: Config 

103) -> list[dict[str, str]]: 

104 """Build the chat messages list for a wiki-gen call. 

105 

106 When the provider reports the ``thinking`` capability for the active 

107 chat model, prepends ``/no_think`` so the chat template disables the 

108 reasoning mode. Otherwise the prompt passes through unchanged. 

109 """ 

110 capabilities = provider.get_capabilities(config.chat_model) 

111 if _CAPABILITY_THINKING in capabilities: 

112 prompt = f"{_NO_THINK_DIRECTIVE}\n\n{prompt}" 

113 return [{"role": "user", "content": prompt}] 

114 

115 
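# Illustrative only (the prompt text here is assumed, not from a real run):
# for a provider that reports the "thinking" capability,
# _build_wiki_messages("Summarize X", provider, config) yields
#   [{"role": "user", "content": "/no_think\n\nSummarize X"}]
# while a non-thinking model sees the prompt unchanged.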

def _truncate_chunks_to_budget(
    chunks: list[SearchChunk],
    config: Config,
) -> list[SearchChunk]:
    """Drop trailing chunks so the total text fits within the model's context budget.

    Uses a chars/4 heuristic for token estimation. Returns the original list
    unchanged when all chunks fit.
    """
    context_window = config.num_ctx or DEFAULT_NUM_CTX
    budget_tokens = int(context_window * _CONTEXT_BUDGET_FRACTION)
    budget_chars = budget_tokens * _CHARS_PER_TOKEN

    total_chars = 0
    kept: list[SearchChunk] = []
    for chunk in chunks:
        chunk_chars = len(chunk.chunk)
        if total_chars + chunk_chars > budget_chars and kept:
            break
        kept.append(chunk)
        total_chars += chunk_chars

    if len(kept) < len(chunks):
        log.warning(
            "Truncated chunks from %d to %d to fit context window (%d tokens)",
            len(chunks),
            len(kept),
            context_window,
        )
    return kept

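# Worked example (illustrative; assumes a context window of 8192 tokens):
# budget_tokens = int(8192 * 0.75) = 6144, budget_chars = 6144 * 4 = 24576,
# so chunks are kept until adding the next one would cross ~24.5k characters.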

def _group_chunks_by_page(
    chunks: list[SearchChunk],
) -> list[tuple[int, list[SearchChunk]]]:
    """Group chunks by ``page_start``, preserving in-document order within a page.

    Returns ``(page_start, chunks)`` tuples sorted ascending by page number.
    Chunks with ``page_start=0`` (non-paginated sources) collapse to a single
    entry keyed at 0, so a markdown or code source still emits exactly one
    summary file until structure detection arrives in a later stage.
    """
    grouped: dict[int, list[SearchChunk]] = {}
    for chunk in chunks:
        grouped.setdefault(chunk.page_start, []).append(chunk)
    return sorted(grouped.items())

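# Illustrative: chunks whose page_start values arrive as [2, 1, 2, 0] come
# back as [(0, [...]), (1, [...]), (2, [...])], with each page's chunk list
# preserving the input order.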

def _leaf_hash(chunks: list[SearchChunk]) -> str:
    """SHA-256 over concatenated chunk content (null-separated, in given order).

    Acts as the cache key for incremental rebuild: an existing page whose
    frontmatter ``leaf_hash`` matches this value has already summarized the
    exact same input and can be reused without a new LLM call.
    """
    h = hashlib.sha256()
    for chunk in chunks:
        h.update(chunk.chunk.encode("utf-8"))
        h.update(b"\0")
    return h.hexdigest()


def _find_cached_leaf(wiki_root: Path, slug: str, leaf_hash: str) -> Path | None:
    """Return an existing page whose ``leaf_hash`` frontmatter matches, or ``None``.

    Checks both ``summaries/`` and ``drafts/`` so an unchanged draft stays in
    drafts rather than triggering a speculative regeneration.
    """
    for subdir in (SUMMARIES_SUBDIR, DRAFTS_SUBDIR):
        candidate = wiki_root / subdir / f"{slug}.md"
        if not candidate.is_file():
            continue
        fm = parse_frontmatter(candidate.read_text(encoding="utf-8"))
        if fm.get("leaf_hash") == leaf_hash:
            return candidate
    return None


def _chunks_to_text(chunks: list[SearchChunk]) -> str:
    """Format chunks as numbered text blocks for the LLM prompt."""
    parts: list[str] = []
    for i, chunk in enumerate(chunks):
        location = ""
        if chunk.page_start:
            location = f" (page {chunk.page_start})"
        elif chunk.line_start:
            location = f" (lines {chunk.line_start}-{chunk.line_end})"
        parts.append(f"[Chunk {i + 1}]{location}:\n{chunk.chunk}")
    return "\n\n".join(parts)


def _extract_excerpt(source_ref: str) -> str:
    """Extract the quoted excerpt from a citation source_ref string.

    e.g. 'doc.md, excerpt: "Python supports typing."' → 'Python supports typing.'

    Common JSON-style escape sequences inside the quoted span (``\\n``,
    ``\\t``, ``\\"``, ``\\\\``) are decoded to their literal characters so
    they round-trip against the source text. Some models "helpfully"
    encode real newlines as ``\\n`` when emitting a quoted excerpt; the
    source chunk they came from has real newlines, so skipping this
    step leaves otherwise-faithful citations unverifiable.
    """
    marker = 'excerpt: "'
    idx = source_ref.find(marker)
    if idx == -1:
        return ""
    start = idx + len(marker)
    end = source_ref.find('"', start)
    raw = source_ref[start:].strip() if end == -1 else source_ref[start:end].strip()
    return _decode_excerpt_escapes(raw)


def _decode_excerpt_escapes(raw: str) -> str:
    """Decode the JSON-style escapes models commonly emit inside quoted strings."""
    if "\\" not in raw:
        return raw
    result: list[str] = []
    i = 0
    while i < len(raw):
        ch = raw[i]
        mapped = _EXCERPT_ESCAPES.get(raw[i + 1]) if ch == "\\" and i + 1 < len(raw) else None
        if mapped is not None:
            result.append(mapped)
            i += 2
        else:
            result.append(ch)
            i += 1
    return "".join(result)

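# Illustrative round-trip: a model-emitted ref such as
#   doc.md, excerpt: "first line\nsecond line"
# where "\n" is the two-character escape sequence, decodes to an excerpt
# containing a real newline, so it can be found verbatim in the source
# chunk text.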

def _find_excerpt_location(
    excerpt: str,
    chunks: list[SearchChunk],
) -> tuple[int, int, int, int]:
    """Find page/line location of an excerpt within chunks."""
    if excerpt:
        for chunk in chunks:
            if excerpt in chunk.chunk:
                return chunk.page_start, chunk.page_end, chunk.line_start, chunk.line_end
    return 0, 0, 0, 0


def _build_citation_record(
    citation_key: str,
    excerpt: str,
    source_filename: str,
    source_hash: str,
    page_start: int,
    page_end: int,
    line_start: int,
    line_end: int,
    created_at: str,
) -> CitationRecord:
    """Build a single CitationRecord with consistent defaults."""
    return CitationRecord(
        wiki_source="",  # filled by caller
        wiki_chunk_index=0,
        citation_key=citation_key,
        claim_type="fact" if excerpt else "inference",
        source_filename=source_filename,
        source_hash=source_hash,
        page_start=page_start,
        page_end=page_end,
        line_start=line_start,
        line_end=line_end,
        excerpt=excerpt,
        created_at=created_at,
    )


def _resolve_citations(
    parsed_citations: list[ParsedCitation],
    source_name: str,
    source_hash: str,
    chunks: list[SearchChunk],
) -> list[CitationRecord]:
    """Resolve parsed citation refs to CitationRecord objects.

    Searches for each citation's excerpt in the source chunks to find
    the best matching location (page/line numbers).
    """
    records: list[CitationRecord] = []
    now = datetime.now(UTC).isoformat()

    for parsed in parsed_citations:
        excerpt = _extract_excerpt(parsed.source_ref)
        page_start, page_end, line_start, line_end = _find_excerpt_location(excerpt, chunks)
        records.append(
            _build_citation_record(
                parsed.citation_key,
                excerpt,
                source_name,
                source_hash,
                page_start,
                page_end,
                line_start,
                line_end,
                now,
            )
        )
    return records


def _content_change_ratio(old_text: str, new_text: str) -> float:
    """Fraction of lines that changed between two texts (0.0 = identical, 1.0 = total rewrite)."""
    old_lines = old_text.splitlines()
    new_lines = new_text.splitlines()
    if not old_lines and not new_lines:
        return 0.0
    total = max(len(old_lines), len(new_lines))
    matcher = difflib.SequenceMatcher(None, old_lines, new_lines)
    changed = total - sum(block.size for block in matcher.get_matching_blocks())
    return changed / total

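# Worked example (illustrative): old = "a\nb\nc" vs. new = "a\nb\nd" gives
# total = 3 and 2 matching lines, so the ratio is (3 - 2) / 3 ≈ 0.33.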

def _diff_summary(old_text: str, new_text: str) -> str:
    """Human-readable unified diff summary (first 20 diff lines)."""
    diff = difflib.unified_diff(
        old_text.splitlines(),
        new_text.splitlines(),
        lineterm="",
        fromfile="old",
        tofile="new",
    )
    lines = list(diff)
    if len(lines) > _MAX_DIFF_PREVIEW_LINES:
        extra = len(lines) - _MAX_DIFF_PREVIEW_LINES
        return "\n".join(lines[:_MAX_DIFF_PREVIEW_LINES]) + f"\n... ({extra} more lines)"
    return "\n".join(lines)


def _divert_to_drafts(
    new_content: str,
    drafts_dir: Path,
    slug: str,
    change_ratio: float,
    diff_text: str,
) -> Path:
    """Write new content to wiki/drafts/ with a drift note instead of overwriting."""
    draft_path = drafts_dir / f"{slug}.md"
    draft_path.parent.mkdir(parents=True, exist_ok=True)
    note = f"<!-- DRIFT: {change_ratio:.0%} content changed - flagged for human review -->\n\n"
    draft_path.write_text(note + new_content, encoding="utf-8")
    log.warning(
        "Drift detected for %s (%.0f%% changed), diverted to drafts. Diff:\n%s",
        slug,
        change_ratio * 100,
        diff_text,
    )
    return draft_path


_WHITESPACE_RE = re.compile(r"\s+")


def _normalize_whitespace(text: str) -> str:
    """Collapse runs of whitespace to a single space and strip the edges.

    PDF extractors preserve line breaks mid-sentence (``vehicle,\\nthe greater``)
    while LLMs paraphrase the same quote as a single-spaced string
    (``vehicle, the greater``). A strict substring check rejects a faithful
    citation on whitespace alone, so both sides are normalized before
    comparison.
    """
    return _WHITESPACE_RE.sub(" ", text).strip()

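# Illustrative: _normalize_whitespace("vehicle,\nthe  greater") and
# _normalize_whitespace("vehicle, the greater") both yield
# "vehicle, the greater", so the substring check in _verify_citations
# compares like with like.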

def _verify_citations(
    citation_records: list[CitationRecord],
    chunks: list[SearchChunk],
    label: str,
    config: Config,
) -> list[CitationRecord]:
    """Filter citation records, keeping only those whose excerpts are in the chunks."""
    wiki_prefix = config.wiki_dir + "/"
    all_chunk_text = _normalize_whitespace(" ".join(c.chunk for c in chunks))
    verified: list[CitationRecord] = []
    for rec in citation_records:
        if rec["source_filename"].startswith(wiki_prefix):
            log.debug("Skipping wiki-sourced citation %s", rec["citation_key"])
            continue
        if rec["claim_type"] == "inference" or not rec["excerpt"]:
            verified.append(rec)
            continue
        if _normalize_whitespace(rec["excerpt"]) in all_chunk_text:
            verified.append(rec)
        else:
            log.debug("Citation %s excerpt not found in %s, dropping", rec["citation_key"], label)
    return verified


def _title_content_coherence(wiki_text: str, label: str) -> bool:
    """Deterministic pre-check: title and body must reference the concept.

    The LLM faithfulness score evaluates whether the prose reflects
    the source chunks but does not penalize structural noise in the
    title (bb-8b7s: ``| | designer`` passed at 0.90 because the body
    was coherent). This pre-check asserts three invariants:

    1. The first ``# `` heading must be a sanity-valid label per
       :func:`is_valid_label`. A heading like ``| | designer`` fails
       the structural-char gate even though it contains the cleaned
       display name as a substring.
    2. The cleaned display name must appear in the heading as a
       case-insensitive substring. Covers LLM drift where the
       heading names a different concept than requested.
    3. The body must mention the display name at least once outside
       the heading. Covers the "LLM talked about something adjacent
       but never named the concept" regression.

    Returns True when all three hold, False otherwise.
    """
    display = clean_label_for_display(label).lower()
    if not display:
        return False
    heading: str | None = None
    body_parts: list[str] = []
    for line in wiki_text.splitlines():
        if heading is None and line.startswith("# "):
            heading = line[2:].strip()
            continue
        body_parts.append(line)
    if heading is None:
        return False
    if not is_valid_label(heading):
        return False
    if display not in heading.lower():
        return False
    body = "\n".join(body_parts).lower()
    return display in body


def _mean_vector(vectors: list[list[float]]) -> list[float]:
    """Compute the element-wise mean of a non-empty vector list.

    Empty input returns an empty list; callers must check before any
    downstream dot-product so we do not leak a shape mismatch.

    Routes through numpy so the inner loop runs in C: for the typical
    ``D=768``, ``N=10`` case this cuts per-call cost from ~8k Python
    ops to a single SIMD-backed reduction.
    """
    if not vectors:
        return []
    result: list[float] = np.asarray(vectors, dtype=np.float32).mean(axis=0).tolist()
    return result

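# Illustrative: _mean_vector([[1.0, 0.0], [0.0, 1.0]]) == [0.5, 0.5];
# _mean_vector([]) == [] (callers must check before taking a dot product).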

def _embedding_faithfulness_score(
    body_vec: list[float],
    source_vectors: list[list[float]],
) -> float:
    """Cosine-similarity score between the body and the mean source vector.

    Assumes L2-normalized vectors (both the embedder and the store
    return normalized vectors); cosine reduces to a dot product.
    Falls through to :func:`cosine_sim` so a non-normalized vector
    does not silently produce an out-of-range value. Result is
    clamped at zero because a negative cosine means the body vector
    points the other way from the mean of the sources — treat that
    the same as uncorrelated for threshold purposes.

    Returns 0.0 on a dimension mismatch between the body vector and
    the source-vector mean. That is not expected in production (the
    embedder and the chunk vectors come from the same model), but a
    stub-driven test may hand in off-shape vectors and crashing the
    whole pipeline on the shape-check hides the real assertion.
    """
    from lilbee.store import cosine_sim

    mean_vec = _mean_vector(source_vectors)
    if not mean_vec or not body_vec:
        return 0.0
    if len(mean_vec) != len(body_vec):
        log.warning(
            "Body vector dim %d does not match source vector dim %d; scoring 0.0",
            len(body_vec),
            len(mean_vec),
        )
        return 0.0
    return max(0.0, cosine_sim(body_vec, mean_vec))

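# Illustrative (hypothetical unit vectors): a body vector equal to the mean
# of the source vectors scores 1.0, an orthogonal one scores 0.0, and an
# opposed one (cosine -1) clamps to 0.0 rather than going negative.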

def _check_faithfulness(
    chunks: list[SearchChunk],
    wiki_text: str,
    label: str,
    config: Config | None = None,
) -> float:
    """Score the wiki body's similarity to its source chunks, 0.0 on failure.

    Phase D: replaces the LLM-based faithfulness call with a
    deterministic cosine-similarity score between the page body and
    the mean of its source chunk vectors. The B3 title/body coherence
    pre-check still runs first as a hard gate: a garbage H1 returns
    0.0 regardless of embedding similarity, so structurally broken
    pages route to drafts even when the prose happens to be coherent.

    ``chunks`` carries ``.vector`` populated by LanceDB (see
    ``SearchChunk`` in ``store.py``), so no extra embedder call is
    needed for the source side. The body is embedded once via the
    shared services embedder. Any exception in the embedder (model
    missing, network issue, invalid config) is caught and reported as
    0.0 so a single faulty page drops to drafts instead of aborting
    the whole build.
    """
    if not _title_content_coherence(wiki_text, label):
        log.info(
            "Faithfulness title/body coherence failed for %r; scoring 0.0",
            label,
        )
        return 0.0
    source_vectors = [c.vector for c in chunks if c.vector]
    if not source_vectors:
        log.warning("No source vectors for %s; scoring 0.0", label)
        return 0.0

    # Strip the frontmatter + citation block so we embed only the body
    # prose. render_citation_block may not have run yet when the score
    # is computed (it is appended later), but strip_citation_block is
    # idempotent on missing trailers.
    body_text = strip_citation_block(wiki_text).strip()
    if not body_text:
        log.warning("Empty body for %s; scoring 0.0", label)
        return 0.0

    try:
        body_vectors = get_services().embedder.embed_batch([body_text])
    except Exception as exc:
        log.warning("Body embedding failed for %s: %s", label, exc)
        return 0.0
    if not body_vectors:
        return 0.0
    return _embedding_faithfulness_score(body_vectors[0], source_vectors)


def _build_frontmatter(
    config: Config,
    source_names: list[str],
    score: float,
    leaf_hash: str = "",
    chunks: list[SearchChunk] | None = None,
) -> str:
    """Build YAML frontmatter for a wiki page.

    When ``leaf_hash`` is non-empty it is written so incremental rebuild
    can skip regeneration on a subsequent sync whose chunks produce the
    same hash. When ``chunks`` is provided the frontmatter carries a
    ``provenance`` block naming the source/chunk-index pairs that fed
    the generator and the extraction method from config, so a bad page
    is auditable without re-running the pipeline.
    """
    sources_yaml = ", ".join(f'"{s}"' for s in sorted(source_names))
    hash_line = f"leaf_hash: {leaf_hash}\n" if leaf_hash else ""
    provenance_block = _render_provenance(config, chunks) if chunks is not None else ""
    return (
        f"---\n"
        f"generated_by: {config.chat_model}\n"
        f"generated_at: {datetime.now(UTC).isoformat()}\n"
        f"sources: [{sources_yaml}]\n"
        f"faithfulness_score: {score:.2f}\n"
        f"{hash_line}"
        f"{provenance_block}"
        f"---\n\n"
    )

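# Illustrative frontmatter output (all values assumed, not from a real run):
#   ---
#   generated_by: example-chat-model
#   generated_at: 2026-04-29T19:16:00+00:00
#   sources: ["manual.pdf"]
#   faithfulness_score: 0.87
#   leaf_hash: 3f2a...
#   ---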

def _render_provenance(config: Config, chunks: list[SearchChunk]) -> str:
    """Render the provenance block: chunk references + extraction method.

    Routes through ``yaml.safe_dump`` rather than hand-rolled string
    formatting so a chunk source containing a quote, backslash,
    colon, or newline does not produce invalid YAML that
    ``parse_frontmatter`` would silently drop on read.
    """
    block = {
        "provenance": {
            "extraction_method": config.wiki_entity_mode.value,
            "chunks": [{"source": c.source, "chunk_index": c.chunk_index} for c in chunks],
        }
    }
    return yaml.safe_dump(block, sort_keys=False)


def _write_page(
    wiki_root: Path,
    subdir: str,
    slug: str,
    full_content: str,
    drift_threshold: float,
) -> Path:
    """Write page to disk with drift detection. Returns path written to.

    ``slug`` may contain forward slashes (e.g. ``cv-manual/page-0042``);
    any intermediate directories are created before writing.
    """
    page_path = wiki_root / subdir / f"{slug}.md"
    page_path.parent.mkdir(parents=True, exist_ok=True)

    if page_path.exists():
        old_content = page_path.read_text(encoding="utf-8")
        ratio = _content_change_ratio(old_content, full_content)
        if ratio > drift_threshold:
            drafts_dir = wiki_root / DRAFTS_SUBDIR
            diff_text = _diff_summary(old_content, full_content)
            return _divert_to_drafts(full_content, drafts_dir, slug, ratio, diff_text)

    page_path.write_text(full_content, encoding="utf-8")
    return page_path


def _assemble_content(
    frontmatter: str,
    wiki_text: str,
    citation_block: str,
) -> str:
    """Combine frontmatter, body, and citations into the full page content."""
    full = frontmatter + wiki_text
    if citation_block:
        full += "\n\n" + citation_block
    return full


def index_wiki_page(content: str, wiki_source: str, store: Store) -> int:
    """Chunk a wiki page body, embed it, and write rows with ``chunk_type="wiki"``.

    ``wiki_source`` must follow the ``<wiki_dir>/<subdir>/<slug>.md``
    shape (see :attr:`PageTarget.wiki_source`). Three branches:

    - subdir in :data:`WIKI_CONTENT_SUBDIRS`: clear stale rows, chunk,
      embed, write. Returns the row count.
    - subdir is ``drafts/`` or ``archive/``: skip without touching the
      store. Returns 0.
    - malformed ``wiki_source`` (no subdir component): log.warning and
      return 0. Does not raise because the caller set is narrow (only
      internal wiki paths reach here) and surfacing the bad input in
      the log is sufficient triage.

    Record shape matches the markdown-ingest convention in
    ``ingest.py``: ``content_type="text"``, all four page/line
    positions ``0`` (wiki pages are not paginated).
    """
    subdir = _subdir_from_wiki_source(wiki_source)
    if subdir is None:
        log.warning("index_wiki_page: malformed wiki_source %r (no subdir)", wiki_source)
        return 0
    if subdir not in WIKI_CONTENT_SUBDIRS:
        return 0

    body = extract_body(content).strip()
    store.clear_table(
        CHUNKS_TABLE,
        f"source = '{escape_sql_string(wiki_source)}' AND chunk_type = '{CHUNK_TYPE_WIKI}'",
    )
    if not body:
        return 0

    chunks = chunk_text(body, mime_type="text/markdown", use_semantic=True)
    if not chunks:
        return 0

    vectors = get_services().embedder.embed_batch(chunks)
    records = [
        {
            "source": wiki_source,
            "content_type": "text",
            "chunk_type": CHUNK_TYPE_WIKI,
            "page_start": 0,
            "page_end": 0,
            "line_start": 0,
            "line_end": 0,
            "chunk": text,
            "chunk_index": idx,
            "vector": vector,
        }
        for idx, (text, vector) in enumerate(zip(chunks, vectors, strict=True))
    ]
    store.add_chunks(records)
    return len(records)


def _subdir_from_wiki_source(wiki_source: str) -> str | None:
    """Return the subdir component (``summaries``, ``concepts``, ...) of *wiki_source*.

    ``wiki_source`` is the ``<wiki_dir>/<subdir>/<slug>.md`` path
    stored in citations and chunks. Returns None when the path has
    fewer than two components.
    """
    parts = wiki_source.split("/")
    return parts[1] if len(parts) >= 2 else None

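# Illustrative: _subdir_from_wiki_source("wiki/summaries/foo.md") returns
# "summaries", while _subdir_from_wiki_source("foo.md") returns None.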

def _persist_and_finalize(
    content: str,
    target: PageTarget,
    verified: list[CitationRecord],
    source_names: list[str],
    store: Store,
    config: Config,
) -> Path:
    """Write page to disk, persist citations, index body chunks, update index and log."""
    page_path = _write_page(
        target.wiki_root, target.subdir, target.slug, content, config.wiki_drift_threshold
    )
    for rec in verified:
        rec["wiki_source"] = target.wiki_source
    store.delete_citations_for_wiki(target.wiki_source)
    store.add_citations(verified)

    index_wiki_page(content, target.wiki_source, store)

    if config.wiki_prune_raw:
        for name in source_names:
            store.delete_by_source(name)

    update_wiki_index(config)
    append_wiki_log(
        WIKI_LOG_ACTION_GENERATED,
        f"{target.page_type} page for {target.label} -> {target.subdir}/{target.slug}.md",
        config,
    )
    return page_path


def _generate_page(
    label: str,
    prompt: str,
    chunks: list[SearchChunk],
    citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]],
    page_type: str,
    slug: str,
    source_names: list[str],
    provider: LLMProvider,
    store: Store,
    config: Config,
    on_progress: WikiProgressCallback | None = None,
    leaf_hash: str = "",
) -> Path | None:
    """Core generation pipeline shared by summary and synthesis pages."""

    def _emit(stage: str, **data: object) -> None:
        if on_progress is not None:
            on_progress(stage, data)

    _emit("preparing", chunks=len(chunks), source=label)

    messages = _build_wiki_messages(prompt, provider, config)
    _emit("generating", source=label)
    options = config.generation_options(
        temperature=config.wiki_temperature,
        max_tokens=config.wiki_summary_max_tokens,
    )
    try:
        response = provider.chat(messages, stream=False, options=options)
        wiki_text = strip_reasoning(cast(str, response)).strip()
    except Exception as exc:
        log.warning("LLM failed to generate wiki page for %s: %s", label, exc)
        _emit("failed", error=str(exc))
        return None

    if not wiki_text:
        log.warning("LLM returned empty response for wiki page %s", label)
        _emit("failed", error="Model returned empty response")
        return None

    parsed_citations = parse_wiki_citations(wiki_text)
    verified = _verify_citations(citation_resolver(parsed_citations), chunks, label, config)
    if not verified:
        log.warning("No valid citations for %s, skipping", label)
        _emit("failed", error="No valid citations found")
        return None

    _emit("faithfulness_check")
    score = _check_faithfulness(chunks, wiki_text, label, config)
    threshold = config.wiki_embedding_faithfulness_threshold
    subdir = page_type if score >= threshold else DRAFTS_SUBDIR
    if subdir == DRAFTS_SUBDIR:
        log.info("Wiki page %s scored %.2f (< %.2f), sending to drafts", label, score, threshold)

    wiki_text = strip_citation_block(wiki_text)
    frontmatter = _build_frontmatter(config, source_names, score, leaf_hash, chunks=chunks)
    citation_block = render_citation_block(verified)
    full_content = _assemble_content(frontmatter, wiki_text, citation_block)

    wiki_root = config.data_root / config.wiki_dir
    target = PageTarget(
        wiki_root=wiki_root,
        subdir=subdir,
        slug=slug,
        wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
        page_type=page_type,
        label=label,
    )
    page_path = _persist_and_finalize(full_content, target, verified, source_names, store, config)

    log.info(
        "Generated wiki page for %s -> %s (score=%.2f, citations=%d)",
        label,
        target.subdir,
        score,
        len(verified),
    )
    return page_path


def _resolve_multi_source_citations(
    parsed_citations: list[ParsedCitation],
    source_names: list[str],
    source_hashes: dict[str, str],
    chunks_by_source: dict[str, list[SearchChunk]],
) -> list[CitationRecord]:
    """Resolve citations from a synthesis page that cites multiple sources.

    Each citation's source_ref is matched against the source list to
    determine which source document it references.
    """
    records: list[CitationRecord] = []
    now = datetime.now(UTC).isoformat()

    all_chunks = [c for cs in chunks_by_source.values() for c in cs]

    for parsed in parsed_citations:
        excerpt = _extract_excerpt(parsed.source_ref)

        matched_source = _match_citation_source(parsed.source_ref, source_names)
        if not matched_source:
            matched_source = _find_excerpt_source(excerpt, chunks_by_source)
        if not matched_source and source_names:
            # No source match found; default to first listed source
            log.warning(
                "No source match for citation; defaulting to first source: %s",
                source_names[0],
            )
            matched_source = source_names[0]

        search_chunks = chunks_by_source.get(matched_source, all_chunks)
        page_start, page_end, line_start, line_end = _find_excerpt_location(excerpt, search_chunks)
        records.append(
            _build_citation_record(
                parsed.citation_key,
                excerpt,
                matched_source,
                source_hashes.get(matched_source, ""),
                page_start,
                page_end,
                line_start,
                line_end,
                now,
            )
        )
    return records


def _match_citation_source(source_ref: str, source_names: list[str]) -> str:
    """Find which source a citation references by matching filenames in the ref."""
    for name in source_names:
        if name in source_ref:
            return name
    return ""


def _find_excerpt_source(excerpt: str, chunks_by_source: dict[str, list[SearchChunk]]) -> str:
    """Find which source contains a given excerpt by searching chunks."""
    if not excerpt:
        return ""
    for source, chunks in chunks_by_source.items():
        for chunk in chunks:
            if excerpt in chunk.chunk:
                return source
    return ""

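# Illustrative resolution order for a ref like 'manual.pdf, excerpt: "..."'
# against sources ["manual.pdf", "notes.md"] (assumed names): the filename
# substring match wins first; failing that, the excerpt is searched across
# the per-source chunks; failing both, the first listed source is assumed.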

def _generate_synthesis_page(
    topic: str,
    source_names: list[str],
    chunks_by_source: dict[str, list[SearchChunk]],
    provider: LLMProvider,
    store: Store,
    config: Config,
) -> Path | None:
    """Generate a single synthesis page for a concept cluster.

    Returns the path to the generated page, or None on failure.
    """
    all_chunks = [c for cs in chunks_by_source.values() for c in cs]
    if not all_chunks:
        log.warning("No chunks for synthesis topic %r, skipping", topic)
        return None

    all_chunks = _truncate_chunks_to_budget(all_chunks, config)
    chunks_text = _chunks_to_text(all_chunks)
    source_list = "\n".join(f"- {name}" for name in sorted(source_names))
    template = config.wiki_synthesis_prompt
    display_topic = clean_label_for_display(topic)
    prompt = template.format(topic=display_topic, source_list=source_list, chunks_text=chunks_text)
    slug = make_slug(topic)

    source_hashes: dict[str, str] = {}
    for name in source_names:
        source_path = config.documents_dir / name
        if source_path.exists():
            source_hashes[name] = file_hash(source_path)

    def resolver(parsed: list[ParsedCitation]) -> list[CitationRecord]:
        return _resolve_multi_source_citations(
            parsed, source_names, source_hashes, chunks_by_source
        )

    return _generate_page(
        label=topic,
        prompt=prompt,
        chunks=all_chunks,
        citation_resolver=resolver,
        page_type=SYNTHESIS_SUBDIR,
        slug=slug,
        source_names=source_names,
        provider=provider,
        store=store,
        config=config,
    )


def _generate_for_cluster(
    label: str,
    sources: frozenset[str],
    provider: LLMProvider,
    store: Store,
    config: Config,
) -> Path | None:
    """Gather chunks for a cluster and generate a synthesis page."""
    source_names = sorted(sources)
    chunks_by_source: dict[str, list[SearchChunk]] = {}
    for name in source_names:
        chunks = store.get_chunks_by_source(name)
        if chunks:
            chunks_by_source[name] = chunks

    if len(chunks_by_source) < MIN_CLUSTER_SOURCES:
        return None

    return _generate_synthesis_page(label, source_names, chunks_by_source, provider, store, config)


def generate_synthesis_pages(
    provider: LLMProvider,
    store: Store,
    clusterer: SourceClusterer,
    config: Config | None = None,
) -> list[Path]:
    """Generate synthesis pages for source clusters spanning 3+ documents."""
    if config is None:
        config = cfg

    clusters = clusterer.get_clusters(min_sources=MIN_CLUSTER_SOURCES)
    if not clusters:
        log.info("No source clusters span %d+ sources, skipping synthesis", MIN_CLUSTER_SOURCES)
        return []

    pages: list[Path] = []
    for cluster in clusters:
        page = _generate_for_cluster(cluster.label, cluster.sources, provider, store, config)
        if page is not None:
            pages.append(page)

    log.info("Generated %d synthesis pages", len(pages))
    return pages


def _hash_existing_sources(source_names: list[str], documents_dir: Path) -> dict[str, str]:
    """Hash each source file that still exists on disk (used for citation staleness)."""
    out: dict[str, str] = {}
    for name in source_names:
        source_path = documents_dir / name
        if source_path.exists():
            out[name] = file_hash(source_path)
    return out


# Phase D: archive-migration sentinel and helpers. The sentinel lives
# under data_dir (NOT inside wiki/) so Obsidian sync and wiki
# tree-walkers never surface it.
_PHASE_D_SENTINEL_NAME = ".phase-d-migrated"

# Pre-Phase-D wiki concepts that we move to archive/ as part of the
# one-time migration. Matches wiki/<CONCEPTS_SUBDIR>/*.md recursively.
_ARCHIVE_CONCEPTS_SUBPATH = Path(ARCHIVE_SUBDIR) / CONCEPTS_SUBDIR


def _maybe_run_phase_d_migration(wiki_root: Path, data_dir: Path) -> None:
    """One-time migration: archive pre-Phase-D concept pages.

    Runs idempotently, gated by ``{data_dir}/.phase-d-migrated``:

    1. Move every ``wiki/concepts/*.md`` to ``wiki/archive/concepts/``,
       preserving relative subpaths. Older concept pages stay
       readable but drop out of the active wiki browse surface.
    2. Unwrap stale ``[[archived-slug]]`` references across the
       remaining pages so a reader clicking a link does not hit a
       404. Archived slugs become plain text.
    3. Write the sentinel so future builds skip this path.

    D3's freshly LLM-curated concept pages written AFTER the sentinel
    exists are never touched.
    """
    sentinel = data_dir / _PHASE_D_SENTINEL_NAME
    if sentinel.exists():
        return
    concepts_dir = wiki_root / CONCEPTS_SUBDIR
    archive_dir = wiki_root / _ARCHIVE_CONCEPTS_SUBPATH
    archived_slugs: list[str] = []
    if concepts_dir.is_dir():
        for src in sorted(concepts_dir.rglob("*.md")):
            rel = src.relative_to(concepts_dir)
            dest = archive_dir / rel
            dest.parent.mkdir(parents=True, exist_ok=True)
            src.replace(dest)
            archived_slugs.append(str(rel.with_suffix("")).replace("\\", "/"))

    if archived_slugs:
        _unwrap_archived_links(wiki_root, archived_slugs)

    data_dir.mkdir(parents=True, exist_ok=True)
    sentinel.write_text(datetime.now(UTC).isoformat(), encoding="utf-8")
    if archived_slugs:
        log.info(
            "Phase D migration: archived %d concept pages, sentinel written at %s",
            len(archived_slugs),
            sentinel,
        )


def _unwrap_archived_links(wiki_root: Path, archived_slugs: list[str]) -> None:
    """Rewrite ``[[slug]]`` → ``slug`` (plain text) across remaining wiki pages.

    The existing ``_rewrite_links_across_wiki`` path is the wrong
    tool here: it compiles an *additive* surface map, not a
    removal pass. Walking the active wiki content subdirs once per
    archived slug is acceptable because the archive count is
    bounded (concepts that existed pre-migration). Pages whose body
    did not change are not rewritten.
    """
    if not archived_slugs:
        return
    patterns = [(re.compile(r"\[\[" + re.escape(slug) + r"\]\]"), slug) for slug in archived_slugs]
    for subdir in WIKI_CONTENT_SUBDIRS:
        subdir_path = wiki_root / subdir
        if not subdir_path.is_dir():
            continue
        for md_path in subdir_path.rglob("*.md"):
            original = md_path.read_text(encoding="utf-8")
            rewritten = original
            for pattern, replacement in patterns:
                rewritten = pattern.sub(replacement, rewritten)
            if rewritten != original:
                md_path.write_text(rewritten, encoding="utf-8")


# Pending-marker conventions: the drafts listing surface
# (``lilbee.wiki.drafts``) scans for these prefixes to classify a
# draft as PARSE or COLLISION instead of a drift-routed regen. The
# keyword phrases live in ``wiki.shared`` so writer (gen) and reader
# (drafts) stay in sync on the exact wording.
_PENDING_PARSE_MARKER_PREFIX = f"<!-- {PENDING_MARKER_KEYWORD_PARSE}"
_PENDING_COLLISION_MARKER_PREFIX = f"<!-- {PENDING_MARKER_KEYWORD_COLLISION}"


def _write_pending_marker(
    drafts_dir: Path,
    slug: str,
    marker_line: str,
    frontmatter: str = "",
) -> Path:
    """Write a PENDING marker page under ``drafts/<slug>.md``.

    ``marker_line`` is the leading HTML comment that both identifies
    the marker kind and carries the context (source, label). The
    optional ``frontmatter`` preserves minimal metadata for the
    drafts surface to round-trip (e.g. ``bad_title``-style fields).
    """
    drafts_dir.mkdir(parents=True, exist_ok=True)
    draft_path = drafts_dir / f"{slug}.md"
    body = marker_line + "\n"
    if frontmatter:
        body += "\n" + frontmatter
    draft_path.write_text(body, encoding="utf-8")
    return draft_path


def _delete_pending_marker_if_present(drafts_dir: Path, slug: str) -> bool:
    """Delete an existing PENDING marker for *slug*; return whether one was removed.

    Match is slug-equality (not fuzzy): an LLM that rephrases a
    label on retry (``brake system`` → ``braking system``) leaves
    the old marker behind for the user to drain via ``wiki drafts
    reject``. Documented limitation; follow-up if the pattern
    matters.
    """
    draft_path = drafts_dir / f"{slug}.md"
    if not draft_path.is_file():
        return False
    try:
        body = draft_path.read_text(encoding="utf-8")
    except OSError:
        return False
    first_line = body.splitlines()[0] if body else ""
    is_pending = first_line.startswith(_PENDING_PARSE_MARKER_PREFIX) or first_line.startswith(
        _PENDING_COLLISION_MARKER_PREFIX
    )
    if not is_pending:
        return False
    draft_path.unlink()
    return True


def _group_entities_by_primary_source(
    entities: list[ExtractedEntity],
) -> dict[str, list[ExtractedEntity]]:
    """Group entities under the source that mentions them most.

    Primary source = source with the highest chunk-ref count;
    lexicographic tiebreak. An entity with no refs is dropped
    silently (defensive: extractor always attaches refs, but a
    future extractor might not).
    """
    grouped: dict[str, list[ExtractedEntity]] = {}
    for entity in entities:
        if not entity.chunk_refs:
            continue
        counts: dict[str, int] = {}
        for ref in entity.chunk_refs:
            counts[ref.source] = counts.get(ref.source, 0) + 1
        primary = min(counts.items(), key=lambda kv: (-kv[1], kv[0]))[0]
        grouped.setdefault(primary, []).append(entity)
    return grouped


# Regex that matches section headers the batch parser recognizes:
# H1 (``# Name``), H2 (``## Name``), or a bold-line heading
# (``**Name**``) at line start. The name capture is anchored to the
# rest of the line (stripped of trailing whitespace) so labels like
# ``## Brake System (hydraulic)`` still parse.
_SECTION_HEADER_RE = re.compile(
    r"^(?:(?:##?)\s+(?P<hashname>[^\n]+)|\*\*(?P<boldname>[^\*\n]+)\*\*)\s*$",
    re.MULTILINE,
)

# In-body ``[^keyN]`` footnote-marker pattern. Module-scope so the
# batched-generation hot path (`_finalize_section`) does not recompile
# it on every recovered section.
_FOOTNOTE_MARKER_RE = re.compile(r"\[\^([a-zA-Z0-9_\-]+)\]")


def _split_batched_output(
    text: str,
    expected_entity_labels: set[str],
    expected_concept_labels: set[str] | None = None,
) -> dict[str, tuple[EntityKind, str]]:
    """Best-effort parse of the batched LLM response into per-label bodies.

    Splits on H1/H2/bold-line headers, then matches each header
    against the expected entity and concept label sets via
    case-insensitive substring. Known labels are tagged with the
    right ``EntityKind``; unknown headers are dropped. Labels whose
    section could not be recovered at all are surfaced to the caller
    (they show up as *missing from the return dict* rather than a
    separate list — caller loops over the expected sets to write
    PENDING markers).
    """
    concepts = expected_concept_labels or set()
    recovered: dict[str, tuple[EntityKind, str]] = {}
    matches = list(_SECTION_HEADER_RE.finditer(text))
    if not matches:
        return recovered
    for i, match in enumerate(matches):
        name = match.group("hashname") or match.group("boldname") or ""
        name = name.strip()
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        body = text[start:end].strip()
        if not body:
            continue
        lowered = name.lower()
        kind_label = _match_label(lowered, expected_entity_labels, EntityKind.ENTITY)
        if kind_label is None:
            kind_label = _match_label(lowered, concepts, EntityKind.CONCEPT)
        if kind_label is None:
            # Concept labels come from the LLM itself — tag any
            # unmatched section as CONCEPT only when the caller is
            # expecting concept curation; otherwise drop it as
            # noise.
            if expected_concept_labels is not None:
                recovered.setdefault(name, (EntityKind.CONCEPT, _prefix_heading(name, body)))
            continue
        kind, label = kind_label
        recovered[label] = (kind, _prefix_heading(name, body))
    return recovered


def _match_label(
    lowered_name: str,
    expected: set[str],
    kind: EntityKind,
) -> tuple[EntityKind, str] | None:
    """Case-insensitive substring match of *lowered_name* against *expected*.

    Returns ``(kind, original_label)`` on hit, ``None`` otherwise.
    A substring match (not equality) accommodates the LLM adding
    qualifiers ("Brake System (hydraulic)" vs "brake system").
    """
    for label in expected:
        low = label.lower()
        if low and (low in lowered_name or lowered_name in low):
            return (kind, label)
    return None

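# Illustrative: _match_label("brake system (hydraulic)", {"Brake System"},
# EntityKind.ENTITY) hits via the substring rule and returns
# (EntityKind.ENTITY, "Brake System").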

def _prefix_heading(name: str, body: str) -> str:
    """Ensure the extracted body starts with a ``# Name`` H1.

    The batched prompt instructs the model to emit ``## Name`` per
    section. After splitting, the per-section body has lost its
    header. Rebuild an H1 so the B3 title/body coherence gate still
    has a heading to match.
    """
    stripped = body.lstrip()
    if stripped.startswith("# "):
        return body
    return f"# {name}\n\n{body}"

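# Illustrative: _prefix_heading("Brake System", "Text about brakes.")
# returns "# Brake System\n\nText about brakes.", while a body that
# already opens with "# " passes through unchanged.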

def _chunks_for_source(chunks: list[SearchChunk], source: str) -> list[SearchChunk]:
    """Return the subset of *chunks* whose ``source`` matches, preserving order."""
    return [c for c in chunks if c.source == source]


def _build_batch_prompt(
    source: str,
    entities: list[ExtractedEntity],
    chunks_text: str,
    extract_concepts: bool,
    config: Config,
) -> str:
    """Render :attr:`Config.wiki_entity_batch_prompt` for one source call.

    ``extract_concepts`` controls whether the concept-curation
    paragraph is injected: True adds an "identify 3-5 concepts" block;
    False leaves ``{concept_instruction}`` empty so the LLM writes
    entity sections only. Keeps the per-source batched call the
    single entry point whether or not concepts are requested.
    """
    entity_labels = ", ".join(clean_label_for_display(e.label) for e in entities) or "(none)"
    if extract_concepts:
        concept_instruction = (
            "First, identify 3-5 CONCEPTS — abstract topics or domain terms "
            "from the source that deserve a standalone wiki page. Do NOT include "
            "pronouns, articles, or generic nouns.\n\n"
            "Then write a wiki section for each of the concepts you identified, "
            "PLUS one section for each NER ENTITY listed below.\n\n"
        )
    else:
        concept_instruction = ""
    return config.wiki_entity_batch_prompt.format(
        source=source,
        entity_list=entity_labels,
        chunks_text=chunks_text,
        concept_instruction=concept_instruction,
    )


def _short_source_hash(source: str) -> str:
    """8-char sha256 digest of *source* (stable collision-marker suffix)."""
    return hashlib.sha256(source.encode("utf-8")).hexdigest()[:8]

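# Illustrative: _short_source_hash("manual.pdf") always yields the same
# 8-hex-character suffix for that name, so retrying a collision on the same
# source pair reuses one draft path instead of accumulating new markers.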

def _generate_source_batch(
    source: str,
    entities: list[ExtractedEntity],
    chunks: list[SearchChunk],
    provider: LLMProvider,
    store: Store,
    config: Config,
    *,
    extract_concepts: bool,
    written_concept_slugs: dict[str, str],
) -> list[Path]:
    """Issue one LLM call for *source* and finalize every recovered section.

    Returns the list of page paths written (entities + concepts
    combined). Labels not recovered by the parser become PENDING
    markers under ``wiki/drafts/`` so the next build can retry.
    Concept slugs already written by an earlier source produce a
    PENDING-COLLISION marker on the losing side (see
    :func:`_handle_concept_write`).

    ``written_concept_slugs`` is the per-build ledger of
    slug → first_source. Callers share one dict across the per-source
    loop. The second source to propose a slug is the one that gets
    diverted to a collision marker.
    """
    if not chunks:
        return []
    budgeted = _truncate_chunks_to_budget(chunks, config)
    chunks_text = _chunks_to_text(budgeted)
    prompt = _build_batch_prompt(source, entities, chunks_text, extract_concepts, config)
    messages = _build_wiki_messages(prompt, provider, config)
    options = config.generation_options(
        temperature=config.wiki_temperature,
        max_tokens=config.wiki_summary_max_tokens,
    )
    try:
        response = provider.chat(messages, stream=False, options=options)
        text = strip_reasoning(cast(str, response)).strip()
    except Exception as exc:
        log.warning("Batched LLM call failed for source %s: %s", source, exc)
        return []

    if not text:
        log.warning("Batched LLM call returned empty response for source %s", source)
        return []

    expected_entity_labels = {e.label for e in entities}
    expected_concepts: set[str] | None = set() if extract_concepts else None
    parsed = _split_batched_output(text, expected_entity_labels, expected_concepts)

    wiki_root = config.data_root / config.wiki_dir
    drafts_dir = wiki_root / DRAFTS_SUBDIR
    source_names = [source]
    source_hashes = _hash_existing_sources(source_names, config.documents_dir)
    chunks_by_source = {source: budgeted}

    # Citation definitions live in the trailing block of the WHOLE
    # response, not inside any one section body. Parse once over the
    # full text and replay the same list for every section, so each
    # page sees its own citations even when only the last section
    # carries the definition trailer.
    shared_parsed_citations = parse_wiki_citations(text)

    pages: list[Path] = []
    seen_labels: set[str] = set()
    for header_label, (kind, body) in parsed.items():
        seen_labels.add(header_label)
        resolver = functools.partial(
            _resolve_multi_source_citations,
            source_names=source_names,
            source_hashes=source_hashes,
            chunks_by_source=chunks_by_source,
        )
        page = _finalize_section(
            header_label=header_label,
            kind=kind,
            body=body,
            chunks=budgeted,
            citation_resolver=resolver,
            source_names=source_names,
            store=store,
            config=config,
            source=source,
            written_concept_slugs=written_concept_slugs,
            drafts_dir=drafts_dir,
            shared_parsed_citations=shared_parsed_citations,
        )
        if page is not None:
            pages.append(page)

    for entity in entities:
        if entity.label not in seen_labels:
            marker = (
                f"{_PENDING_PARSE_MARKER_PREFIX} for source {source}, "
                f"entity/concept {entity.label} - "
                "run wiki build again or manually accept via wiki drafts accept -->"
            )
            # Route through ``yaml.safe_dump`` so a label or source
            # containing a colon, quote, or newline does not produce a
            # frontmatter block that ``parse_frontmatter`` silently drops.
            frontmatter_body = yaml.safe_dump(
                {
                    "pending_source": source,
                    "pending_label": entity.label,
                    "pending_kind": PENDING_KIND_PARSE,
                },
                sort_keys=False,
            )
            frontmatter = f"---\n{frontmatter_body}---\n"
            path = _write_pending_marker(drafts_dir, entity.slug, marker, frontmatter)
            log.info("Wrote PENDING-PARSE marker for %s -> %s", entity.slug, path)

    return pages

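# Illustrative ledger flow (slug and source names assumed): source A writes
# concepts/braking.md and records written_concept_slugs["braking"] = "A";
# when source B later proposes the same slug, _finalize_section diverts B's
# version to a drafts/braking-collision-<hash>.md marker and A's page stays
# untouched.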

1402def _finalize_section( 

1403 *, 

1404 header_label: str, 

1405 kind: EntityKind, 

1406 body: str, 

1407 chunks: list[SearchChunk], 

1408 citation_resolver: Callable[[list[ParsedCitation]], list[CitationRecord]], 

1409 source_names: list[str], 

1410 store: Store, 

1411 config: Config, 

1412 source: str, 

1413 written_concept_slugs: dict[str, str], 

1414 drafts_dir: Path, 

1415 shared_parsed_citations: list[ParsedCitation], 

1416) -> Path | None: 

1417 """Citation-check, faithfulness-check, write one batched section. 

1418 

1419 Shared by entity and concept sections from the per-source batched 

1420 call. Returns the written page path, or ``None`` if the section 

1421 failed any gate (no citations, empty body, slug collision marker 

1422 handled via side channel). ``shared_parsed_citations`` is the 

1423 definition list parsed once over the whole response — every 

1424 section replays it so pages other than the last one still have 

1425 their footnotes resolved. 

1426 """ 

1427 slug = make_slug(header_label) 

1428 if not slug: 

1429 log.info("Empty slug for batched section %r; skipping", header_label) 

1430 return None 

1431 

1432 # Only replay citation keys that this section actually references 

1433 # in the body; otherwise every section would claim every citation. 

1434 section_keys = {ref.citation_key for ref in parse_wiki_citations(body)} 

1435 # Fall back to in-body ``[^keyN]`` references when no definitions 

1436 # live inside the section: count occurrences of the footnote 

1437 # marker against the shared definition set. 

1438 section_keys.update(_FOOTNOTE_MARKER_RE.findall(body)) 

1439 relevant = [c for c in shared_parsed_citations if c.citation_key in section_keys] 

1440 verified = _verify_citations(citation_resolver(relevant), chunks, header_label, config) 

1441 if not verified: 

1442 log.info("No valid citations for batched section %s, skipping", header_label) 

1443 return None 

1444 

    score = _check_faithfulness(chunks, body, header_label, config)
    threshold = config.wiki_embedding_faithfulness_threshold
    page_type = CONCEPTS_SUBDIR if kind is EntityKind.CONCEPT else ENTITIES_SUBDIR
    subdir = page_type if score >= threshold else DRAFTS_SUBDIR
    if subdir == DRAFTS_SUBDIR:
        log.info(
            "Batched section %s scored %.2f (< %.2f), sending to drafts",
            header_label,
            score,
            threshold,
        )

    clean_body = strip_citation_block(body)
    frontmatter = _build_frontmatter(config, source_names, score, chunks=chunks)
    citation_block = render_citation_block(verified)
    full_content = _assemble_content(frontmatter, clean_body, citation_block)

    # Concept collision: the second source proposing a slug loses
    # and writes to a drafts collision marker; the winning source's
    # page stays untouched.
    if kind is EntityKind.CONCEPT and subdir == CONCEPTS_SUBDIR:
        first_source = written_concept_slugs.get(slug)
        if first_source is not None and first_source != source:
            return _divert_concept_collision(
                slug=slug,
                source=source,
                first_source=first_source,
                content=full_content,
                drafts_dir=drafts_dir,
            )
        written_concept_slugs.setdefault(slug, source)

    # Successful regen of a previously-PENDING slug: remove the old
    # marker so the drafts surface no longer lists it.
    _delete_pending_marker_if_present(drafts_dir, slug)

    wiki_root = config.data_root / config.wiki_dir
    target = PageTarget(
        wiki_root=wiki_root,
        subdir=subdir,
        slug=slug,
        wiki_source=f"{config.wiki_dir}/{subdir}/{slug}.md",
        page_type=page_type,
        label=header_label,
    )
    page_path = _persist_and_finalize(full_content, target, verified, source_names, store, config)
    log.info(
        "Generated batched page for %s -> %s (score=%.2f, citations=%d)",
        header_label,
        target.subdir,
        score,
        len(verified),
    )
    return page_path
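
# A minimal, illustrative restatement of the key-replay step above. The
# regex here is a stand-in: the real pattern lives in ``_FOOTNOTE_MARKER_RE``,
# defined earlier in this module.
#
#     >>> import re
#     >>> marker_re = re.compile(r"\[\^([^\]]+)\]")
#     >>> sorted(set(marker_re.findall("Wages tripled.[^src2] Output rose.[^src5]")))
#     ['src2', 'src5']
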

def _divert_concept_collision(
    *,
    slug: str,
    source: str,
    first_source: str,
    content: str,
    drafts_dir: Path,
) -> Path:
    """Write the losing concept to ``drafts/<slug>-collision-<hash>.md``.

    The winning source's page is unchanged on disk. The hash is the
    first 8 hex digits of sha256(source_filename); stable per source,
    so a retry on the same two sources lands at the same draft path,
    letting the user iterate without marker sprawl.
    """
    short = _short_source_hash(source)
    collision_slug = f"{slug}-collision-{short}"
    marker = (
        f"{_PENDING_COLLISION_MARKER_PREFIX} with source {first_source}, "
        f"content from {source} held for review -->\n\n"
    )
    drafts_dir.mkdir(parents=True, exist_ok=True)
    path = drafts_dir / f"{collision_slug}.md"
    path.write_text(marker + content, encoding="utf-8")
    log.warning(
        "Concept slug collision: %s already written by %s; diverted %s's version to %s",
        slug,
        first_source,
        source,
        path,
    )
    return path
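
# Why retries land at the same draft path: the hash is a pure function of
# the filename. A sketch, assuming ``_short_source_hash`` truncates a sha256
# of the UTF-8 filename as the docstring states:
#
#     >>> import hashlib
#     >>> def short_source_hash(name: str) -> str:  # stand-in for _short_source_hash
#     ...     return hashlib.sha256(name.encode("utf-8")).hexdigest()[:8]
#     >>> short_source_hash("notes.md") == short_source_hash("notes.md")
#     True
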

def build_wiki(
    entities: list[ExtractedEntity],
    provider: LLMProvider,
    store: Store,
    config: Config | None = None,
    *,
    extract_concepts: bool = True,
) -> list[Path]:
    """Produce entity and LLM-curated concept pages per source.

    Phase D replaces the per-entity / per-concept fan-out with a
    per-source batched call: for each source in ``entities``' chunk
    refs, one LLM call identifies 3-5 concepts AND writes a wiki
    section for every pre-extracted entity belonging to that source.
    Output sections are split, citation-verified, embedding-scored,
    and landed under ``wiki/entities/`` or ``wiki/concepts/``
    depending on kind.

    ``extract_concepts=False`` (used by the incremental-ingest hook)
    drops the concept-curation paragraph from the prompt so a
    touched source does not churn concept slugs.

    A one-time archive migration runs first (idempotently, gated by
    ``{data_dir}/.phase-d-migrated``), moving pre-Phase-D concept
    pages under ``wiki/archive/concepts/`` and unwrapping stale
    ``[[archived-slug]]`` links across the remaining pages.
    """
    if config is None:
        config = cfg
    wiki_root = config.data_root / config.wiki_dir
    _maybe_run_phase_d_migration(wiki_root, config.data_dir)

    grouped = _group_entities_by_primary_source(entities)
    all_sources = _all_sources_in_scope(entities, grouped, store, config, extract_concepts)
    written_concept_slugs: dict[str, str] = {}
    pages: list[Path] = []

    for source in sorted(all_sources):
        source_entities = grouped.get(source, [])
        chunks = store.get_chunks_by_source(source)
        chunk_count = len(chunks)
        source_extract = extract_concepts and chunk_count >= config.wiki_batch_min_chunks
        if not source_entities and not source_extract:
            log.info(
                "Skipping source %s: %d entities, %d chunks, min=%d, extract=%s",
                source,
                len(source_entities),
                chunk_count,
                config.wiki_batch_min_chunks,
                source_extract,
            )
            continue
        source_pages = _generate_source_batch(
            source=source,
            entities=source_entities,
            chunks=chunks,
            provider=provider,
            store=store,
            config=config,
            extract_concepts=source_extract,
            written_concept_slugs=written_concept_slugs,
        )
        pages.extend(source_pages)

    _rewrite_links_across_wiki(entities, config)
    log.info("Generated %d batched wiki pages", len(pages))
    return pages
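
# Call shape (a sketch; the live wiring comes from ``get_services`` as in
# ``run_full_build`` below):
#
#     >>> svc = get_services()                                  # doctest: +SKIP
#     >>> pages = build_wiki(entities, svc.provider, svc.store,
#     ...                    extract_concepts=False)            # doctest: +SKIP
#
# ``extract_concepts=False`` is the incremental-ingest mode: entity sections
# regenerate, but no new concept slugs are proposed.
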

def _all_sources_in_scope(
    entities: list[ExtractedEntity],
    grouped: dict[str, list[ExtractedEntity]],
    store: Store,
    config: Config,
    extract_concepts: bool,
) -> set[str]:
    """Union of sources that have entities and (when enabled) sources eligible for concept curation.

    Seed the union with every entity's primary source. When
    ``extract_concepts`` is True, add any source in the store whose
    chunk count meets the ``wiki_batch_min_chunks`` floor. This gives
    concept-only sources (no extracted entities) their chance at
    curation while keeping zero-entity short sources skipped entirely.
    """
    sources: set[str] = set(grouped)
    if not extract_concepts:
        return sources
    try:
        records = store.get_sources()
    except Exception as exc:
        log.warning("get_sources failed; sticking to entity-grouped sources: %s", exc)
        return sources
    for record in records:
        name = record.get("filename", "") if isinstance(record, dict) else ""
        if not name:
            continue
        if name in sources:
            continue
        chunk_count = record.get("chunk_count", 0) if isinstance(record, dict) else 0
        if chunk_count >= config.wiki_batch_min_chunks:
            sources.add(name)
    _ = entities  # silences linters on unused pass-through; kept for doc clarity
    return sources
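
# A condensed, runnable restatement of the loop above (a floor of 5 is
# chosen only for illustration):
#
#     >>> records = [{"filename": "b.md", "chunk_count": 9},
#     ...            {"filename": "c.md", "chunk_count": 2}]
#     >>> sources = {"a.md"}  # seeded from entity-grouped sources
#     >>> sources |= {r["filename"] for r in records if r["chunk_count"] >= 5}
#     >>> sorted(sources)
#     ['a.md', 'b.md']
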

def _entity_surface_map(entities: list[ExtractedEntity]) -> dict[str, str]:
    """Build the surface-form -> slug map for the ``[[link]]`` rewriter.

    Includes both the entity's human label (e.g. *"Henry Ford"*) and
    the slug-with-hyphens-as-spaces variant (*"henry ford"*) so the
    rewriter catches either form in body text.
    """
    mapping: dict[str, str] = {}
    for entity in entities:
        mapping[entity.label] = entity.slug
        spaced = entity.slug.replace("-", " ")
        if spaced and spaced != entity.label:
            mapping[spaced] = entity.slug
    return mapping
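
# Resulting shape for the docstring's example entity (label "Henry Ford",
# slug "henry-ford"): both surface forms resolve to the same page.
#
#     >>> {"Henry Ford": "henry-ford", "henry ford": "henry-ford"}["henry ford"]
#     'henry-ford'
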

_ENTITY_LIKE_SUBDIRS: tuple[str, ...] = (CONCEPTS_SUBDIR, ENTITIES_SUBDIR)


def _augment_surface_map_with_existing_pages(
    surface_to_slug: dict[str, str], wiki_root: Path
) -> None:
    """Add slugs for pages already on disk so an incremental rebuild of
    one concept still links to its unchanged neighbors. **Mutates
    surface_to_slug in place.** Only enriches the map with the
    hyphen-to-space surface form because frontmatter labels aren't
    read here; body prose typically uses the spaced form, so this
    covers the common case.
    """
    for subdir in _ENTITY_LIKE_SUBDIRS:
        subdir_path = wiki_root / subdir
        if not subdir_path.is_dir():
            continue
        for md_path in subdir_path.rglob("*.md"):
            slug = md_path.stem
            spaced = slug.replace("-", " ")
            surface_to_slug.setdefault(spaced, slug)
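
# The ``setdefault`` is deliberate: a mapping built from this run's entities
# always wins over a slug derived from a stale on-disk filename.
#
#     >>> m = {"henry ford": "henry-ford"}
#     >>> m.setdefault("henry ford", "stale-slug")  # existing entry is kept
#     'henry-ford'
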

def _rewrite_links_across_wiki(entities: list[ExtractedEntity], config: Config) -> None:
    """Rewrite ``[[slug]]`` links on every page under ``wiki/`` content subdirs.

    A page never receives a link to itself: the rewriter takes the
    owning slug and drops it inside its match callback, so the
    surface map is shared unmodified across every page in the walk
    (no O(M) dict rebuild per file). The map is augmented with
    slugs from the existing on-disk corpus so a touched page still
    links to untouched neighbors. The alternation regex + lookup are
    compiled once per build and reused across pages.
    """
    surface_to_slug = _entity_surface_map(entities)
    wiki_root = config.data_root / config.wiki_dir
    _augment_surface_map_with_existing_pages(surface_to_slug, wiki_root)
    rewriter = compile_rewriter(surface_to_slug)
    if rewriter is None:
        return

    for subdir in WIKI_CONTENT_SUBDIRS:
        subdir_path = wiki_root / subdir
        if not subdir_path.is_dir():
            continue
        is_entity_subdir = subdir in _ENTITY_LIKE_SUBDIRS
        for md_path in subdir_path.rglob("*.md"):
            owning_slug = md_path.stem if is_entity_subdir else None
            original = md_path.read_text(encoding="utf-8")
            rewritten = apply_rewriter(original, rewriter, skip_slug=owning_slug)
            if rewritten != original:
                md_path.write_text(rewritten, encoding="utf-8")
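
# Per-page call shape (the rewriter is compiled once; ``skip_slug`` is what
# keeps a page from linking to itself):
#
#     >>> rw = compile_rewriter({"henry ford": "henry-ford"})   # doctest: +SKIP
#     >>> apply_rewriter(text, rw, skip_slug="henry-ford")      # doctest: +SKIP
#
# The exact link syntax is whatever ``lilbee.wiki.links`` renders; only the
# call signatures above are taken from this module's own usage.
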

class WikiBuildSummary(TypedDict):
    """Result of a full wiki build/update."""

    paths: list[str]  # written page paths, stringified
    entities: int  # extracted entities fed into the build
    count: int  # pages written (== len(paths))

def run_full_build(config: Config | None = None) -> WikiBuildSummary:
    """Extract entities + build wiki across every ingested source.

    Shared entry point for CLI ``wiki build`` / ``wiki update``, MCP
    ``wiki_build`` / ``wiki_update``, and ``POST /api/wiki/build`` /
    ``PATCH /api/wiki/update``.

    Side effects (in order):
        1. Reads every source via ``store.get_sources()``.
        2. Reads chunks for each source via ``store.get_chunks_by_source``.
        3. Calls the entity extractor (may invoke the LLM provider).
        4. Calls :func:`build_wiki` which writes wiki page files.
        5. Calls :func:`update_wiki_index` which rewrites ``wiki/index.md``.
        6. Calls :func:`append_wiki_log` which appends a build entry.

    Concurrency:
        Not safe to run concurrently with itself or with another wiki
        write path (drafts accept/reject, prune). Callers that share an
        event loop or process must serialize via an external lock: the
        REST routes do this with a per-process ``asyncio.Lock``; MCP and
        CLI run in their own processes and don't need one.

        Running concurrently with ``/api/sync`` (an ingest write path
        rather than a wiki write path) is permitted but not coherent: a
        sync that lands between this function's source-scan and per-source
        chunk-fetch may produce a wiki that's missing pages for sources
        ingested mid-build. The result is incomplete, not corrupt, and
        is repaired by re-running ``run_full_build`` after the sync
        finishes.

    A crash mid-build leaves a partial wiki on disk; the next successful
    build is idempotent and re-emits any pages it would have written, so
    recovery is "run it again."
    """
    if config is None:
        config = cfg
    from lilbee.wiki.entity_extractor import get_entity_extractor
    from lilbee.wiki.shared import WIKI_LOG_ACTION_BUILD

    svc = get_services()
    chunks: list[SearchChunk] = []
    for record in svc.store.get_sources():
        chunks.extend(svc.store.get_chunks_by_source(record["filename"]))

    extractor = get_entity_extractor(config.wiki_entity_mode, svc.provider, config)
    entities = extractor.extract(chunks)
    pages = build_wiki(
        entities,
        svc.provider,
        svc.store,
        config,
        extract_concepts=config.wiki_extract_concepts,
    )
    update_wiki_index()
    append_wiki_log(WIKI_LOG_ACTION_BUILD, f"{len(pages)} pages from {len(entities)} entities")
    return {
        "paths": [str(p) for p in pages],
        "entities": len(entities),
        "count": len(pages),
    }
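
# Sketch of the external-lock contract for shared-event-loop callers (the
# REST routes' pattern per the docstring; ``build_lock`` and
# ``guarded_build`` are hypothetical names):
#
#     >>> import asyncio
#     >>> build_lock = asyncio.Lock()
#     >>> async def guarded_build() -> WikiBuildSummary:
#     ...     async with build_lock:  # one wiki write path at a time
#     ...         return await asyncio.to_thread(run_full_build)
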

class WikiSynthesizeSummary(TypedDict):
    """Result of running synthesis-page generation."""

    paths: list[str]  # written synthesis-page paths, stringified
    count: int  # pages written (== len(paths))

def run_full_synthesize(config: Config | None = None) -> WikiSynthesizeSummary:
    """Generate synthesis pages for cross-source clusters of 3+ documents.

    Shared entry point for MCP ``wiki_synthesize`` and ``POST
    /api/wiki/synthesize``. Concurrency contract matches
    :func:`run_full_build`: not safe to run in parallel with itself or
    with other wiki write paths; callers serialize via an external lock
    on shared event loops.
    """
    if config is None:
        config = cfg
    svc = get_services()
    paths = generate_synthesis_pages(svc.provider, svc.store, svc.clusterer, config)
    return {
        "paths": [str(p) for p in paths],
        "count": len(paths),
    }
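
# Return-shape invariant, straight from the dict literal above:
#
#     >>> summary = run_full_synthesize()              # doctest: +SKIP
#     >>> summary["count"] == len(summary["paths"])    # doctest: +SKIP
#     True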