Coverage for src/lilbee/wiki/drafts.py: 100%

141 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Draft review surface. List, diff, accept, reject wiki drafts. 

2 

3Wiki generation routes pages to ``wiki/drafts/`` when the content 

4drift against an existing page exceeds the configured threshold or 

5when the faithfulness score falls below it. Without a review 

6surface drafts accumulate with no exit ramp, so this module exposes 

7the four operations a reviewer needs: see what is pending, diff 

8against the published version, accept (overwrite the published 

9page and re-index its chunks), or reject (delete the draft file). 

10""" 

11 

12from __future__ import annotations 

13 

14import difflib 

15import logging 

16import re 

17from dataclasses import dataclass 

18from pathlib import Path 

19from typing import Any 

20 

21from lilbee.store import Store 

22from lilbee.wiki.gen import index_wiki_page 

23from lilbee.wiki.shared import ( 

24 CONCEPTS_SUBDIR, 

25 DRAFTS_SUBDIR, 

26 ENTITIES_SUBDIR, 

27 PENDING_KIND_COLLISION, 

28 PENDING_KIND_DRIFT, 

29 PENDING_KIND_PARSE, 

30 PENDING_MARKER_KEYWORD_COLLISION, 

31 PENDING_MARKER_KEYWORD_PARSE, 

32 SUMMARIES_SUBDIR, 

33 SYNTHESIS_SUBDIR, 

34 parse_frontmatter, 

35) 

36 

# Re-export the kind constants from wiki.shared so existing imports
# (``from lilbee.wiki.drafts import PENDING_KIND_PARSE``) keep working.
# Their canonical home is :mod:`lilbee.wiki.shared` — the writer side
# in :mod:`lilbee.wiki.gen` would create a circular import if it
# reached into this module for them.
__all__ = [
    "PENDING_KIND_COLLISION",
    "PENDING_KIND_DRIFT",
    "PENDING_KIND_PARSE",
    "AcceptResult",
    "DraftInfo",
    "accept_draft",
    "diff_draft",
    "list_drafts",
    "reject_draft",
]

# Module-level logger named after this module, per stdlib convention.
log = logging.getLogger(__name__)

55 

# Leading HTML-comment marker written by drift detection, e.g.
# ``<!-- DRIFT: 42% content changed ... -->``. The ``pct`` group
# captures the integer percentage that ``_parse_drift_ratio`` converts
# to a 0.0-1.0 ratio.
_DRIFT_MARKER_RE = re.compile(
    r"<!--\s*DRIFT:\s*(?P<pct>\d+)%\s*content changed[^>]*-->",
    re.IGNORECASE,
)

# Phase D: batched-generation pending markers. The per-source batched
# call writes one of these when the parser could not recover a
# requested section, or when two sources proposed the same concept
# slug and the second write lost the race. The keyword phrases live
# in ``wiki.shared`` so writer (gen) and reader (drafts) agree on the
# exact wording; this regex adds the ``<!--`` wrapper plus ``\s+`` in
# place of each literal space, so the reader tolerates double-space
# variations in cached markers. Keywords carry no regex metacharacters
# so ``re.escape`` is unnecessary.
_PARSE_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_PARSE.replace(" ", r"\s+")
_COLLISION_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_COLLISION.replace(" ", r"\s+")
_PENDING_PARSE_MARKER_RE = re.compile(
    rf"<!--\s*{_PARSE_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)
_PENDING_COLLISION_MARKER_RE = re.compile(
    rf"<!--\s*{_COLLISION_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)

80 

# Published wiki subdirs searched in priority order when pairing a
# draft slug with its counterpart. Summaries and synthesis come first
# because they are the subdirs most drafts originate from (drift
# detection runs on regen of an existing source or cluster page).
_PUBLISHED_SUBDIRS: tuple[str, ...] = (
    SUMMARIES_SUBDIR,
    SYNTHESIS_SUBDIR,
    CONCEPTS_SUBDIR,
    ENTITIES_SUBDIR,
)

91 

92 

93@dataclass 

94class DraftInfo: 

95 """Metadata about a single draft, surfaced in ``wiki drafts list``. 

96 

97 Phase D: ``pending_kind`` distinguishes drift drafts (None) from 

98 batched-generation markers (``"parse"``, ``"collision"``). Callers 

99 can render the kind in the list view and branch on it when 

100 deciding how to surface the draft (e.g. a collision needs the 

101 winning-source context, a parse marker just needs a rerun). 

102 """ 

103 

104 slug: str 

105 path: Path 

106 drift_ratio: float | None 

107 faithfulness_score: float | None 

108 bad_title: bool 

109 published_path: Path | None 

110 mtime: float 

111 pending_kind: str | None = None 

112 

113 @property 

114 def published_exists(self) -> bool: 

115 """True when a matching published page exists for this draft.""" 

116 return self.published_path is not None 

117 

118 def to_dict(self) -> dict[str, Any]: 

119 """Serialize to a JSON-friendly dict.""" 

120 return { 

121 "slug": self.slug, 

122 "path": str(self.path), 

123 "drift_ratio": self.drift_ratio, 

124 "faithfulness_score": self.faithfulness_score, 

125 "bad_title": self.bad_title, 

126 "published_path": str(self.published_path) if self.published_path else None, 

127 "published_exists": self.published_exists, 

128 "mtime": self.mtime, 

129 "pending_kind": self.pending_kind, 

130 } 

131 

132 

@dataclass
class AcceptResult:
    """Outcome of accepting a draft, returned so callers can confirm.

    ``requested_slug`` is always the slug the caller asked to accept
    (for PENDING-COLLISION drafts this looks like
    ``brakes-collision-abc12345``), while ``slug`` is where the content
    landed (the de-collisioned base slug, so ``brakes``). For
    non-collision drafts the two match. HTTP clients that round-trip
    accept→list-refresh can compare both fields to track the rename.
    """

    slug: str
    requested_slug: str
    moved_to: Path
    reindexed_chunks: int

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict for HTTP/MCP/CLI responses."""
        payload: dict[str, Any] = {
            "slug": self.slug,
            "requested_slug": self.requested_slug,
            "moved_to": self.moved_to.as_posix(),
            "reindexed_chunks": self.reindexed_chunks,
        }
        return payload

158 

159 

def _draft_path(wiki_root: Path, slug: str) -> Path:
    """Return the path of the draft markdown file for *slug*."""
    return wiki_root.joinpath(DRAFTS_SUBDIR, f"{slug}.md")

162 

163 

def _find_published(wiki_root: Path, slug: str) -> Path | None:
    """Return the first published page matching *slug*, or None.

    Summaries, synthesis, concepts, and entities are probed in that
    priority order so a draft regenerated from an existing summary
    page pairs with its original rather than a same-slug page of a
    different page type.
    """
    candidates = (wiki_root / subdir / f"{slug}.md" for subdir in _PUBLISHED_SUBDIRS)
    return next((page for page in candidates if page.is_file()), None)

177 

178 

def _parse_drift_ratio(text: str) -> float | None:
    """Extract the drift percentage from a draft's leading marker."""
    found = _DRIFT_MARKER_RE.search(text)
    # Marker stores an integer percent; normalize to a 0.0-1.0 ratio.
    return None if found is None else int(found.group("pct")) / 100.0

185 

186 

def _parse_pending_kind(text: str) -> str | None:
    """Classify *text* as PENDING-PARSE, PENDING-COLLISION, or neither.

    Returns ``None`` when the leading marker is absent or is the drift
    marker. Only the first marker encountered matters, so a draft body
    that quotes the HTML comment (unlikely but possible) does not get
    mis-classified.
    """
    # Check parse before collision, matching the writer's precedence.
    for marker_re, kind in (
        (_PENDING_PARSE_MARKER_RE, PENDING_KIND_PARSE),
        (_PENDING_COLLISION_MARKER_RE, PENDING_KIND_COLLISION),
    ):
        if marker_re.search(text):
            return kind
    return None

200 

201 

def _strip_drift_marker(text: str) -> str:
    """Remove the drift-review marker so accepted content lands clean."""
    without_marker = _DRIFT_MARKER_RE.sub("", text, count=1)
    return without_marker.lstrip()

205 

206 

def _strip_pending_markers(text: str) -> str:
    """Drop PENDING-PARSE/COLLISION markers before content is published."""
    for marker_re in (_PENDING_PARSE_MARKER_RE, _PENDING_COLLISION_MARKER_RE):
        text = marker_re.sub("", text, count=1)
    return text.lstrip()

212 

213 

def _classify_and_strip_markers(text: str) -> tuple[str | None, float | None, str]:
    """Single-pass read: parse kind, drift ratio, and return the stripped body.

    ``list_drafts`` used to run five ``.sub()`` traversals per draft
    (two for pending-marker stripping, three across the drift helpers
    and their callers). This helper does three ``.sub()`` passes plus
    the three ``.search()`` scans needed to detect which markers are
    present, returning kind, drift ratio, and stripped body together.
    """
    kind = _parse_pending_kind(text)
    ratio = _parse_drift_ratio(text)
    body = text
    # Strip at most one occurrence of each marker, mirroring what the
    # individual strip helpers do.
    for marker_re in (_PENDING_PARSE_MARKER_RE, _PENDING_COLLISION_MARKER_RE, _DRIFT_MARKER_RE):
        body = marker_re.sub("", body, count=1)
    return kind, ratio, body.lstrip()

229 

230 

def list_drafts(wiki_root: Path) -> list[DraftInfo]:
    """Collect a ``DraftInfo`` for every draft markdown file under ``drafts/``.

    Recurses so per-source draft nesting (``drafts/<source>/page.md``)
    is covered. Each draft's full text is read once: any pending
    marker and drift ratio are classified, the markers are stripped,
    and frontmatter is parsed on the stripped body (so frontmatter
    parsing works uniformly whether or not a marker shifted it down).
    """
    drafts_dir = wiki_root / DRAFTS_SUBDIR
    if not drafts_dir.is_dir():
        return []
    results: list[DraftInfo] = []
    for draft_file in sorted(drafts_dir.rglob("*.md")):
        raw = draft_file.read_text(encoding="utf-8")
        kind, ratio, body = _classify_and_strip_markers(raw)
        frontmatter = parse_frontmatter(body)
        # Slug is the drafts-relative path without extension, with
        # Windows separators normalized to forward slashes.
        slug = str(draft_file.relative_to(drafts_dir).with_suffix("")).replace("\\", "/")
        results.append(
            DraftInfo(
                slug=slug,
                path=draft_file,
                drift_ratio=ratio,
                faithfulness_score=_coerce_float(frontmatter.get("faithfulness_score")),
                bad_title=bool(frontmatter.get("bad_title", False)),
                published_path=_find_published(wiki_root, slug),
                mtime=draft_file.stat().st_mtime,
                pending_kind=kind,
            )
        )
    return results

262 

263 

def diff_draft(slug: str, wiki_root: Path) -> str:
    """Return a unified diff of the draft against its published counterpart.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    When no published counterpart exists the baseline is empty, so the
    diff shows the draft as all-new — useful for reviewing drafts that
    originated from a fresh low-faithfulness generation.
    """
    draft = _draft_path(wiki_root, slug)
    if not draft.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    published = _find_published(wiki_root, slug)
    old_text = published.read_text(encoding="utf-8") if published else ""
    new_text = draft.read_text(encoding="utf-8")
    from_name = str(published) if published else "(new draft)"
    diff_lines = difflib.unified_diff(
        old_text.splitlines(),
        new_text.splitlines(),
        fromfile=from_name,
        tofile=str(draft),
        lineterm="",
    )
    return "\n".join(diff_lines)

286 

287 

288_COLLISION_SUFFIX_RE = re.compile(r"-collision-[0-9a-f]{8}$") 

289 

290 

291def _base_slug_for_collision(slug: str) -> str: 

292 """Strip the ``-collision-<hash>`` suffix so accept lands on the winning slug.""" 

293 return _COLLISION_SUFFIX_RE.sub("", slug) 

294 

295 

def accept_draft(slug: str, wiki_root: Path, store: Store) -> AcceptResult:
    """Move the draft into its published subdir and re-index its chunks.

    Behavior branches on the draft's pending kind:

    - **Drift draft** (default): write the accepted body to its
      published counterpart (or ``summaries/`` when unpaired),
      re-index, delete the draft.
    - **PENDING-PARSE** (batched-generation parser could not recover
      a section): accepting is a no-op on the published side — the
      marker has no body to accept. The marker is deleted and the
      user is told to run ``wiki build`` to regenerate. Returns an
      ``AcceptResult`` with ``reindexed_chunks=0`` and ``moved_to``
      pointing at the deleted marker.
    - **PENDING-COLLISION** (two sources proposed the same concept
      slug): strips the ``-collision-<hash>`` suffix to find the
      winning slug, overwrites the winning page with this draft's
      body, re-indexes, deletes the collision marker.

    Sequence for drift/collision: write the published file first,
    re-index next, delete the draft last. If the re-index raises
    (chunker, embedder, LanceDB contention), the draft file stays on
    disk so the user can retry ``accept`` — ``index_wiki_page`` is
    idempotent on the same ``wiki_source`` (``clear_table`` +
    re-write).

    Raises :class:`FileNotFoundError` when the draft does not exist.
    """
    draft = _draft_path(wiki_root, slug)
    if not draft.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    raw = draft.read_text(encoding="utf-8")
    kind = _parse_pending_kind(raw)

    if kind == PENDING_KIND_PARSE:
        # A parse marker carries no body worth publishing: drop the
        # marker and point the user at a rebuild.
        draft.unlink()
        log.info(
            "Accepted PENDING-PARSE marker %s; run `lilbee wiki build` "
            "to regenerate the missing section.",
            slug,
        )
        return AcceptResult(slug=slug, requested_slug=slug, moved_to=draft, reindexed_chunks=0)

    body = _strip_pending_markers(_strip_drift_marker(raw))

    if kind == PENDING_KIND_COLLISION:
        target_slug = _base_slug_for_collision(slug)
    else:
        target_slug = slug
    target = _find_published(wiki_root, target_slug)
    if target is None:
        # Unpaired drafts default into summaries/.
        target = wiki_root / SUMMARIES_SUBDIR / f"{target_slug}.md"
        log.info(
            "Draft %s has no published counterpart; accepting into %s",
            slug,
            SUMMARIES_SUBDIR,
        )
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(body, encoding="utf-8")

    # Publish first, re-index second, delete last: a re-index failure
    # leaves the draft on disk so accept can simply be retried.
    reindexed = _reindex_accepted_page(target, wiki_root, store)
    draft.unlink()
    log.info("Accepted draft %s -> %s (%d chunks indexed)", slug, target, reindexed)
    return AcceptResult(
        slug=target_slug,
        requested_slug=slug,
        moved_to=target,
        reindexed_chunks=reindexed,
    )

364 

365 

def reject_draft(slug: str, wiki_root: Path) -> None:
    """Delete the draft file without touching the published page or the index."""
    path = _draft_path(wiki_root, slug)
    if path.is_file():
        path.unlink()
        log.info("Rejected draft %s", slug)
        return
    raise FileNotFoundError(f"draft not found: {slug}")

373 

374 

def _reindex_accepted_page(target: Path, wiki_root: Path, store: Store) -> int:
    """Re-index *target* via :func:`lilbee.wiki.gen.index_wiki_page`.

    Returns the number of ``chunk_type="wiki"`` rows written. Routes
    through the same chunk / embed / clear-and-rewrite path as initial
    page generation, so an accepted draft is indexed identically to a
    fresh page and no bespoke accept-time code path exists.
    """
    return index_wiki_page(
        target.read_text(encoding="utf-8"),
        _wiki_source_for(target, wiki_root),
        store,
    )

386 

387 

388def _wiki_source_for(target: Path, wiki_root: Path) -> str: 

389 """Build the ``wiki_source`` identifier used in the chunks table. 

390 

391 Shape matches :attr:`PageTarget.wiki_source`: 

392 ``<wiki_dir>/<subdir>/<slug>.md``. 

393 """ 

394 wiki_dir_name = wiki_root.name 

395 relative = target.relative_to(wiki_root) 

396 return f"{wiki_dir_name}/{relative.as_posix()}" 

397 

398 

399def _coerce_float(value: Any) -> float | None: 

400 """Return *value* as a float, or None when conversion is not sensible.""" 

401 if value is None: 

402 return None 

403 try: 

404 return float(value) 

405 except (TypeError, ValueError): 

406 return None