Coverage for src / lilbee / wiki / drafts.py: 100%
141 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
"""Draft review surface. List, diff, accept, reject wiki drafts.

Wiki generation routes pages to ``wiki/drafts/`` when the content
drift against an existing page exceeds the configured threshold or
when the faithfulness score falls below it. Without a review
surface drafts accumulate with no exit ramp, so this module exposes
the four operations a reviewer needs: see what is pending, diff
against the published version, accept (overwrite the published
page and re-index its chunks), or reject (delete the draft file).
"""
12from __future__ import annotations
14import difflib
15import logging
16import re
17from dataclasses import dataclass
18from pathlib import Path
19from typing import Any
21from lilbee.store import Store
22from lilbee.wiki.gen import index_wiki_page
23from lilbee.wiki.shared import (
24 CONCEPTS_SUBDIR,
25 DRAFTS_SUBDIR,
26 ENTITIES_SUBDIR,
27 PENDING_KIND_COLLISION,
28 PENDING_KIND_DRIFT,
29 PENDING_KIND_PARSE,
30 PENDING_MARKER_KEYWORD_COLLISION,
31 PENDING_MARKER_KEYWORD_PARSE,
32 SUMMARIES_SUBDIR,
33 SYNTHESIS_SUBDIR,
34 parse_frontmatter,
35)
# Re-export the kind constants from wiki.shared so existing imports
# (``from lilbee.wiki.drafts import PENDING_KIND_PARSE``) keep working.
# Their canonical home is :mod:`lilbee.wiki.shared` — the writer side
# in :mod:`lilbee.wiki.gen` would create a circular import if it
# reached into this module for them.
__all__ = [
    "PENDING_KIND_COLLISION",
    "PENDING_KIND_DRIFT",
    "PENDING_KIND_PARSE",
    "AcceptResult",
    "DraftInfo",
    "accept_draft",
    "diff_draft",
    "list_drafts",
    "reject_draft",
]
# Module-level logger; handlers/levels are configured by the application.
log = logging.getLogger(__name__)

# Leading HTML comment written by drift detection, e.g.
# ``<!-- DRIFT: 42% content changed ... -->``. The integer percentage
# is captured in the ``pct`` group (see :func:`_parse_drift_ratio`).
_DRIFT_MARKER_RE = re.compile(
    r"<!--\s*DRIFT:\s*(?P<pct>\d+)%\s*content changed[^>]*-->",
    re.IGNORECASE,
)

# Phase D: batched-generation pending markers. The per-source batched
# call writes one of these when the parser could not recover a
# requested section, or when two sources proposed the same concept
# slug and the second write lost the race. The keyword phrases live
# in ``wiki.shared`` so writer (gen) and reader (drafts) agree on the
# exact wording; this regex adds the ``<!--`` wrapper plus ``\s+`` in
# place of each literal space, so the reader tolerates double-space
# variations in cached markers. Keywords carry no regex metacharacters
# so ``re.escape`` is unnecessary.
_PARSE_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_PARSE.replace(" ", r"\s+")
_COLLISION_KEYWORD_PATTERN = PENDING_MARKER_KEYWORD_COLLISION.replace(" ", r"\s+")
_PENDING_PARSE_MARKER_RE = re.compile(
    rf"<!--\s*{_PARSE_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)
_PENDING_COLLISION_MARKER_RE = re.compile(
    rf"<!--\s*{_COLLISION_KEYWORD_PATTERN}[^>]*-->",
    re.IGNORECASE,
)

# Published wiki subdirs searched in priority order when pairing a
# draft slug with its counterpart. Summaries and synthesis come first
# because they are the subdirs most drafts originate from (drift
# detection runs on regen of an existing source or cluster page).
_PUBLISHED_SUBDIRS: tuple[str, ...] = (
    SUMMARIES_SUBDIR,
    SYNTHESIS_SUBDIR,
    CONCEPTS_SUBDIR,
    ENTITIES_SUBDIR,
)
@dataclass
class DraftInfo:
    """Metadata about a single draft, surfaced in ``wiki drafts list``.

    Phase D: ``pending_kind`` separates drift drafts (``None``) from
    batched-generation markers (``"parse"``, ``"collision"``).
    Callers can show the kind in the list view and branch on it when
    choosing how to present the draft — a collision needs the
    winning-source context, while a parse marker only needs a rerun.
    """

    slug: str
    path: Path
    drift_ratio: float | None
    faithfulness_score: float | None
    bad_title: bool
    published_path: Path | None
    mtime: float
    pending_kind: str | None = None

    @property
    def published_exists(self) -> bool:
        """True when a matching published page exists for this draft."""
        return self.published_path is not None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict."""
        published = self.published_path
        return dict(
            slug=self.slug,
            path=str(self.path),
            drift_ratio=self.drift_ratio,
            faithfulness_score=self.faithfulness_score,
            bad_title=self.bad_title,
            published_path=str(published) if published else None,
            published_exists=self.published_exists,
            mtime=self.mtime,
            pending_kind=self.pending_kind,
        )
@dataclass
class AcceptResult:
    """Outcome of accepting a draft. Returned so callers can confirm.

    ``requested_slug`` is always the slug the caller asked to accept
    (for PENDING-COLLISION drafts this looks like
    ``brakes-collision-abc12345``). ``slug`` is where the content
    landed (the de-collisioned base slug, so ``brakes``). For
    non-collision drafts the two match. HTTP clients that round-trip
    accept→list-refresh can compare both fields to track the rename.
    """

    slug: str
    requested_slug: str
    moved_to: Path
    reindexed_chunks: int

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict for HTTP/MCP/CLI responses."""
        return dict(
            slug=self.slug,
            requested_slug=self.requested_slug,
            moved_to=self.moved_to.as_posix(),
            reindexed_chunks=self.reindexed_chunks,
        )
def _draft_path(wiki_root: Path, slug: str) -> Path:
    """Path of the draft markdown file for *slug* under ``drafts/``."""
    return wiki_root.joinpath(DRAFTS_SUBDIR, f"{slug}.md")
def _find_published(wiki_root: Path, slug: str) -> Path | None:
    """Return the first published page matching *slug*, or None.

    Scans summaries, synthesis, concepts, and entities subdirs in
    priority order so a draft regenerated from an existing summary
    page pairs with its original rather than the same slug under a
    different page type.
    """
    filename = f"{slug}.md"
    return next(
        (
            page
            for subdir in _PUBLISHED_SUBDIRS
            if (page := wiki_root / subdir / filename).is_file()
        ),
        None,
    )
def _parse_drift_ratio(text: str) -> float | None:
    """Extract the drift percentage from a draft's leading marker."""
    if (found := _DRIFT_MARKER_RE.search(text)) is None:
        return None
    return int(found["pct"]) / 100.0
def _parse_pending_kind(text: str) -> str | None:
    """Classify *text* as a PENDING-PARSE, PENDING-COLLISION, or neither.

    Returns ``None`` when the leading marker is absent or is the
    drift marker. Only inspects the first marker encountered so a
    draft body that quotes the HTML comment (unlikely but possible)
    does not get mis-classified.
    """
    for marker_re, kind in (
        (_PENDING_PARSE_MARKER_RE, PENDING_KIND_PARSE),
        (_PENDING_COLLISION_MARKER_RE, PENDING_KIND_COLLISION),
    ):
        if marker_re.search(text):
            return kind
    return None
def _strip_drift_marker(text: str) -> str:
    """Remove the drift-review marker so accepted content lands clean."""
    without_marker = _DRIFT_MARKER_RE.sub("", text, count=1)
    return without_marker.lstrip()
def _strip_pending_markers(text: str) -> str:
    """Remove PENDING-PARSE/COLLISION markers on the way into a published page."""
    for marker_re in (_PENDING_PARSE_MARKER_RE, _PENDING_COLLISION_MARKER_RE):
        text = marker_re.sub("", text, count=1)
    return text.lstrip()
def _classify_and_strip_markers(text: str) -> tuple[str | None, float | None, str]:
    """Single-pass read: parse kind, drift ratio, and return marker-stripped body.

    ``list_drafts`` used to run five ``.sub()`` traversals per draft
    (two for pending-marker stripping, three across the drift helpers
    and their callers). This helper keeps that down to three ``.sub()``
    passes plus the three ``.search()`` scans needed to detect which
    markers are present — but instead of re-inlining the marker regexes
    it delegates to :func:`_strip_pending_markers` and
    :func:`_strip_drift_marker`, so the list-time stripping here and
    the accept-time stripping in :func:`accept_draft` share one
    implementation and cannot drift apart. (The helpers' intermediate
    ``lstrip`` only removes leading whitespace the final ``lstrip``
    would remove anyway; no marker regex is anchored, so the result
    is identical to the previous inline version.)
    """
    pending_kind = _parse_pending_kind(text)
    drift = _parse_drift_ratio(text)
    stripped = _strip_drift_marker(_strip_pending_markers(text))
    return pending_kind, drift, stripped
def list_drafts(wiki_root: Path) -> list[DraftInfo]:
    """Return one ``DraftInfo`` per draft markdown file under ``drafts/``.

    Walks recursively, so per-source draft nesting
    (``drafts/<source>/page.md``) is included. Each draft's full text
    is read exactly once: any pending marker and drift ratio are
    classified, the markers stripped, and frontmatter parsed from the
    stripped body — so frontmatter parsing behaves the same whether
    or not a marker shifted it down the file.
    """
    drafts_dir = wiki_root / DRAFTS_SUBDIR
    if not drafts_dir.is_dir():
        return []
    found: list[DraftInfo] = []
    for draft_file in sorted(drafts_dir.rglob("*.md")):
        raw = draft_file.read_text(encoding="utf-8")
        kind, drift, body = _classify_and_strip_markers(raw)
        meta = parse_frontmatter(body)
        relative = draft_file.relative_to(drafts_dir).with_suffix("")
        slug = str(relative).replace("\\", "/")
        found.append(
            DraftInfo(
                slug=slug,
                path=draft_file,
                drift_ratio=drift,
                faithfulness_score=_coerce_float(meta.get("faithfulness_score")),
                bad_title=bool(meta.get("bad_title", False)),
                published_path=_find_published(wiki_root, slug),
                mtime=draft_file.stat().st_mtime,
                pending_kind=kind,
            )
        )
    return found
def diff_draft(slug: str, wiki_root: Path) -> str:
    """Return a unified diff of the draft against its published counterpart.

    Raises :class:`FileNotFoundError` when the draft does not exist.
    When no published counterpart exists the baseline is empty, so
    the diff shows the draft as all-new — useful for reviewing drafts
    that originated from a fresh low-faithfulness generation.
    """
    draft = _draft_path(wiki_root, slug)
    if not draft.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    published = _find_published(wiki_root, slug)
    baseline = "" if published is None else published.read_text(encoding="utf-8")
    draft_text = draft.read_text(encoding="utf-8")
    diff_lines = difflib.unified_diff(
        baseline.splitlines(),
        draft_text.splitlines(),
        fromfile="(new draft)" if published is None else str(published),
        tofile=str(draft),
        lineterm="",
    )
    return "\n".join(diff_lines)
288_COLLISION_SUFFIX_RE = re.compile(r"-collision-[0-9a-f]{8}$")
291def _base_slug_for_collision(slug: str) -> str:
292 """Strip the ``-collision-<hash>`` suffix so accept lands on the winning slug."""
293 return _COLLISION_SUFFIX_RE.sub("", slug)
def accept_draft(slug: str, wiki_root: Path, store: Store) -> AcceptResult:
    """Move the draft into its published subdir and re-index its chunks.

    Behavior branches on the draft's pending kind:

    - **Drift draft** (default): write the accepted body to its
      published counterpart (or ``summaries/`` when unpaired),
      re-index, delete the draft.
    - **PENDING-PARSE** (batched-generation parser could not recover
      a section): accepting is a no-op on the published side — the
      marker has no body to accept. The marker is deleted and the
      user is told to run ``wiki build`` to regenerate. Returns an
      ``AcceptResult`` with ``reindexed_chunks=0`` and
      ``moved_to`` pointing at the deleted marker.
    - **PENDING-COLLISION** (two sources proposed the same concept
      slug): strips the ``-collision-<hash>`` suffix to find the
      winning slug, overwrites the winning page with this draft's
      body, re-indexes, deletes the collision marker.

    Sequence for drift/collision: write the published file first,
    re-index next, delete the draft last. If the re-index raises
    (chunker, embedder, LanceDB contention), the draft file stays
    on disk so the user can retry ``accept`` — ``index_wiki_page``
    is idempotent on the same ``wiki_source`` (``clear_table`` +
    re-write).

    Raises :class:`FileNotFoundError` when the draft does not exist.
    """
    draft = _draft_path(wiki_root, slug)
    if not draft.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    raw = draft.read_text(encoding="utf-8")
    pending_kind = _parse_pending_kind(raw)

    if pending_kind == PENDING_KIND_PARSE:
        # Nothing to publish: the marker IS the whole draft, so accept
        # just clears it and points the user at a rebuild.
        draft.unlink()
        log.info(
            "Accepted PENDING-PARSE marker %s; run `lilbee wiki build` "
            "to regenerate the missing section.",
            slug,
        )
        return AcceptResult(slug=slug, requested_slug=slug, moved_to=draft, reindexed_chunks=0)

    clean = _strip_pending_markers(_strip_drift_marker(raw))

    # Collision drafts land on the de-collisioned base slug; all other
    # drafts publish under the slug the caller asked for.
    target_slug = _base_slug_for_collision(slug) if pending_kind == PENDING_KIND_COLLISION else slug
    published = _find_published(wiki_root, target_slug)
    if published is not None:
        target = published
    else:
        # No published counterpart anywhere: default into summaries/.
        target = wiki_root / SUMMARIES_SUBDIR / f"{target_slug}.md"
        log.info(
            "Draft %s has no published counterpart; accepting into %s",
            slug,
            SUMMARIES_SUBDIR,
        )
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(clean, encoding="utf-8")

    # Publish first, index second, delete last (see docstring): a failed
    # re-index leaves the draft on disk so accept can simply be retried.
    reindexed = _reindex_accepted_page(target, wiki_root, store)
    draft.unlink()
    log.info("Accepted draft %s -> %s (%d chunks indexed)", slug, target, reindexed)
    return AcceptResult(
        slug=target_slug,
        requested_slug=slug,
        moved_to=target,
        reindexed_chunks=reindexed,
    )
def reject_draft(slug: str, wiki_root: Path) -> None:
    """Delete the draft file without touching the published page or the index."""
    target = _draft_path(wiki_root, slug)
    if not target.is_file():
        raise FileNotFoundError(f"draft not found: {slug}")
    target.unlink()
    log.info("Rejected draft %s", slug)
def _reindex_accepted_page(target: Path, wiki_root: Path, store: Store) -> int:
    """Re-index *target* via :func:`lilbee.wiki.gen.index_wiki_page`.

    Returns the number of ``chunk_type="wiki"`` rows written. Reuses
    the same chunk / embed / clear-and-rewrite path as initial page
    generation, so an accepted draft is indexed identically to a
    fresh page — there is no bespoke accept-time code path.
    """
    content = target.read_text(encoding="utf-8")
    wiki_source = _wiki_source_for(target, wiki_root)
    return index_wiki_page(content, wiki_source, store)
388def _wiki_source_for(target: Path, wiki_root: Path) -> str:
389 """Build the ``wiki_source`` identifier used in the chunks table.
391 Shape matches :attr:`PageTarget.wiki_source`:
392 ``<wiki_dir>/<subdir>/<slug>.md``.
393 """
394 wiki_dir_name = wiki_root.name
395 relative = target.relative_to(wiki_root)
396 return f"{wiki_dir_name}/{relative.as_posix()}"
399def _coerce_float(value: Any) -> float | None:
400 """Return *value* as a float, or None when conversion is not sensible."""
401 if value is None:
402 return None
403 try:
404 return float(value)
405 except (TypeError, ValueError):
406 return None