Coverage for src / lilbee / wiki / shared.py: 100%
81 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Shared wiki utilities — frontmatter parsing, constants, slug generation."""
3from __future__ import annotations
5import re
6from dataclasses import dataclass
7from enum import StrEnum
8from pathlib import Path
9from typing import Any
11import yaml
# Minimum number of unique sources needed for a synthesis page.
MIN_CLUSTER_SOURCES = 3

# Subdirectory names, one per kind of wiki page (see SUBDIR_TO_TYPE).
SUMMARIES_SUBDIR = "summaries"
SYNTHESIS_SUBDIR = "synthesis"
CONCEPTS_SUBDIR = "concepts"
ENTITIES_SUBDIR = "entities"
DRAFTS_SUBDIR = "drafts"
ARCHIVE_SUBDIR = "archive"
23class WikiPageType(StrEnum):
24 """Kind of wiki page. Values are used as frontmatter/API labels."""
26 SUMMARY = "summary"
27 SYNTHESIS = "synthesis"
28 CONCEPT = "concept"
29 ENTITY = "entity"
30 DRAFT = "draft"
31 ARCHIVE = "archive"
# The four page-kind subdirectories; DRAFTS_SUBDIR and ARCHIVE_SUBDIR are
# deliberately not part of this tuple.
WIKI_CONTENT_SUBDIRS: tuple[str, ...] = (
    SUMMARIES_SUBDIR,
    SYNTHESIS_SUBDIR,
    CONCEPTS_SUBDIR,
    ENTITIES_SUBDIR,
)
# Error text for the case where the wiki is not enabled.
WIKI_DISABLED_ERROR = "wiki not enabled"

# Keyword phrases for PENDING markers. The batched generator writes them
# into ``drafts/<slug>.md`` and the drafts-review surface matches on them,
# so both sides import the wording from here to stay in agreement. If a
# phrase changes, markers already cached on disk must be refreshed too
# (a one-shot find -delete or a regen).
PENDING_MARKER_KEYWORD_PARSE = "PENDING: batch parse failed"
PENDING_MARKER_KEYWORD_COLLISION = "PENDING: concept slug collision"

# Values stored in the ``pending_kind`` frontmatter field and passed
# through unchanged via ``DraftInfo.pending_kind`` to CLI / HTTP / MCP
# callers. They stay plain strings rather than enum members because the
# value round-trips through YAML and JSON with no translation step.
PENDING_KIND_PARSE = "parse"
PENDING_KIND_COLLISION = "collision"

# Display-only fallback for a draft that carries no PENDING marker (a
# regular drift draft). It is never written into
# ``DraftInfo.pending_kind`` on disk; consumers use this constant rather
# than hard-coding ``"drift"``.
PENDING_KIND_DRIFT = "drift"

# Action labels for wiki/log.md. These are internal audit-trail verbs
# written into the log file, and are separate from WIKI_STATUS_*, which
# are result statuses returned to CLI/MCP/HTTP callers.
WIKI_LOG_ACTION_GENERATED = "generated"
WIKI_LOG_ACTION_BUILD = "build"
WIKI_LOG_ACTION_INGEST = "ingest"
WIKI_LOG_ACTION_LINT = "lint"
# Lookup from a subdirectory name to the page type it corresponds to.
# Covers all six subdirectories, including drafts and archive.
SUBDIR_TO_TYPE: dict[str, WikiPageType] = {
    SUMMARIES_SUBDIR: WikiPageType.SUMMARY,
    SYNTHESIS_SUBDIR: WikiPageType.SYNTHESIS,
    CONCEPTS_SUBDIR: WikiPageType.CONCEPT,
    ENTITIES_SUBDIR: WikiPageType.ENTITY,
    DRAFTS_SUBDIR: WikiPageType.DRAFT,
    ARCHIVE_SUBDIR: WikiPageType.ARCHIVE,
}
# Single source of truth for the sidebar-style heading of each page type.
# Read by ``wiki/index.py`` and, through
# ``cli/tui/messages.WIKI_TYPE_HEADINGS``, by the TUI sidebar.
# DRAFT and ARCHIVE have no entry in this mapping.
WIKI_TYPE_HEADINGS: dict[WikiPageType, str] = {
    WikiPageType.CONCEPT: "Concepts",
    WikiPageType.ENTITY: "Entities",
    WikiPageType.SUMMARY: "Source Summaries",
    WikiPageType.SYNTHESIS: "Synthesis",
}
# Matches any single character that is not a lowercase ASCII letter, a
# digit, or a hyphen.
_SLUG_CLEAN_RE = re.compile(r"[^a-z0-9-]")

# Characters treated as markdown-structural noise inside a concept label.
# ``is_valid_label`` tests membership against the set and
# ``clean_label_for_display`` strips with the regex derived from it, so
# the two can never disagree about which characters count.
_STRUCTURAL_CHARS = frozenset("|#>")
_DISPLAY_STRUCTURAL_RE = re.compile(
    f"[{re.escape(''.join(_STRUCTURAL_CHARS))}]+"
)
_DISPLAY_WHITESPACE_RE = re.compile(r"\s+")

# Label sanity thresholds: minimum stripped length, and minimum share of
# alphanumeric characters in the stripped label.
LABEL_SANITY_MIN_LEN = 3
LABEL_SANITY_MIN_ALNUM_RATIO = 0.5
@dataclass(frozen=True)
class PageTarget:
    """Immutable bundle of the location fields used during wiki generation.

    Instances are frozen, so they compare by value and are hashable.
    """

    wiki_root: Path
    subdir: str  # presumably one of the *_SUBDIR names; confirm with callers
    slug: str  # presumably a ``make_slug`` result; confirm with callers
    wiki_source: str
    page_type: str
    label: str
def parse_frontmatter(text: str) -> dict[str, Any]:
    """Extract YAML frontmatter fields from a wiki page string.

    The frontmatter block must open with a ``---`` line as the very first
    line of *text* and close with the next line that consists solely of
    ``---`` (surrounding whitespace ignored). Scanning is done line by
    line, so a ``---`` that appears in the middle of a line of YAML
    content is not mistaken for the closing delimiter.

    Returns the parsed mapping, or ``{}`` when the text has no opening
    delimiter, has no closing delimiter, the block is empty, the YAML is
    malformed, or the YAML top-level value is not a mapping.
    """
    lines = text.splitlines()
    if not lines or lines[0].strip() != "---":
        return {}

    # Index of the first delimiter line after the opening one, if any.
    end_idx = next(
        (i for i, line in enumerate(lines[1:], start=1) if line.strip() == "---"),
        None,
    )
    if end_idx is None:
        return {}

    block = "\n".join(lines[1:end_idx])
    try:
        data = yaml.safe_load(block)
    except yaml.YAMLError:
        return {}

    # safe_load returns whatever the top-level YAML node is: None for an
    # empty block, but also a list, string or number for non-mapping
    # documents. Callers are promised a dict, so anything else becomes {}.
    return data if isinstance(data, dict) else {}
def make_slug(label: str) -> str:
    """Turn a concept label into a filesystem-safe slug.

    Lowercases the label, maps every run of whitespace (spaces, tabs,
    newlines) to a single hyphen and every slash to a double hyphen (path
    encoding), strips anything outside ``[a-z0-9-]``, and trims leading
    and trailing hyphens. Returns ``""`` when no sluggable characters
    remain; callers must treat an empty slug as "skip this entity" so the
    generator never writes a file called ``.md``.

    Internal hyphen runs from the ``/`` path encoding are preserved; only
    leading and trailing hyphens (e.g. ``--body`` from a stripped
    ``| | Body``) are removed.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace. Collapsing runs keeps a double space
    # from producing "--", which would be indistinguishable from the "/"
    # encoding, and keeps tabs/newlines from being deleted outright by the
    # cleanup regex (which would fuse the neighbouring words).
    hyphenated = "-".join(label.lower().split())
    encoded = hyphenated.replace("/", "--")
    return _SLUG_CLEAN_RE.sub("", encoded).strip("-")
def is_valid_label(label: str) -> bool:
    """Return ``False`` for structural-noise labels, before aggregation.

    Noise patterns seen in QA (bb-8b7s) that this gate rejects:

    - empty or sub-three-char fragments,
    - markdown table delimiters (``| | designer``),
    - page-number-prefixed tokens (``158 vehicle``),
    - paren-prefixed numerics (``(7.0 l)``, which would otherwise slug
      to ``70-l`` after punctuation cleanup),
    - hyphen-prefixed fragments (``-answers``, trailing text left over
      from markdown bracket-link extraction).

    The first non-whitespace character must be a Unicode letter, so any
    non-alpha prefix (digit, bracket, hyphen, punctuation) fails
    immediately, while legitimate labels such as ``E-mail`` or ``iPhone``
    pass. Three-char fragments like ``cro`` / ``fus`` are still let
    through; A3's entity-type filter and ``wiki_entity_min_mentions``
    catch those downstream.
    """
    candidate = label.strip()
    too_short = len(candidate) < LABEL_SANITY_MIN_LEN
    # The length test short-circuits first, so indexing [0] is safe here.
    if too_short or not candidate[0].isalpha():
        return False
    # Any structural character anywhere in the label disqualifies it.
    if not _STRUCTURAL_CHARS.isdisjoint(candidate):
        return False
    alnum_count = sum(map(str.isalnum, candidate))
    return alnum_count / len(candidate) >= LABEL_SANITY_MIN_ALNUM_RATIO
def clean_label_for_display(label: str) -> str:
    """Return a prompt-safe form of *label* for the ``{topic}`` slot.

    This is defense-in-depth behind :func:`is_valid_label`. A concept or
    entity label that gets here has already passed the sanity gate and
    should not contain ``|#>`` in practice; stripping those characters
    covers any future code path that bypasses the gate (synthesis cluster
    labels sourced from ``concept_nodes``, user-supplied topics, tests).
    The part that always matters is whitespace normalization: spaCy
    surface forms can carry internal runs of whitespace that would
    otherwise reach the H1 verbatim.

    Capitalization is left untouched so proper nouns
    (``Chevrolet Caprice``, ``iPhone``) survive intact; the model
    title-cases lowercase common nouns on its own.
    """
    without_structure = _DISPLAY_STRUCTURAL_RE.sub("", label)
    collapsed = _DISPLAY_WHITESPACE_RE.sub(" ", without_structure)
    return collapsed.strip()