Coverage for src / lilbee / wiki / shared.py: 100%

81 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Shared wiki utilities — frontmatter parsing, constants, slug generation.""" 

2 

3from __future__ import annotations 

4 

5import re 

6from dataclasses import dataclass 

7from enum import StrEnum 

8from pathlib import Path 

9from typing import Any 

10 

11import yaml 

12 

# A synthesis page needs at least this many unique sources.
MIN_CLUSTER_SOURCES = 3

# Subdirectory name constants. ``SUBDIR_TO_TYPE`` maps each of them to
# its ``WikiPageType``.
SUMMARIES_SUBDIR = "summaries"
SYNTHESIS_SUBDIR = "synthesis"
CONCEPTS_SUBDIR = "concepts"
ENTITIES_SUBDIR = "entities"
DRAFTS_SUBDIR = "drafts"
ARCHIVE_SUBDIR = "archive"

21 

22 

23class WikiPageType(StrEnum): 

24 """Kind of wiki page. Values are used as frontmatter/API labels.""" 

25 

26 SUMMARY = "summary" 

27 SYNTHESIS = "synthesis" 

28 CONCEPT = "concept" 

29 ENTITY = "entity" 

30 DRAFT = "draft" 

31 ARCHIVE = "archive" 

32 

33 

# The four content subdirectories; the drafts and archive subdirectories
# are deliberately not part of this tuple.
WIKI_CONTENT_SUBDIRS: tuple[str, ...] = (
    SUMMARIES_SUBDIR,
    SYNTHESIS_SUBDIR,
    CONCEPTS_SUBDIR,
    ENTITIES_SUBDIR,
)

40 

WIKI_DISABLED_ERROR = "wiki not enabled"

# Keyword phrases for the PENDING markers that the batched generator
# writes into ``drafts/<slug>.md`` and that the drafts-review surface
# matches on. They are defined once here so the gen-side writer and the
# drafts-side reader cannot disagree on the exact wording. If a phrase
# changes, any markers already cached on disk must be refreshed as well
# (one-shot find -delete or a regen).
PENDING_MARKER_KEYWORD_PARSE = "PENDING: batch parse failed"
PENDING_MARKER_KEYWORD_COLLISION = "PENDING: concept slug collision"

# Values for the ``pending_kind`` frontmatter field, surfaced verbatim
# through ``DraftInfo.pending_kind`` to CLI / HTTP / MCP callers. They
# are plain strings rather than an enum because the value round-trips
# through YAML and JSON without translation.
PENDING_KIND_PARSE = "parse"
PENDING_KIND_COLLISION = "collision"
# Display-only fallback shown to users for a draft that carries no
# PENDING marker (i.e. a regular drift draft). It is never written into
# ``DraftInfo.pending_kind`` on disk; consumers fall back to this
# constant instead of hard-coding ``"drift"``.
PENDING_KIND_DRIFT = "drift"

# Action labels for wiki/log.md. These are internal audit-trail verbs
# written into the log file, and are distinct from WIKI_STATUS_* (the
# result statuses returned to CLI/MCP/HTTP callers).
WIKI_LOG_ACTION_GENERATED = "generated"
WIKI_LOG_ACTION_BUILD = "build"
WIKI_LOG_ACTION_INGEST = "ingest"
WIKI_LOG_ACTION_LINT = "lint"

70 

# Lookup from each wiki subdirectory name to the page type stored there.
SUBDIR_TO_TYPE: dict[str, WikiPageType] = {
    SUMMARIES_SUBDIR: WikiPageType.SUMMARY,
    SYNTHESIS_SUBDIR: WikiPageType.SYNTHESIS,
    CONCEPTS_SUBDIR: WikiPageType.CONCEPT,
    ENTITIES_SUBDIR: WikiPageType.ENTITY,
    DRAFTS_SUBDIR: WikiPageType.DRAFT,
    ARCHIVE_SUBDIR: WikiPageType.ARCHIVE,
}

# Sidebar-style headings keyed by page type, kept in one place.
# ``wiki/index.py`` consumes this mapping, as does the TUI sidebar via
# ``cli/tui/messages.WIKI_TYPE_HEADINGS``. Draft and archive pages have
# no entry.
WIKI_TYPE_HEADINGS: dict[WikiPageType, str] = {
    WikiPageType.CONCEPT: "Concepts",
    WikiPageType.ENTITY: "Entities",
    WikiPageType.SUMMARY: "Source Summaries",
    WikiPageType.SYNTHESIS: "Synthesis",
}

89 

# Matches any single character outside the slug alphabet ``[a-z0-9-]``.
_SLUG_CLEAN_RE = re.compile(r"[^a-z0-9-]")

# Markdown-structural characters treated as noise in a concept label.
# ``is_valid_label`` checks membership in this set and
# ``clean_label_for_display`` strips runs of the same characters through
# the regex derived from it, so the set is the single source of truth.
_STRUCTURAL_CHARS = frozenset("|#>")
_DISPLAY_STRUCTURAL_RE = re.compile(
    "[" + re.escape("".join(_STRUCTURAL_CHARS)) + "]+"
)
_DISPLAY_WHITESPACE_RE = re.compile(r"\s+")

# Thresholds used by ``is_valid_label``: minimum stripped length and
# minimum share of alphanumeric characters.
LABEL_SANITY_MIN_LEN = 3
LABEL_SANITY_MIN_ALNUM_RATIO = 0.5

101 

102 

@dataclass(frozen=True)
class PageTarget:
    """Immutable bundle of the page-location fields used in wiki generation."""

    wiki_root: Path
    subdir: str
    slug: str
    wiki_source: str
    page_type: str
    label: str

113 

114 

def parse_frontmatter(text: str) -> dict[str, Any]:
    """Extract YAML frontmatter fields from a wiki page string.

    The page must open with a ``---`` line; the frontmatter block runs up
    to the next line that is exactly ``---`` once surrounding whitespace
    is stripped. Delimiters are matched against whole lines, so a ``---``
    embedded inside a YAML value is not mistaken for the closing
    delimiter.

    Returns the parsed mapping. Returns ``{}`` when the opening or
    closing delimiter is missing, when the block is not valid YAML, when
    the block is empty, or when its top-level YAML node is not a mapping
    (for example a list or a bare scalar).
    """
    lines = text.splitlines()
    if not lines or lines[0].strip() != "---":
        return {}
    # Index of the first closing delimiter after the opening line, if any.
    closing = next(
        (i for i, line in enumerate(lines[1:], start=1) if line.strip() == "---"),
        None,
    )
    if closing is None:
        return {}
    try:
        data = yaml.safe_load("\n".join(lines[1:closing]))
    except yaml.YAMLError:
        return {}
    # safe_load returns whatever the top-level node is (None, list, str,
    # ...); only a mapping satisfies the declared return type.
    return data if isinstance(data, dict) else {}

135 

136 

def make_slug(label: str) -> str:
    """Turn a concept label into a filesystem-safe slug.

    The label is lowercased, every space becomes a hyphen and every
    slash becomes a double hyphen (path encoding). Any character still
    outside ``[a-z0-9-]`` is then dropped -- this includes whitespace
    other than the plain space -- and leading and trailing hyphens are
    trimmed.

    Hyphen runs in the middle of the slug, such as those produced by the
    ``/`` path encoding, are left alone; only the edges are trimmed
    (e.g. the ``--body`` left over from ``| | Body`` becomes ``body``).

    Returns ``""`` when no sluggable characters remain. Callers must
    treat an empty slug as "skip this entity" so the generator never
    writes a file called ``.md``.
    """
    # Single pass equivalent to replacing " " with "-" and "/" with "--".
    encoded = label.lower().translate(str.maketrans({" ": "-", "/": "--"}))
    cleaned = _SLUG_CLEAN_RE.sub("", encoded)
    return cleaned.strip("-")

153 

154 

def is_valid_label(label: str) -> bool:
    """Reject structural-noise labels before aggregation.

    Noise patterns observed in QA (bb-8b7s) that this gate filters out:

    - empty or sub-three-char fragments,
    - markdown table delimiters (``| | designer``),
    - page-number-prefixed tokens (``158 vehicle``),
    - paren-prefixed numerics (``(7.0 l)``, which would otherwise slug
      to ``70-l`` after punctuation cleanup),
    - hyphen-prefixed fragments (``-answers``, trailing text from
      markdown bracket-link extraction).

    The checks run on the whitespace-stripped label, in order: minimum
    length, first character must be a Unicode letter (so any digit,
    bracket, hyphen or punctuation prefix fails up front), no
    structural character anywhere, and a minimum share of alphanumeric
    characters. Legitimate labels like ``E-mail`` or ``iPhone`` pass.

    Three-char fragments like ``cro`` / ``fus`` are still accepted here;
    A3's entity-type filter and ``wiki_entity_min_mentions`` catch those
    downstream.
    """
    candidate = label.strip()
    too_short = len(candidate) < LABEL_SANITY_MIN_LEN
    # ``or`` short-circuits, so candidate[0] is only read once the
    # length check has passed.
    if too_short or not candidate[0].isalpha():
        return False
    if not _STRUCTURAL_CHARS.isdisjoint(candidate):
        return False
    alnum_count = sum(map(str.isalnum, candidate))
    return alnum_count / len(candidate) >= LABEL_SANITY_MIN_ALNUM_RATIO

184 

185 

def clean_label_for_display(label: str) -> str:
    """Return a prompt-safe version of *label* for the ``{topic}`` slot.

    Two passes are applied: runs of the structural characters ``|#>``
    are removed, then every run of whitespace is collapsed to a single
    space and the result is trimmed at both ends.

    The structural strip is defense-in-depth behind
    :func:`is_valid_label`. A concept or entity label that got this far
    has already passed that gate and should not contain ``|#>`` in
    practice; the strip covers a future code path that bypasses the
    gate (synthesis cluster labels sourced from ``concept_nodes``,
    user-supplied topics, tests). The whitespace pass is the part that
    is always useful: spaCy surface forms can carry internal runs of
    whitespace that would otherwise reach the H1 verbatim.

    Capitalization is left untouched so proper nouns
    (``Chevrolet Caprice``, ``iPhone``) survive intact; the model
    title-cases lowercase common nouns on its own.
    """
    without_structural = _DISPLAY_STRUCTURAL_RE.sub("", label)
    collapsed = _DISPLAY_WHITESPACE_RE.sub(" ", without_structural)
    return collapsed.strip()