Coverage for src / lilbee / wiki / lint.py: 100%

144 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Lint wiki pages for citation staleness, missing sources, and unmarked claims. 

2 

3Two modes: 

4- lightweight: runs automatically after sync, checks only pages whose sources changed 

5- full: manual ``lilbee wiki lint``, checks all wiki pages 

6""" 

7 

8from __future__ import annotations 

9 

10import logging 

11from dataclasses import dataclass, field 

12from enum import Enum 

13from pathlib import Path 

14 

15from lilbee.config import Config, cfg 

16from lilbee.ingest import file_hash 

17from lilbee.security import validate_path_within 

18from lilbee.store import CitationRecord, Store 

19from lilbee.wiki.citation import ( 

20 CitationStatus, 

21 find_unmarked_claims, 

22 verify_citation, 

23) 

24from lilbee.wiki.grammar import WIKI_LINK_RE 

25from lilbee.wiki.index import append_wiki_log 

26from lilbee.wiki.shared import ( 

27 CONCEPTS_SUBDIR, 

28 ENTITIES_SUBDIR, 

29 WIKI_CONTENT_SUBDIRS, 

30 WIKI_LOG_ACTION_LINT, 

31 parse_frontmatter, 

32) 

33 

# Subdirectories whose pages _lint_orphans treats as orphan candidates;
# pages outside these subdirs are never flagged as orphans.
_ORPHAN_CANDIDATE_SUBDIRS: tuple[str, ...] = (CONCEPTS_SUBDIR, ENTITIES_SUBDIR)

# Module-level logger, keyed to this module's dotted path.
log = logging.getLogger(__name__)

37 

38 

class IssueSeverity(Enum):
    """Severity of a lint finding: warnings advise, errors demand action."""

    WARNING = "warning"
    ERROR = "error"

44 

45 

class IssueType(Enum):
    """Classification of lint findings, used by prune to filter programmatically."""

    PATH_TRAVERSAL = "path_traversal"
    SOURCE_MISSING = "source_missing"
    STALE_HASH = "stale_hash"
    EXCERPT_MISSING = "excerpt_missing"
    MODEL_CHANGED = "model_changed"
    UNMARKED_CLAIM = "unmarked_claim"
    ORPHAN = "orphan"

56 

57 

@dataclass(frozen=True)
class LintIssue:
    """One lint finding on a wiki page, immutable once created.

    Attributes mirror the JSON shape produced by :meth:`to_dict`.
    """

    wiki_source: str
    severity: IssueSeverity
    message: str
    issue_type: IssueType | None = None

    def to_dict(self) -> dict[str, str]:
        """Serialize to a plain dict suitable for JSON output."""
        # issue_type may be absent; serialize it as an empty string then.
        type_value = "" if self.issue_type is None else self.issue_type.value
        return {
            "wiki_source": self.wiki_source,
            "severity": self.severity.value,
            "message": self.message,
            "issue_type": type_value,
        }

75 

76 

@dataclass
class LintReport:
    """Aggregated results from linting one or more wiki pages."""

    # Mutable accumulator; callers extend this while linting.
    issues: list[LintIssue] = field(default_factory=list)

    @property
    def error_count(self) -> int:
        """Number of ERROR-severity findings."""
        return len([i for i in self.issues if i.severity is IssueSeverity.ERROR])

    @property
    def warning_count(self) -> int:
        """Number of WARNING-severity findings."""
        return len([i for i in self.issues if i.severity is IssueSeverity.WARNING])

90 

91 

def _lint_citation(
    rec: CitationRecord,
    documents_dir: Path,
) -> LintIssue | None:
    """Validate one citation record against the current filesystem state.

    Checks run in order: path confinement, source existence, content
    hash, excerpt presence.  The first failure is returned as a
    :class:`LintIssue`; ``None`` means the citation is still valid.
    """
    filename = rec["source_filename"]
    page = rec["wiki_source"]
    source_path = documents_dir / filename

    # A record whose resolved path escapes the documents dir is never trusted.
    try:
        validate_path_within(source_path, documents_dir)
    except ValueError:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.ERROR,
            message=f"Source path escapes documents dir: {filename}",
            issue_type=IssueType.PATH_TRAVERSAL,
        )

    if not source_path.exists():
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.ERROR,
            message=f"Source deleted: {filename}",
            issue_type=IssueType.SOURCE_MISSING,
        )

    # Hash mismatch means the source changed after the citation was recorded.
    if file_hash(source_path) != rec["source_hash"]:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.WARNING,
            message=f"Stale hash for {filename} (citation: {rec['citation_key']})",
            issue_type=IssueType.STALE_HASH,
        )

    body = source_path.read_text(encoding="utf-8", errors="replace")
    if verify_citation(rec, body) == CitationStatus.EXCERPT_MISSING:
        return LintIssue(
            wiki_source=page,
            severity=IssueSeverity.WARNING,
            message=f"Excerpt not found in source for {rec['citation_key']}",
            issue_type=IssueType.EXCERPT_MISSING,
        )
    return None

139 

140 

def _lint_model_changed(wiki_source: str, text: str, config: Config) -> LintIssue | None:
    """Warn when a page's ``generated_by`` model differs from the configured chat model.

    Pages with no ``generated_by`` frontmatter key are skipped.
    """
    recorded = parse_frontmatter(text).get("generated_by", "")
    if not recorded or recorded == config.chat_model:
        return None
    return LintIssue(
        wiki_source=wiki_source,
        severity=IssueSeverity.WARNING,
        issue_type=IssueType.MODEL_CHANGED,
        message=(
            f"model_changed: page generated by {recorded!r}, "
            f"current model is {config.chat_model!r}"
        ),
    )

157 

158 

def _lint_unmarked(wiki_source: str, text: str) -> list[LintIssue]:
    """Flag every unmarked claim found in *text* as a WARNING-level issue."""
    findings: list[LintIssue] = []
    for claim in find_unmarked_claims(text):
        findings.append(
            LintIssue(
                wiki_source=wiki_source,
                severity=IssueSeverity.WARNING,
                # Truncate long claims so messages stay readable.
                message=f"Unmarked claim: {claim[:80]}",
                issue_type=IssueType.UNMARKED_CLAIM,
            )
        )
    return findings

171 

172 

def lint_wiki_page(
    wiki_source: str,
    store: Store,
    config: Config | None = None,
) -> list[LintIssue]:
    """Lint a single wiki page: check citations and unmarked claims.

    Falls back to the module-level ``cfg`` when no config is supplied.
    """
    active = cfg if config is None else config
    found: list[LintIssue] = []

    for rec in store.get_citations_for_wiki(wiki_source):
        citation_issue = _lint_citation(rec, active.documents_dir)
        if citation_issue is not None:
            found.append(citation_issue)

    wiki_root = active.data_root / active.wiki_dir
    # wiki_source carries the wiki_dir prefix ("wiki/summaries/doc.md") — drop it.
    rel = str(wiki_source).removeprefix(str(active.wiki_dir) + "/")
    page_path = wiki_root / rel
    if page_path.exists():
        text = page_path.read_text(encoding="utf-8", errors="replace")
        found.extend(_lint_unmarked(wiki_source, text))
        drift = _lint_model_changed(wiki_source, text, active)
        if drift is not None:
            found.append(drift)

    return found

201 

202 

def lint_changed_sources(
    changed_sources: list[str],
    store: Store,
    config: Config | None = None,
) -> LintReport:
    """Lightweight lint for wiki pages citing changed or removed sources.

    Callable from tools that already know the set of changed sources
    (e.g. a future `lilbee wiki check <source>` command); the sync
    pipeline uses `_incremental_wiki_update` instead, which runs full
    extraction rather than citation replay.
    """
    active = cfg if config is None else config
    report = LintReport()

    # Each page is linted at most once, even if several of its sources changed.
    visited: set[str] = set()
    for source_name in changed_sources:
        for rec in store.get_citations_for_source(source_name):
            page = rec["wiki_source"]
            if page in visited:
                continue
            visited.add(page)
            report.issues.extend(lint_wiki_page(page, store, active))

    if report.issues:
        log.info(
            "Wiki lint: %d error(s), %d warning(s)",
            report.error_count,
            report.warning_count,
        )
    return report

236 

237 

def lint_all(
    store: Store,
    config: Config | None = None,
) -> LintReport:
    """Full lint: check every wiki page in the store, plus orphan detection.

    Appends a summary line to the wiki log before returning.
    """
    active = cfg if config is None else config
    report = LintReport()

    wiki_root = active.data_root / active.wiki_dir
    if not wiki_root.exists():
        # No wiki yet — nothing to lint, nothing to log.
        return report

    for subdir in WIKI_CONTENT_SUBDIRS:
        content_dir = wiki_root / subdir
        if not content_dir.is_dir():
            continue
        for page_path in sorted(content_dir.rglob("*.md")):
            rel = page_path.relative_to(wiki_root)
            page_id = f"{active.wiki_dir}/{rel.as_posix()}"
            report.issues.extend(lint_wiki_page(page_id, store, active))

    report.issues.extend(_lint_orphans(wiki_root, active))
    append_wiki_log(
        WIKI_LOG_ACTION_LINT,
        f"{report.error_count} error(s), {report.warning_count} warning(s)",
        active,
    )
    return report

267 

268 

def _lint_orphans(wiki_root: Path, config: Config) -> list[LintIssue]:
    """Flag concept/entity pages that no *other* page links back to.

    Single-pass over the wiki tree: one ``rglob`` walk records, for every
    inbound ``[[slug]]`` reference, *which* file made it, and collects the
    orphan candidates; then the two are subtracted.  (The earlier two-pass
    version re-walked the tree twice, doubling file-IO at build time.)

    Bug fix: the previous implementation recorded only the referenced
    slugs, so a page linking to its own slug counted as an inbound
    reference and was never reported — contradicting the "from any other
    page" contract in the issue message.  Self-references are now excluded
    when deciding orphan status.

    Args:
        wiki_root: Root directory of the wiki tree.
        config: Active configuration; supplies ``wiki_dir`` for issue ids.

    Returns:
        One WARNING-level ``LintIssue`` per orphaned candidate page,
        sorted by path.
    """
    # slug -> set of files that contain at least one [[slug]] link.
    referenced_by: dict[str, set[Path]] = {}
    candidates: list[Path] = []
    candidate_roots = {wiki_root / sub for sub in _ORPHAN_CANDIDATE_SUBDIRS}
    for md_path in wiki_root.rglob("*.md"):
        text = md_path.read_text(encoding="utf-8", errors="replace")
        for match in WIKI_LINK_RE.finditer(text):
            # "[[slug|alias]]" -> "slug"; normalized to lowercase.
            slug = match.group(1).split("|", 1)[0].strip().lower()
            if slug:
                referenced_by.setdefault(slug, set()).add(md_path)
        if any(root in md_path.parents for root in candidate_roots):
            candidates.append(md_path)

    issues: list[LintIssue] = []
    for md_path in sorted(candidates):
        slug = md_path.stem.lower()
        # Links from the page itself do not count as inbound.
        if referenced_by.get(slug, set()) - {md_path}:
            continue
        relative = md_path.relative_to(wiki_root)
        wiki_source = f"{config.wiki_dir}/{relative.as_posix()}"
        issues.append(
            LintIssue(
                wiki_source=wiki_source,
                severity=IssueSeverity.WARNING,
                issue_type=IssueType.ORPHAN,
                message=f"Orphan: no inbound [[{slug}]] links from any other page",
            )
        )
    return issues