Coverage for src / lilbee / wiki / lint.py: 100%
144 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Lint wiki pages for citation staleness, missing sources, and unmarked claims.
3Two modes:
4- lightweight: runs automatically after sync, checks only pages whose sources changed
5- full: manual ``lilbee wiki lint``, checks all wiki pages
6"""
8from __future__ import annotations
10import logging
11from dataclasses import dataclass, field
12from enum import Enum
13from pathlib import Path
15from lilbee.config import Config, cfg
16from lilbee.ingest import file_hash
17from lilbee.security import validate_path_within
18from lilbee.store import CitationRecord, Store
19from lilbee.wiki.citation import (
20 CitationStatus,
21 find_unmarked_claims,
22 verify_citation,
23)
24from lilbee.wiki.grammar import WIKI_LINK_RE
25from lilbee.wiki.index import append_wiki_log
26from lilbee.wiki.shared import (
27 CONCEPTS_SUBDIR,
28 ENTITIES_SUBDIR,
29 WIKI_CONTENT_SUBDIRS,
30 WIKI_LOG_ACTION_LINT,
31 parse_frontmatter,
32)
34_ORPHAN_CANDIDATE_SUBDIRS: tuple[str, ...] = (CONCEPTS_SUBDIR, ENTITIES_SUBDIR)
36log = logging.getLogger(__name__)
class IssueSeverity(Enum):
    """Severity level for lint issues."""

    # Non-fatal finding: stale hash, missing excerpt, unmarked claim,
    # model change, or orphan page.
    WARNING = "warning"
    # Broken citation: source file deleted or path escaping the
    # documents directory.
    ERROR = "error"
class IssueType(Enum):
    """Classification of lint findings, used by prune to filter programmatically."""

    # Citation's source path resolves outside the documents directory.
    PATH_TRAVERSAL = "path_traversal"
    # Cited source file no longer exists on disk.
    SOURCE_MISSING = "source_missing"
    # Source file content changed since the citation was recorded.
    STALE_HASH = "stale_hash"
    # Cited excerpt can no longer be found in the source text.
    EXCERPT_MISSING = "excerpt_missing"
    # Page was generated by a model other than the configured chat model.
    MODEL_CHANGED = "model_changed"
    # Claim in a wiki page carrying no citation marker.
    UNMARKED_CLAIM = "unmarked_claim"
    # Concept/entity page with no inbound [[slug]] links.
    ORPHAN = "orphan"
@dataclass(frozen=True)
class LintIssue:
    """A single lint finding on a wiki page.

    Immutable record pairing the affected page (``wiki_source``) with a
    severity, a human-readable message, and an optional machine-readable
    ``issue_type`` for programmatic filtering.
    """

    wiki_source: str
    severity: IssueSeverity
    message: str
    issue_type: IssueType | None = None

    def to_dict(self) -> dict[str, str]:
        """Serialize to a plain dict suitable for JSON output."""
        # Issues without a classification serialize with an empty string.
        kind = self.issue_type.value if self.issue_type else ""
        return dict(
            wiki_source=self.wiki_source,
            severity=self.severity.value,
            message=self.message,
            issue_type=kind,
        )
@dataclass
class LintReport:
    """Aggregated results from linting one or more wiki pages."""

    issues: list[LintIssue] = field(default_factory=list)

    @property
    def error_count(self) -> int:
        """Number of ERROR-severity findings collected so far."""
        errors = [it for it in self.issues if it.severity == IssueSeverity.ERROR]
        return len(errors)

    @property
    def warning_count(self) -> int:
        """Number of WARNING-severity findings collected so far."""
        warnings = [it for it in self.issues if it.severity == IssueSeverity.WARNING]
        return len(warnings)
def _lint_citation(
    rec: CitationRecord,
    documents_dir: Path,
) -> LintIssue | None:
    """Check a single citation record against the filesystem.

    Runs four checks in order — path containment, source existence,
    content hash, excerpt presence — and reports the first failure.
    Returns a LintIssue if the citation is stale or broken, None if valid.
    """
    src = documents_dir / rec["source_filename"]
    page = rec["wiki_source"]

    def fail(severity: IssueSeverity, kind: IssueType, message: str) -> LintIssue:
        # Every finding targets the same page; build them uniformly.
        return LintIssue(
            wiki_source=page,
            severity=severity,
            message=message,
            issue_type=kind,
        )

    try:
        validate_path_within(src, documents_dir)
    except ValueError:
        return fail(
            IssueSeverity.ERROR,
            IssueType.PATH_TRAVERSAL,
            f"Source path escapes documents dir: {rec['source_filename']}",
        )

    if not src.exists():
        return fail(
            IssueSeverity.ERROR,
            IssueType.SOURCE_MISSING,
            f"Source deleted: {rec['source_filename']}",
        )

    if file_hash(src) != rec["source_hash"]:
        return fail(
            IssueSeverity.WARNING,
            IssueType.STALE_HASH,
            f"Stale hash for {rec['source_filename']} (citation: {rec['citation_key']})",
        )

    body = src.read_text(encoding="utf-8", errors="replace")
    if verify_citation(rec, body) == CitationStatus.EXCERPT_MISSING:
        return fail(
            IssueSeverity.WARNING,
            IssueType.EXCERPT_MISSING,
            f"Excerpt not found in source for {rec['citation_key']}",
        )
    return None
def _lint_model_changed(wiki_source: str, text: str, config: Config) -> LintIssue | None:
    """Flag pages whose generated_by model differs from the current chat model."""
    recorded = parse_frontmatter(text).get("generated_by", "")
    if not recorded or recorded == config.chat_model:
        # No provenance recorded, or the page matches the active model.
        return None
    return LintIssue(
        wiki_source=wiki_source,
        severity=IssueSeverity.WARNING,
        issue_type=IssueType.MODEL_CHANGED,
        message=(
            f"model_changed: page generated by {recorded!r}, "
            f"current model is {config.chat_model!r}"
        ),
    )
def _lint_unmarked(wiki_source: str, text: str) -> list[LintIssue]:
    """Find unmarked claims in a wiki page.

    Wraps each line flagged by ``find_unmarked_claims`` in a WARNING
    issue, truncating the quoted line to 80 characters.
    """
    issues: list[LintIssue] = []
    for claim in find_unmarked_claims(text):
        issues.append(
            LintIssue(
                wiki_source=wiki_source,
                severity=IssueSeverity.WARNING,
                message=f"Unmarked claim: {claim[:80]}",
                issue_type=IssueType.UNMARKED_CLAIM,
            )
        )
    return issues
def lint_wiki_page(
    wiki_source: str,
    store: Store,
    config: Config | None = None,
) -> list[LintIssue]:
    """Lint a single wiki page: check citations and unmarked claims."""
    if config is None:
        config = cfg

    # Replay every citation recorded for this page against the filesystem.
    found = [
        issue
        for rec in store.get_citations_for_wiki(wiki_source)
        if (issue := _lint_citation(rec, config.documents_dir)) is not None
    ]

    wiki_root = config.data_root / config.wiki_dir
    # wiki_source is like "wiki/summaries/doc.md" — strip the wiki_dir prefix
    page_path = wiki_root / str(wiki_source).removeprefix(str(config.wiki_dir) + "/")
    if page_path.exists():
        text = page_path.read_text(encoding="utf-8", errors="replace")
        found.extend(_lint_unmarked(wiki_source, text))
        if (model_issue := _lint_model_changed(wiki_source, text, config)) is not None:
            found.append(model_issue)

    return found
def lint_changed_sources(
    changed_sources: list[str],
    store: Store,
    config: Config | None = None,
) -> LintReport:
    """Lightweight lint for wiki pages citing changed or removed sources.

    Callable from tools that already know the set of changed sources
    (e.g. a future `lilbee wiki check <source>` command); the sync
    pipeline uses `_incremental_wiki_update` instead, which runs full
    extraction rather than citation replay.
    """
    if config is None:
        config = cfg
    report = LintReport()

    visited: set[str] = set()
    for source_name in changed_sources:
        for rec in store.get_citations_for_source(source_name):
            page = rec["wiki_source"]
            # Multiple changed sources may cite the same page; lint once.
            if page not in visited:
                visited.add(page)
                report.issues.extend(lint_wiki_page(page, store, config))

    if report.issues:
        log.info(
            "Wiki lint: %d error(s), %d warning(s)",
            report.error_count,
            report.warning_count,
        )
    return report
def lint_all(
    store: Store,
    config: Config | None = None,
) -> LintReport:
    """Full lint: check every wiki page in the store.

    Walks each content subdirectory under the wiki root, lints every
    markdown page, appends orphan findings, and records a summary in the
    wiki log. Returns an empty report when the wiki root is missing.
    """
    if config is None:
        config = cfg
    report = LintReport()

    wiki_root = config.data_root / config.wiki_dir
    if wiki_root.exists():
        for subdir in WIKI_CONTENT_SUBDIRS:
            base = wiki_root / subdir
            if not base.is_dir():
                continue
            for page in sorted(base.rglob("*.md")):
                rel = page.relative_to(wiki_root).as_posix()
                report.issues.extend(
                    lint_wiki_page(f"{config.wiki_dir}/{rel}", store, config)
                )

        report.issues.extend(_lint_orphans(wiki_root, config))
        append_wiki_log(
            WIKI_LOG_ACTION_LINT,
            f"{report.error_count} error(s), {report.warning_count} warning(s)",
            config,
        )
    return report
def _lint_orphans(wiki_root: Path, config: Config) -> list[LintIssue]:
    """Flag concept/entity pages that no other page links back to.

    Single-pass over the wiki tree: one ``rglob`` walk collects, per
    link slug, the set of pages containing an inbound ``[[slug]]``
    reference, plus the set of orphan candidates; candidates are then
    checked against that map. (The earlier two-pass version re-walked
    the tree twice, doubling the file-IO at build time.)

    Fix: the candidate's own links are subtracted before deciding, so a
    page that only links to *itself* is still reported — previously a
    self-link masked a genuine orphan, contradicting the emitted
    "from any other page" message.
    """
    # slug -> set of pages that contain an inbound [[slug]] link
    inbound: dict[str, set[Path]] = {}
    candidates: list[Path] = []
    candidate_roots = {wiki_root / sub for sub in _ORPHAN_CANDIDATE_SUBDIRS}
    for md_path in wiki_root.rglob("*.md"):
        text = md_path.read_text(encoding="utf-8", errors="replace")
        for match in WIKI_LINK_RE.finditer(text):
            # Link targets may carry a "|display text" suffix; compare
            # the bare target case-insensitively against file stems.
            slug = match.group(1).split("|", 1)[0].strip().lower()
            if slug:
                inbound.setdefault(slug, set()).add(md_path)
        if any(root in md_path.parents for root in candidate_roots):
            candidates.append(md_path)

    issues: list[LintIssue] = []
    for md_path in sorted(candidates):
        slug = md_path.stem.lower()
        # Ignore the candidate's own links when deciding orphan-hood.
        if inbound.get(slug, set()) - {md_path}:
            continue
        relative = md_path.relative_to(wiki_root)
        wiki_source = f"{config.wiki_dir}/{relative.as_posix()}"
        issues.append(
            LintIssue(
                wiki_source=wiki_source,
                severity=IssueSeverity.WARNING,
                issue_type=IssueType.ORPHAN,
                message=f"Orphan: no inbound [[{slug}]] links from any other page",
            )
        )
    return issues