Coverage for src / lilbee / wiki / prune.py: 100%
105 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
"""Prune stale and orphaned wiki pages.

Pruning rules:
1. All cited sources deleted -> archive the page
2. Concept cluster shrinks below 3 sources -> archive synthesis page
3. >50% of citations are stale (stale_hash or excerpt_missing) -> flag for regeneration

Archived pages are moved to wiki/archive/ and removed from the vector store.
"""
11from __future__ import annotations
13import logging
14import shutil
15from dataclasses import dataclass, field
16from enum import Enum
17from pathlib import Path
19from lilbee.config import Config, cfg
20from lilbee.store import Store
21from lilbee.wiki.index import append_wiki_log, update_wiki_index
22from lilbee.wiki.lint import IssueType, lint_wiki_page
23from lilbee.wiki.shared import (
24 ARCHIVE_SUBDIR,
25 MIN_CLUSTER_SOURCES,
26 SYNTHESIS_SUBDIR,
27 WIKI_CONTENT_SUBDIRS,
28)
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Lint issue types counted as "stale" when deciding whether to flag a page.
_STALE_TYPES = {IssueType.STALE_HASH, IssueType.EXCERPT_MISSING}
class PruneAction(Enum):
    """Outcome recorded for a wiki page by a prune pass."""

    # The page was archived (moved out of the live wiki, store data removed).
    ARCHIVED = "archived"
    # The page stays in place but is reported as needing attention.
    FLAGGED = "flagged"
@dataclass(frozen=True)
class PruneRecord:
    """Immutable description of one prune decision for one wiki page."""

    wiki_source: str
    action: PruneAction
    reason: str

    def to_dict(self) -> dict[str, str]:
        """Return the record as a flat, JSON-friendly mapping of strings.

        The action is emitted as its enum value rather than the enum member.
        """
        payload = dict(
            wiki_source=self.wiki_source,
            action=self.action.value,
            reason=self.reason,
        )
        return payload
@dataclass
class PruneReport:
    """Collects the PruneRecord entries produced by a single prune run."""

    records: list[PruneRecord] = field(default_factory=list)

    @property
    def archived_count(self) -> int:
        """Number of records whose action is ``PruneAction.ARCHIVED``."""
        return len([rec for rec in self.records if rec.action == PruneAction.ARCHIVED])

    @property
    def flagged_count(self) -> int:
        """Number of records whose action is ``PruneAction.FLAGGED``."""
        return len([rec for rec in self.records if rec.action == PruneAction.FLAGGED])
def _archive_page(
    wiki_source: str,
    wiki_root: Path,
    store: Store,
    config: Config,
) -> None:
    """Move a wiki page into the archive directory and drop its store data.

    The page file is moved to ``wiki_root / ARCHIVE_SUBDIR`` under its own file
    name (the archive is flat). If a file with that name is already archived, a
    numeric suffix is appended (``name-1.md``, ``name-2.md``, ...) so the earlier
    archived page is kept instead of being overwritten.

    A missing page file is only logged as a warning; the store cleanup still runs.

    Args:
        wiki_source: Store key of the page, ``"<config.wiki_dir>/<relative path>"``.
        wiki_root: Directory on disk that contains the wiki pages.
        store: Store whose chunks and citations for ``wiki_source`` are deleted.
        config: Configuration providing ``wiki_dir`` (the prefix to strip).
    """
    relative = wiki_source.removeprefix(config.wiki_dir + "/")
    source_path = wiki_root / relative

    archive_dir = wiki_root / ARCHIVE_SUBDIR
    archive_dir.mkdir(parents=True, exist_ok=True)
    archive_path = archive_dir / source_path.name

    if source_path.exists():
        # Pages from different subdirectories (or a regenerated page archived a
        # second time) can share a file name. shutil.move may silently overwrite
        # an existing destination file, so pick a free name first.
        collision = 0
        while archive_path.exists():
            collision += 1
            archive_path = archive_dir / f"{source_path.stem}-{collision}{source_path.suffix}"
        shutil.move(source_path, archive_path)
        log.info("Archived wiki page %s -> %s", source_path, archive_path)
    else:
        log.warning("Wiki page file not found for archival: %s", source_path)

    # Store cleanup happens regardless of whether the file was found on disk.
    store.delete_by_source(wiki_source)
    store.delete_citations_for_wiki(wiki_source)
98def _check_all_sources_deleted(
99 wiki_source: str,
100 store: Store,
101 documents_dir: Path,
102) -> bool:
103 """Return True if every cited source file has been deleted from disk."""
104 citations = store.get_citations_for_wiki(wiki_source)
105 if not citations:
106 return False
107 source_files = {c["source_filename"] for c in citations}
108 return all(not (documents_dir / f).exists() for f in source_files)
def _check_cluster_below_threshold(
    wiki_source: str,
    store: Store,
    documents_dir: Path,
    min_sources: int = MIN_CLUSTER_SOURCES,
) -> bool:
    """Report whether a synthesis page has fewer than ``min_sources`` live sources.

    Only pages whose source key contains ``/<SYNTHESIS_SUBDIR>/`` are considered;
    any other page, and any page without citations, yields False. A source is
    "live" when its file still exists under ``documents_dir``; each distinct
    file name is counted once.
    """
    is_synthesis = f"/{SYNTHESIS_SUBDIR}/" in wiki_source
    if not is_synthesis:
        return False

    rows = store.get_citations_for_wiki(wiki_source)
    if not rows:
        return False

    distinct_names = {row["source_filename"] for row in rows}
    surviving = [name for name in distinct_names if (documents_dir / name).exists()]
    return len(surviving) < min_sources
def _check_stale_majority(
    wiki_source: str,
    store: Store,
    config: Config,
) -> bool:
    """Report whether the page's stale-citation ratio exceeds the configured threshold.

    The ratio is the number of lint issues typed ``STALE_HASH`` or
    ``EXCERPT_MISSING`` divided by the number of citations stored for the page.
    It is compared (strictly greater) against
    ``config.wiki_stale_citation_threshold``. Pages with no lint issues or no
    citations yield False.
    """
    findings = lint_wiki_page(wiki_source, store, config)
    if not findings:
        return False

    rows = store.get_citations_for_wiki(wiki_source)
    if not rows:
        return False

    stale_total = len([item for item in findings if item.issue_type in _STALE_TYPES])
    stale_ratio = stale_total / len(rows)
    return stale_ratio > config.wiki_stale_citation_threshold
def _archive_and_record(
    wiki_source: str,
    wiki_root: Path,
    store: Store,
    config: Config,
    reason: str,
) -> PruneRecord:
    """Archive the page, then build the ARCHIVED record carrying ``reason``."""
    _archive_page(wiki_source, wiki_root, store, config)
    record = PruneRecord(
        wiki_source=wiki_source,
        action=PruneAction.ARCHIVED,
        reason=reason,
    )
    return record
def _evaluate_page(
    wiki_source: str, wiki_root: Path, store: Store, config: Config
) -> PruneRecord | None:
    """Apply the pruning rules to one page, in priority order.

    The first matching rule wins:
    1. every cited source is gone -> the page is archived;
    2. a synthesis page fell below the minimum live-source count -> archived;
    3. the stale-citation check trips -> the page is flagged (not moved).

    Returns the resulting record, or None when no rule applies.
    """
    docs_dir = config.documents_dir

    # Rules 1 and 2 both end in archival; only the recorded reason differs.
    archive_reason: str | None = None
    if _check_all_sources_deleted(wiki_source, store, docs_dir):
        archive_reason = "all cited sources deleted"
    elif _check_cluster_below_threshold(wiki_source, store, docs_dir):
        archive_reason = f"concept cluster below {MIN_CLUSTER_SOURCES} live sources"

    if archive_reason is not None:
        return _archive_and_record(wiki_source, wiki_root, store, config, archive_reason)

    if not _check_stale_majority(wiki_source, store, config):
        return None
    return PruneRecord(
        wiki_source=wiki_source,
        action=PruneAction.FLAGGED,
        reason="majority of citations stale",
    )
181def _finalize_prune(report: PruneReport, config: Config) -> None:
182 """Update wiki index and log after pruning."""
183 if not report.records:
184 return
185 log.info(
186 "Wiki prune: %d archived, %d flagged",
187 report.archived_count,
188 report.flagged_count,
189 )
190 update_wiki_index(config)
191 for rec in report.records:
192 append_wiki_log(f"pruned ({rec.action.value})", f"{rec.wiki_source}: {rec.reason}", config)
def prune_wiki(store: Store, config: Config | None = None) -> PruneReport:
    """Walk every wiki content page, apply the pruning rules, and report the results.

    Falls back to the module-level ``cfg`` when ``config`` is None. The wiki root
    is ``config.data_root / config.wiki_dir``; if it does not exist an empty
    report is returned. Within each existing content subdirectory, ``*.md`` files
    are visited recursively in sorted order. After the scan the index and log are
    updated via ``_finalize_prune``.
    """
    if config is None:
        config = cfg

    report = PruneReport()
    wiki_root = config.data_root / config.wiki_dir
    if not wiki_root.exists():
        return report

    for subdir_name in WIKI_CONTENT_SUBDIRS:
        content_dir = wiki_root / subdir_name
        if content_dir.exists():
            for page_path in sorted(content_dir.rglob("*.md")):
                # Store keys use POSIX separators prefixed by the wiki dir name.
                rel_posix = page_path.relative_to(wiki_root).as_posix()
                wiki_source = f"{config.wiki_dir}/{rel_posix}"
                outcome = _evaluate_page(wiki_source, wiki_root, store, config)
                if outcome is not None:
                    report.records.append(outcome)

    _finalize_prune(report, config)
    return report