Coverage for src / lilbee / wiki / prune.py: 100%

105 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Prune stale and orphaned wiki pages. 

2 

3Pruning rules: 

41. All cited sources deleted -> archive the page 

52. Concept cluster shrinks below 3 sources -> archive synthesis page 

63. Share of stale citations (stale_hash or excerpt_missing) exceeds config.wiki_stale_citation_threshold -> flag for regeneration 

7 

8Archived pages are moved to wiki/archive/ and removed from the vector store. 

9""" 

10 

11from __future__ import annotations 

12 

13import logging 

14import shutil 

15from dataclasses import dataclass, field 

16from enum import Enum 

17from pathlib import Path 

18 

19from lilbee.config import Config, cfg 

20from lilbee.store import Store 

21from lilbee.wiki.index import append_wiki_log, update_wiki_index 

22from lilbee.wiki.lint import IssueType, lint_wiki_page 

23from lilbee.wiki.shared import ( 

24 ARCHIVE_SUBDIR, 

25 MIN_CLUSTER_SOURCES, 

26 SYNTHESIS_SUBDIR, 

27 WIKI_CONTENT_SUBDIRS, 

28) 

29 

30log = logging.getLogger(__name__) 

31 

32_STALE_TYPES = {IssueType.STALE_HASH, IssueType.EXCERPT_MISSING} 

33 

34 

class PruneAction(Enum):
    """Outcome applied to a wiki page by a prune pass."""

    # The page was archived (moved out of the live wiki content).
    ARCHIVED = "archived"
    # The page stays in place but is reported as needing attention.
    FLAGGED = "flagged"

40 

41 

@dataclass(frozen=True)
class PruneRecord:
    """Immutable description of one action a prune pass took on a wiki page."""

    wiki_source: str  # identifier of the affected wiki page
    action: PruneAction  # what was done to the page
    reason: str  # human-readable explanation for the action

    def to_dict(self) -> dict[str, str]:
        """Return the record as a plain, JSON-friendly mapping.

        The action is emitted as its enum value rather than the enum member.
        """
        return dict(
            wiki_source=self.wiki_source,
            action=self.action.value,
            reason=self.reason,
        )

57 

58 

@dataclass
class PruneReport:
    """Collects every PruneRecord produced by a single prune pass."""

    # Each report gets its own fresh list via default_factory.
    records: list[PruneRecord] = field(default_factory=list)

    @property
    def archived_count(self) -> int:
        """Number of records whose action is ARCHIVED."""
        archived = [rec for rec in self.records if rec.action == PruneAction.ARCHIVED]
        return len(archived)

    @property
    def flagged_count(self) -> int:
        """Number of records whose action is FLAGGED."""
        flagged = [rec for rec in self.records if rec.action == PruneAction.FLAGGED]
        return len(flagged)

72 

73 

def _archive_page(
    wiki_source: str,
    wiki_root: Path,
    store: Store,
    config: Config,
) -> None:
    """Move a wiki page to wiki/archive/ and clean up store data.

    The page file is moved into ``wiki_root / ARCHIVE_SUBDIR`` under its own
    file name. If the archive already holds a file with that name (for example
    a same-named page from another subdirectory, or an earlier archived copy),
    a numeric suffix is appended so the existing entry is not overwritten.
    Store data and citation records for ``wiki_source`` are deleted whether or
    not the page file was found on disk.

    Args:
        wiki_source: Page identifier of the form ``"<wiki_dir>/<relative path>"``.
        wiki_root: Directory that contains the wiki pages and the archive.
        store: Store whose entries for the page are removed.
        config: Supplies ``wiki_dir``, the prefix stripped from ``wiki_source``.
    """
    relative = wiki_source.removeprefix(config.wiki_dir + "/")
    source_path = wiki_root / relative

    archive_dir = wiki_root / ARCHIVE_SUBDIR
    archive_dir.mkdir(parents=True, exist_ok=True)

    if source_path.exists():
        # The archive is flat, so file names can collide; shutil.move may
        # silently replace an existing file. Pick a free name instead.
        archive_path = archive_dir / source_path.name
        counter = 1
        while archive_path.exists():
            archive_path = archive_dir / f"{source_path.stem}-{counter}{source_path.suffix}"
            counter += 1
        shutil.move(source_path, archive_path)
        log.info("Archived wiki page %s -> %s", source_path, archive_path)
    else:
        log.warning("Wiki page file not found for archival: %s", source_path)

    # Store cleanup runs even when the file is missing, so stale entries
    # for a page that was removed by hand do not linger.
    store.delete_by_source(wiki_source)
    store.delete_citations_for_wiki(wiki_source)

96 

97 

98def _check_all_sources_deleted( 

99 wiki_source: str, 

100 store: Store, 

101 documents_dir: Path, 

102) -> bool: 

103 """Return True if every cited source file has been deleted from disk.""" 

104 citations = store.get_citations_for_wiki(wiki_source) 

105 if not citations: 

106 return False 

107 source_files = {c["source_filename"] for c in citations} 

108 return all(not (documents_dir / f).exists() for f in source_files) 

109 

110 

def _check_cluster_below_threshold(
    wiki_source: str,
    store: Store,
    documents_dir: Path,
    min_sources: int = MIN_CLUSTER_SOURCES,
) -> bool:
    """Tell whether a synthesis page has fewer than min_sources live sources.

    Only pages whose identifier contains the synthesis subdirectory are
    considered; any other page, and any page without citations, yields False.
    A source counts as live when its file still exists under documents_dir.
    """
    is_synthesis = f"/{SYNTHESIS_SUBDIR}/" in wiki_source
    if not is_synthesis:
        return False
    cited = store.get_citations_for_wiki(wiki_source)
    if not cited:
        return False
    # Count distinct files, not citations, so repeated cites do not inflate it.
    distinct_files = {entry["source_filename"] for entry in cited}
    surviving = [name for name in distinct_files if (documents_dir / name).exists()]
    return len(surviving) < min_sources

126 

127 

def _check_stale_majority(
    wiki_source: str,
    store: Store,
    config: Config,
) -> bool:
    """Tell whether the share of stale citations exceeds the configured threshold.

    Lint issues of type STALE_HASH or EXCERPT_MISSING count as stale. Their
    number is divided by the page's citation count and compared against
    ``config.wiki_stale_citation_threshold``. A page with no lint issues or no
    citations is never reported as stale.
    """
    found = lint_wiki_page(wiki_source, store, config)
    if not found:
        return False
    cited = store.get_citations_for_wiki(wiki_source)
    if not cited:
        return False
    stale = [issue for issue in found if issue.issue_type in _STALE_TYPES]
    ratio = len(stale) / len(cited)
    return ratio > config.wiki_stale_citation_threshold

142 

143 

def _archive_and_record(
    wiki_source: str,
    wiki_root: Path,
    store: Store,
    config: Config,
    reason: str,
) -> PruneRecord:
    """Archive the page, then describe the action as an ARCHIVED PruneRecord.

    The record carries the given reason unchanged.
    """
    _archive_page(wiki_source, wiki_root, store, config)
    record = PruneRecord(
        wiki_source=wiki_source,
        action=PruneAction.ARCHIVED,
        reason=reason,
    )
    return record

154 

155 

def _evaluate_page(
    wiki_source: str, wiki_root: Path, store: Store, config: Config
) -> PruneRecord | None:
    """Apply the pruning rules to one wiki page, in priority order.

    The two archive rules are tried first (all cited sources deleted, then
    synthesis cluster below the minimum). Only if neither applies is the page
    checked for stale citations, which flags it without archiving. Returns the
    resulting record, or None when no rule matched.
    """
    docs_dir = config.documents_dir

    archive_reason = None
    if _check_all_sources_deleted(wiki_source, store, docs_dir):
        archive_reason = "all cited sources deleted"
    elif _check_cluster_below_threshold(wiki_source, store, docs_dir):
        archive_reason = f"concept cluster below {MIN_CLUSTER_SOURCES} live sources"

    if archive_reason is not None:
        return _archive_and_record(wiki_source, wiki_root, store, config, archive_reason)

    if _check_stale_majority(wiki_source, store, config):
        return PruneRecord(
            wiki_source=wiki_source,
            action=PruneAction.FLAGGED,
            reason="majority of citations stale",
        )
    return None

179 

180 

def _finalize_prune(report: PruneReport, config: Config) -> None:
    """Refresh the wiki index and write one log entry per prune record.

    Does nothing when the report holds no records.
    """
    records = report.records
    if not records:
        return

    archived, flagged = report.archived_count, report.flagged_count
    log.info("Wiki prune: %d archived, %d flagged", archived, flagged)

    update_wiki_index(config)
    for entry in records:
        event = f"pruned ({entry.action.value})"
        detail = f"{entry.wiki_source}: {entry.reason}"
        append_wiki_log(event, detail, config)

193 

194 

def prune_wiki(store: Store, config: Config | None = None) -> PruneReport:
    """Evaluate every wiki page against the pruning rules and report the results.

    Walks each content subdirectory under the wiki root, visiting Markdown
    files in sorted order, and collects a record for every page that was
    archived or flagged. Falls back to the module-level ``cfg`` when no config
    is given. Returns an empty report if the wiki root does not exist.
    """
    active = cfg if config is None else config
    wiki_root = active.data_root / active.wiki_dir
    report = PruneReport()
    if not wiki_root.exists():
        return report

    for name in WIKI_CONTENT_SUBDIRS:
        content_dir = wiki_root / name
        if not content_dir.exists():
            continue
        # sorted() materializes the listing up front, so archiving a page
        # mid-loop does not disturb the traversal.
        for page in sorted(content_dir.rglob("*.md")):
            page_rel = page.relative_to(wiki_root).as_posix()
            wiki_source = f"{active.wiki_dir}/{page_rel}"
            outcome = _evaluate_page(wiki_source, wiki_root, store, active)
            if not outcome:
                continue
            report.records.append(outcome)

    _finalize_prune(report, active)
    return report