Coverage for src / lilbee / wiki / citation.py: 100%

106 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Parse, render, and verify wiki citations. 

2 

3Pure functions — no LLM dependency. Operates on markdown text and citation records. 

4""" 

5 

6from dataclasses import dataclass 

7from enum import Enum 

8 

9from lilbee.store import CitationRecord 

10from lilbee.wiki.grammar import ( 

11 CITATION_BLOCK_COMMENT, 

12 CITATION_BLOCK_SEP, 

13 CITE_RE, 

14 FOOTNOTE_RE, 

15 INFERENCE_RE, 

16) 

17 

18 

class CitationStatus(Enum):
    """Possible outcomes when a stored citation is checked against its source."""

    # The cited excerpt was located in the source text.
    VALID = "valid"
    # Assigned by callers that compare hashes; verify_citation never returns it.
    STALE_HASH = "stale_hash"
    # Assigned by callers that check file presence; verify_citation never returns it.
    SOURCE_DELETED = "source_deleted"
    # The excerpt is empty or could not be found in the source text.
    EXCERPT_MISSING = "excerpt_missing"

26 

27 

@dataclass(frozen=True)
class ParsedCitation:
    """Immutable record of one citation footnote found in wiki markdown.

    Attributes:
        citation_key: Footnote key, e.g. ``"src1"``.
        source_ref: Human-readable reference, e.g.
            ``"python-docs/typing.md, lines 12-45"``.
        line_number: 1-based line number of the footnote in the markdown.
    """

    citation_key: str
    source_ref: str
    line_number: int

35 

36 

def parse_wiki_citations(markdown: str) -> list[ParsedCitation]:
    """Collect the ``[^srcN]: ...`` footnote definitions from wiki markdown.

    Scanning starts at the auto-generated citation block comment when one is
    present. If the comment is absent (e.g. a looser model omitted it), the
    entire document is scanned instead; every line matching ``FOOTNOTE_RE``
    is treated as a citation definition.

    Returns one ``ParsedCitation`` per matching line, in document order, with
    a 1-based ``line_number`` relative to the full markdown text.
    """
    marker_idx = _find_citation_block_start(markdown)
    offset = 0 if marker_idx is None else marker_idx

    found: list[ParsedCitation] = []
    # Enumerate from offset + 1 so the reported numbers are 1-based and
    # relative to the whole document, not to the slice being scanned.
    for number, text in enumerate(markdown.splitlines()[offset:], start=offset + 1):
        hit = FOOTNOTE_RE.match(text)
        if not hit:
            continue
        found.append(
            ParsedCitation(
                citation_key=hit.group(1),
                source_ref=hit.group(2).strip(),
                line_number=number,
            )
        )
    return found

62 

63 

def render_citation_block(citations: list[CitationRecord]) -> str:
    """Build the markdown footnote footer for a list of citation records.

    The result starts with the block separator and the block comment,
    followed by one ``[^key]: reference`` line per record, and ends with a
    trailing newline. An empty input yields an empty string.
    """
    if not citations:
        return ""
    footnotes = [
        f"[^{record['citation_key']}]: {_format_source_ref(record)}"
        for record in citations
    ]
    return "\n".join([CITATION_BLOCK_SEP, CITATION_BLOCK_COMMENT, *footnotes]) + "\n"

75 

76 

def verify_citation(citation: CitationRecord, source_text: str) -> CitationStatus:
    """Check whether a citation's excerpt exists in the source text.

    Matching is case-insensitive and ignores differences in whitespace (both
    sides go through ``_normalize``).

    Does not check hash staleness or source existence — the caller handles
    those by comparing ``citation.source_hash`` against the current file hash
    and checking file presence.

    Returns:
        ``CitationStatus.VALID`` when the normalized excerpt is a substring of
        the normalized source text; ``CitationStatus.EXCERPT_MISSING`` when
        the excerpt is empty, whitespace-only, or not found.
    """
    needle = _normalize(citation["excerpt"] or "")
    # Test emptiness *after* normalizing: a whitespace-only excerpt collapses
    # to "", and "" is a substring of every string, so it would otherwise be
    # reported as VALID.
    if not needle:
        return CitationStatus.EXCERPT_MISSING
    if needle in _normalize(source_text):
        return CitationStatus.VALID
    return CitationStatus.EXCERPT_MISSING

88 

89 

def find_unmarked_claims(markdown: str) -> list[str]:
    """List body lines that carry neither a citation nor an inference marker.

    Only the body is inspected (frontmatter and the citation block are removed
    by ``extract_body``). Lines rejected by ``_is_content_line`` are ignored.
    Each remaining line is reported, whitespace-stripped, unless ``CITE_RE``
    or ``INFERENCE_RE`` matches somewhere in it.
    """
    candidates = (raw.strip() for raw in extract_body(markdown).splitlines())
    return [
        text
        for text in candidates
        if _is_content_line(text)
        and not CITE_RE.search(text)
        and not INFERENCE_RE.search(text)
    ]

106 

107 

def strip_citation_block(markdown: str) -> str:
    """Drop the auto-generated citation block (separator, comment, footnotes).

    When no citation block comment is found the input is returned unchanged.
    Otherwise everything from the block (including a directly preceding
    separator line) to the end is removed, trailing whitespace is trimmed and
    a single newline is appended.
    """
    marker_idx = _find_citation_block_start(markdown)
    if marker_idx is None:
        return markdown
    all_lines = markdown.splitlines()
    kept = all_lines[: _body_end_before_citations(all_lines, marker_idx)]
    return "\n".join(kept).rstrip() + "\n"

116 

117 

118def _find_citation_block_start(markdown: str) -> int | None: 

119 """Return the 0-based line index where the citation block begins, or None.""" 

120 lines = markdown.splitlines() 

121 for i, line in enumerate(lines): 

122 if line.strip() == CITATION_BLOCK_COMMENT: 

123 return i 

124 return None 

125 

126 

127def _body_end_before_citations(lines: list[str], block_start: int) -> int: 

128 """Return the line index to truncate at, stripping the --- separator if present.""" 

129 body_end = block_start 

130 if body_end > 0 and lines[body_end - 1].strip() == CITATION_BLOCK_SEP: 

131 body_end -= 1 

132 return body_end 

133 

134 

def extract_body(markdown: str) -> str:
    """Return the markdown body without YAML frontmatter or citation block.

    Frontmatter is removed first; if a citation block comment is then found,
    the text is cut just before the block (and its separator line, if any).
    """
    content = _strip_frontmatter(markdown)
    marker_idx = _find_citation_block_start(content)
    if marker_idx is not None:
        rows = content.splitlines()
        cut = _body_end_before_citations(rows, marker_idx)
        content = "\n".join(rows[:cut])
    return content

144 

145 

146def _strip_frontmatter(markdown: str) -> str: 

147 """Remove YAML frontmatter delimited by ``---`` at the start.""" 

148 if not markdown.startswith("---"): 

149 return markdown 

150 lines = markdown.splitlines() 

151 for i in range(1, len(lines)): 

152 if lines[i].strip() == "---": 

153 return "\n".join(lines[i + 1 :]) 

154 return markdown 

155 

156 

157def _is_content_line(stripped: str) -> bool: 

158 """Return True if a line contains a substantive claim (not heading/blank/marker).""" 

159 if not stripped: 

160 return False 

161 if stripped.startswith("#"): 

162 return False 

163 return stripped != CITATION_BLOCK_SEP 

164 

165 

166def _format_source_ref(rec: CitationRecord) -> str: 

167 """Format a CitationRecord into a human-readable footnote reference.""" 

168 ref = rec["source_filename"] 

169 has_page = rec["page_start"] is not None and rec["page_start"] > 0 

170 has_page_end = rec["page_end"] is not None and rec["page_end"] > 0 

171 has_line = rec["line_start"] is not None and rec["line_start"] > 0 

172 has_line_end = rec["line_end"] is not None and rec["line_end"] > 0 

173 if has_page or has_page_end: 

174 if rec["page_start"] == rec["page_end"]: 

175 ref += f", page {rec['page_start']}" 

176 else: 

177 ref += f", pages {rec['page_start']}-{rec['page_end']}" 

178 elif has_line or has_line_end: 

179 ref += f", lines {rec['line_start']}-{rec['line_end']}" 

180 if rec["excerpt"]: 

181 ref += f', excerpt: "{rec["excerpt"]}"' 

182 return ref 

183 

184 

185def _normalize(text: str) -> str: 

186 """Normalize whitespace for fuzzy excerpt matching.""" 

187 return " ".join(text.split()).lower()