Coverage for src/lilbee/wiki/citation.py: 100%

1"""Parse, render, and verify wiki citations.

3Pure functions — no LLM dependency. Operates on markdown text and citation records.

4"""

6from dataclasses import dataclass

7from enum import Enum

9from lilbee.store import CitationRecord

10from lilbee.wiki.grammar import (

11 CITATION_BLOCK_COMMENT,

12 CITATION_BLOCK_SEP,

13 CITE_RE,

14 FOOTNOTE_RE,

15 INFERENCE_RE,

16)

class CitationStatus(Enum):
    """Outcome of checking a single citation against the source it points to."""

    # The excerpt was located in the source text.
    VALID = "valid"
    # NOTE(review): not produced in this module; presumably assigned by callers
    # that compare the stored source hash with the current file hash — confirm.
    STALE_HASH = "stale_hash"
    # NOTE(review): not produced in this module; presumably assigned by callers
    # when the cited source file no longer exists — confirm.
    SOURCE_DELETED = "source_deleted"
    # The excerpt is empty or could not be found in the source text.
    EXCERPT_MISSING = "excerpt_missing"

@dataclass(frozen=True)
class ParsedCitation:
    """One footnote-style citation definition located in wiki markdown.

    Instances are immutable value objects.
    """

    # Footnote key as written in the markdown, e.g. "src1".
    citation_key: str
    # Human-readable reference text, e.g. "python-docs/typing.md, lines 12-45".
    source_ref: str
    # 1-based line number of the definition within the markdown.
    line_number: int

def parse_wiki_citations(markdown: str) -> list[ParsedCitation]:
    """Collect the citation footnote definitions found in *markdown*.

    Scanning starts at the auto-generated citation block comment when the
    document contains one. If the comment is missing (e.g. a looser model
    omitted it), every line of the document is examined instead, relying on
    ``FOOTNOTE_RE`` to pick out ``[^srcN]: ...`` definition lines.

    Returns one ``ParsedCitation`` per matching line, in document order,
    each carrying the 1-based line number it was found on.
    """
    first_idx = _find_citation_block_start(markdown)
    if first_idx is None:
        first_idx = 0

    found: list[ParsedCitation] = []
    for number, text in enumerate(markdown.splitlines(), start=1):
        # ``number`` is 1-based, so lines at 0-based index < first_idx are skipped.
        if number <= first_idx:
            continue
        hit = FOOTNOTE_RE.match(text)
        if not hit:
            continue
        found.append(
            ParsedCitation(
                citation_key=hit.group(1),
                source_ref=hit.group(2).strip(),
                line_number=number,
            )
        )
    return found

def render_citation_block(citations: list[CitationRecord]) -> str:
    """Build the markdown footnote footer for the given citation records.

    The result starts with the block separator and the block comment,
    followed by one ``[^key]: reference`` line per record, and ends with a
    newline. An empty ``citations`` list yields an empty string.
    """
    if not citations:
        return ""
    footnotes = [
        f"[^{rec['citation_key']}]: {_format_source_ref(rec)}" for rec in citations
    ]
    return "\n".join([CITATION_BLOCK_SEP, CITATION_BLOCK_COMMENT, *footnotes]) + "\n"

def verify_citation(citation: CitationRecord, source_text: str) -> CitationStatus:
    """Check whether a citation's excerpt exists in the source text.

    Matching is done on whitespace-collapsed, lower-cased text (see
    ``_normalize``). An excerpt that is missing, empty, or consists only of
    whitespace is reported as ``EXCERPT_MISSING``.

    Does not check hash staleness or source existence — the caller handles
    those by comparing ``citation.source_hash`` against the current file hash
    and checking file presence.

    Returns ``CitationStatus.VALID`` when the excerpt is found, otherwise
    ``CitationStatus.EXCERPT_MISSING``.
    """
    # Normalize before the emptiness test: a whitespace-only excerpt collapses
    # to "", and "" is a substring of every string, which would otherwise be
    # reported as VALID.
    needle = _normalize(citation["excerpt"] or "")
    if not needle:
        return CitationStatus.EXCERPT_MISSING
    if needle in _normalize(source_text):
        return CitationStatus.VALID
    return CitationStatus.EXCERPT_MISSING

def find_unmarked_claims(markdown: str) -> list[str]:
    """List body lines that carry neither a citation nor an inference marker.

    Only the body is inspected (frontmatter and the citation block are
    removed by ``extract_body``). Lines rejected by ``_is_content_line``
    are ignored. Each returned entry is the stripped text of a line that
    matches neither ``CITE_RE`` nor ``INFERENCE_RE``.
    """
    candidates = (raw.strip() for raw in extract_body(markdown).splitlines())
    return [
        text
        for text in candidates
        if _is_content_line(text)
        and not (CITE_RE.search(text) or INFERENCE_RE.search(text))
    ]

106

107

def strip_citation_block(markdown: str) -> str:
    """Drop the auto-generated citation block from *markdown*.

    The block consists of the separator, the block comment, and everything
    after it. When no block comment is present the input is returned
    unchanged; otherwise the remaining text is right-stripped and terminated
    with a single newline.
    """
    marker_idx = _find_citation_block_start(markdown)
    if marker_idx is not None:
        rows = markdown.splitlines()
        kept = rows[: _body_end_before_citations(rows, marker_idx)]
        return "\n".join(kept).rstrip() + "\n"
    return markdown

116

117

def _find_citation_block_start(markdown: str) -> int | None:
    """Locate the citation block comment.

    Returns the 0-based index of the first line whose stripped text equals
    ``CITATION_BLOCK_COMMENT``, or ``None`` when no such line exists.
    """
    matching_indices = (
        idx
        for idx, row in enumerate(markdown.splitlines())
        if row.strip() == CITATION_BLOCK_COMMENT
    )
    return next(matching_indices, None)

125

126

def _body_end_before_citations(lines: list[str], block_start: int) -> int:
    """Compute where the body ends given the citation block's start index.

    If the line immediately before ``block_start`` is the block separator,
    that line is excluded as well; otherwise ``block_start`` is returned
    unchanged.
    """
    if block_start <= 0:
        # No preceding line to inspect.
        return block_start
    preceded_by_sep = lines[block_start - 1].strip() == CITATION_BLOCK_SEP
    return block_start - 1 if preceded_by_sep else block_start

133

134

def extract_body(markdown: str) -> str:
    """Return the markdown body with frontmatter and citation block removed.

    Frontmatter is stripped first; the citation block (and its preceding
    separator, if any) is then cut from the remainder. When no citation
    block comment is found, the frontmatter-stripped text is returned as is.
    """
    content = _strip_frontmatter(markdown)
    marker_idx = _find_citation_block_start(content)
    if marker_idx is not None:
        rows = content.splitlines()
        return "\n".join(rows[: _body_end_before_citations(rows, marker_idx)])
    return content

144

145

146def _strip_frontmatter(markdown: str) -> str:

147 """Remove YAML frontmatter delimited by ``---`` at the start."""

148 if not markdown.startswith("---"):

149 return markdown

150 lines = markdown.splitlines()

151 for i in range(1, len(lines)):

152 if lines[i].strip() == "---":

153 return "\n".join(lines[i + 1 :])

154 return markdown

155

156

def _is_content_line(stripped: str) -> bool:
    """Tell whether an already-stripped line should be treated as a claim.

    Empty lines, lines starting with ``#`` (headings), and the citation
    block separator are not content; anything else is.
    """
    return (
        bool(stripped)
        and not stripped.startswith("#")
        and stripped != CITATION_BLOCK_SEP
    )

164

165

def _format_source_ref(rec: CitationRecord) -> str:
    """Format a CitationRecord into a human-readable footnote reference.

    The reference starts with ``source_filename``. A page location is
    appended when at least one page bound is a positive number; otherwise a
    line location is appended when at least one line bound is positive.
    A bound that is ``None`` or not positive is treated as missing and is
    filled in from the other bound, so the output never contains ``None``
    or ``0``. A non-empty excerpt is appended last, in double quotes.

    Examples of the location part: ``, page 3``, ``, pages 3-5``,
    ``, lines 12-45``.
    """

    def _span(start: int | None, end: int | None) -> tuple[int, int] | None:
        """Return (first, last) with a missing bound mirrored, or None if both are missing."""
        has_start = start is not None and start > 0
        has_end = end is not None and end > 0
        if has_start and has_end:
            return start, end
        if has_start:
            return start, start
        if has_end:
            return end, end
        return None

    ref = rec["source_filename"]
    pages = _span(rec["page_start"], rec["page_end"])
    line_span = _span(rec["line_start"], rec["line_end"])
    if pages is not None:
        first, last = pages
        if first == last:
            ref += f", page {first}"
        else:
            ref += f", pages {first}-{last}"
    elif line_span is not None:
        first, last = line_span
        # Line locations always use the range form, including single lines.
        ref += f", lines {first}-{last}"
    if rec["excerpt"]:
        ref += f', excerpt: "{rec["excerpt"]}"'
    return ref

183

184

185def _normalize(text: str) -> str:

186 """Normalize whitespace for fuzzy excerpt matching."""

187 return " ".join(text.split()).lower()

Coverage for src / lilbee / wiki / citation.py: 100%

106 statements