Coverage for src / lilbee / wiki / citation.py: 100%
106 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Parse, render, and verify wiki citations.
3Pure functions — no LLM dependency. Operates on markdown text and citation records.
4"""
6from dataclasses import dataclass
7from enum import Enum
9from lilbee.store import CitationRecord
10from lilbee.wiki.grammar import (
11 CITATION_BLOCK_COMMENT,
12 CITATION_BLOCK_SEP,
13 CITE_RE,
14 FOOTNOTE_RE,
15 INFERENCE_RE,
16)
class CitationStatus(Enum):
    """Possible outcomes when a citation is checked against its source."""

    # The citation's excerpt was found in the source text.
    VALID = "valid"
    # The source hash no longer matches; determined by the caller, not by
    # verify_citation.
    STALE_HASH = "stale_hash"
    # The source is no longer present; determined by the caller, not by
    # verify_citation.
    SOURCE_DELETED = "source_deleted"
    # The excerpt is empty or could not be found in the source text.
    EXCERPT_MISSING = "excerpt_missing"
@dataclass(frozen=True)
class ParsedCitation:
    """One citation footnote definition located in wiki markdown.

    Instances are immutable (the dataclass is frozen).
    """

    # Footnote key, e.g. "src1".
    citation_key: str
    # Human-readable reference, e.g. "python-docs/typing.md, lines 12-45".
    source_ref: str
    # 1-based line number of the definition within the markdown.
    line_number: int
def parse_wiki_citations(markdown: str) -> list[ParsedCitation]:
    """Collect the citation footnote definitions found in wiki markdown.

    If the auto-generated block comment exists, only lines from that
    comment onward are examined. If it is absent (e.g. a looser model
    omitted it), every line of the document is examined instead; a
    ``[^srcN]: ...`` definition line is distinctive enough to be treated
    as a citation footnote wherever it appears.

    Returns one ``ParsedCitation`` per matching line, in document order,
    each carrying its 1-based line number.
    """
    # A missing block (None) and a block on the very first line (0) both
    # mean "scan from the top".
    first_idx = _find_citation_block_start(markdown) or 0

    found: list[ParsedCitation] = []
    for number, text in enumerate(markdown.splitlines(), start=1):
        # ``number`` is 1-based, so lines before the block satisfy
        # number <= first_idx.
        if number <= first_idx:
            continue
        hit = FOOTNOTE_RE.match(text)
        if hit is None:
            continue
        found.append(
            ParsedCitation(
                citation_key=hit.group(1),
                source_ref=hit.group(2).strip(),
                line_number=number,
            )
        )
    return found
def render_citation_block(citations: list[CitationRecord]) -> str:
    """Build the markdown footnote footer for a list of citation records.

    The result starts with the block separator and the block comment,
    followed by one ``[^key]: reference`` line per record, and ends with
    a newline. An empty input yields an empty string.
    """
    if not citations:
        return ""
    footnotes = [
        f"[^{record['citation_key']}]: {_format_source_ref(record)}"
        for record in citations
    ]
    block = [CITATION_BLOCK_SEP, CITATION_BLOCK_COMMENT, *footnotes]
    return "\n".join(block) + "\n"
def verify_citation(citation: CitationRecord, source_text: str) -> CitationStatus:
    """Check whether a citation's excerpt exists in the source text.

    Matching is case-insensitive and ignores differences in whitespace
    (both sides are passed through ``_normalize``).

    Does not check hash staleness or source existence — caller handles those
    by comparing ``citation.source_hash`` against the current file hash and
    checking file presence.

    Returns ``CitationStatus.VALID`` when the normalized excerpt occurs in
    the normalized source text, otherwise ``CitationStatus.EXCERPT_MISSING``
    (including when the excerpt is empty or consists only of whitespace).
    """
    # Normalize before testing for emptiness: a whitespace-only excerpt
    # collapses to "", and "" is a substring of every string, which would
    # otherwise be reported as VALID.
    needle = _normalize(citation["excerpt"] or "")
    if not needle:
        return CitationStatus.EXCERPT_MISSING
    if needle in _normalize(source_text):
        return CitationStatus.VALID
    return CitationStatus.EXCERPT_MISSING
def find_unmarked_claims(markdown: str) -> list[str]:
    """List body lines that carry neither a ``[^srcN]`` citation nor ``[*inference*]``.

    Only the body is inspected (frontmatter and the citation block are
    removed first). Blank lines, headings and the separator line are
    ignored. Each returned entry is the whitespace-stripped text of one
    unmarked line, in document order.
    """
    candidates = (raw.strip() for raw in extract_body(markdown).splitlines())
    return [
        text
        for text in candidates
        if _is_content_line(text)
        and not (CITE_RE.search(text) or INFERENCE_RE.search(text))
    ]
def strip_citation_block(markdown: str) -> str:
    """Drop the auto-generated citation block (separator, comment, footnotes).

    When no block comment is found the input is returned unchanged.
    Otherwise the text before the block is returned with trailing
    whitespace removed and a single newline appended.
    """
    comment_idx = _find_citation_block_start(markdown)
    if comment_idx is not None:
        all_lines = markdown.splitlines()
        cut = _body_end_before_citations(all_lines, comment_idx)
        kept = "\n".join(all_lines[:cut])
        return kept.rstrip() + "\n"
    return markdown
118def _find_citation_block_start(markdown: str) -> int | None:
119 """Return the 0-based line index where the citation block begins, or None."""
120 lines = markdown.splitlines()
121 for i, line in enumerate(lines):
122 if line.strip() == CITATION_BLOCK_COMMENT:
123 return i
124 return None
127def _body_end_before_citations(lines: list[str], block_start: int) -> int:
128 """Return the line index to truncate at, stripping the --- separator if present."""
129 body_end = block_start
130 if body_end > 0 and lines[body_end - 1].strip() == CITATION_BLOCK_SEP:
131 body_end -= 1
132 return body_end
def extract_body(markdown: str) -> str:
    """Return only the body of a wiki page.

    YAML frontmatter is removed first; if a citation block is then found,
    everything from that block (and its separator line, if present)
    onward is dropped as well. Without a citation block, the
    frontmatter-free text is returned as is.
    """
    without_frontmatter = _strip_frontmatter(markdown)
    comment_idx = _find_citation_block_start(without_frontmatter)
    if comment_idx is not None:
        all_lines = without_frontmatter.splitlines()
        cut = _body_end_before_citations(all_lines, comment_idx)
        return "\n".join(all_lines[:cut])
    return without_frontmatter
146def _strip_frontmatter(markdown: str) -> str:
147 """Remove YAML frontmatter delimited by ``---`` at the start."""
148 if not markdown.startswith("---"):
149 return markdown
150 lines = markdown.splitlines()
151 for i in range(1, len(lines)):
152 if lines[i].strip() == "---":
153 return "\n".join(lines[i + 1 :])
154 return markdown
157def _is_content_line(stripped: str) -> bool:
158 """Return True if a line contains a substantive claim (not heading/blank/marker)."""
159 if not stripped:
160 return False
161 if stripped.startswith("#"):
162 return False
163 return stripped != CITATION_BLOCK_SEP
def _format_source_ref(rec: CitationRecord) -> str:
    """Format a CitationRecord into a human-readable footnote reference.

    The reference starts with ``source_filename`` and is extended with:

    * ``, page N`` / ``, pages A-B`` when a page position is set, else
    * ``, lines A-B`` when a line position is set;
    * ``, excerpt: "..."`` when the excerpt is non-empty.

    A position value counts as set only when it is not None and greater
    than zero. If just one end of a range is set, that value is used for
    both ends, so the output never contains ``None`` or ``0`` as a bound.
    """

    def _set_or_none(value):
        # None and non-positive numbers both mean "no position recorded".
        return value if value is not None and value > 0 else None

    ref = rec["source_filename"]
    page_start = _set_or_none(rec["page_start"])
    page_end = _set_or_none(rec["page_end"])
    line_start = _set_or_none(rec["line_start"])
    line_end = _set_or_none(rec["line_end"])

    if page_start is not None or page_end is not None:
        # Fall back to the other endpoint when one is missing.
        first = page_start if page_start is not None else page_end
        last = page_end if page_end is not None else page_start
        if first == last:
            ref += f", page {first}"
        else:
            ref += f", pages {first}-{last}"
    elif line_start is not None or line_end is not None:
        first = line_start if line_start is not None else line_end
        last = line_end if line_end is not None else line_start
        # Line spans are always rendered as a range, even when both ends
        # are equal.
        ref += f", lines {first}-{last}"

    if rec["excerpt"]:
        ref += f', excerpt: "{rec["excerpt"]}"'
    return ref
185def _normalize(text: str) -> str:
186 """Normalize whitespace for fuzzy excerpt matching."""
187 return " ".join(text.split()).lower()