Coverage for src / lilbee / wiki / links.py: 100%
70 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
"""Wiki ``[[link]]`` rewriter.

Post-processing pass that rewrites concept and entity surface forms to
Obsidian-style ``[[slug]]`` links in the body of a page. Code fences,
YAML frontmatter, and the citation block are left untouched so the
rewriter can run repeatedly over the same file without corrupting
either the provenance trail or illustrative code blocks.
"""
10from __future__ import annotations
12import re
13from collections.abc import Mapping
14from dataclasses import dataclass
15from types import MappingProxyType
17from lilbee.wiki.grammar import CITATION_BLOCK_COMMENT, CODE_FENCE_RE
@dataclass(frozen=True)
class CompiledRewriter:
    """Reusable, precompiled state for rewriting a batch of pages.

    Compiling the alternation regex and sorting the surfaces
    longest-first costs O(M log M) in the size of the surface map.
    A build that rewrites P pages should pay that cost once and reuse
    the result, leaving a single ``pattern.sub`` pass per page. The
    ``lookup`` is handed out as a read-only view so a caller cannot
    accidentally mutate a rewriter that is shared across a loop.
    """

    # Alternation regex over every surface form, longest alternative first.
    pattern: re.Pattern[str]
    # Lowercased surface form -> slug.
    lookup: Mapping[str, str]
def compile_rewriter(surface_to_slug: dict[str, str]) -> CompiledRewriter | None:
    """Compile the regex + lowercase lookup for a surface-to-slug map.

    Empty-string surfaces are ignored: they can never be a meaningful
    link target, and an alternation built only from them would match
    the empty string and insert links at zero-width positions.

    Returns ``None`` when the map holds no usable (non-empty) surface
    so the caller can short-circuit.
    """
    usable = {surface: slug for surface, slug in surface_to_slug.items() if surface}
    if not usable:
        return None
    return CompiledRewriter(
        pattern=_compile_surface_pattern(usable),
        # Read-only view: the rewriter is meant to be shared across pages.
        lookup=MappingProxyType(
            {surface.lower(): slug for surface, slug in usable.items()}
        ),
    )
def rewrite_wiki_links(
    content: str,
    surface_to_slug: dict[str, str],
    skip_slug: str | None = None,
) -> str:
    """Return *content* with slug surface forms rewritten to ``[[slug]]``.

    *surface_to_slug* maps the human-readable surface form (e.g.
    ``"tire pressure"``) to its slug (``"tire-pressure"``). Matching is
    case-insensitive, respects word boundaries, and skips occurrences
    already wrapped in ``[[...]]``. When two surface forms overlap
    (e.g. ``"ford"`` and ``"ford motor company"``) the longer form
    wins, since the alternation regex is ordered longest-first.

    *skip_slug* suppresses self-links: a match that resolves to this
    slug is left as raw text. Callers pass the owning page's slug so
    ``braking.md`` does not gain a ``[[braking]]`` reference to itself.
    Filtering in the replace callback is O(1) per match; pre-filtering
    the dict would be O(M) per page.

    For batch work over many pages, call :func:`compile_rewriter` once
    and pass the result to :func:`apply_rewriter` to skip the per-call
    compile + sort.
    """
    # Check the cheap condition first: there is no point sorting and
    # compiling the whole surface map for a page with nothing in it.
    if not content:
        return content
    rewriter = compile_rewriter(surface_to_slug)
    if rewriter is None:
        return content
    return apply_rewriter(content, rewriter, skip_slug)
def apply_rewriter(
    content: str,
    rewriter: CompiledRewriter,
    skip_slug: str | None = None,
) -> str:
    """Apply a precompiled rewriter to *content*, returning the rewritten text.

    The content is processed line by line. Lines that
    :func:`_classify_lines` tags as non-writable are passed through
    unchanged; every other line goes through :func:`_rewrite_line`.
    *skip_slug* is forwarded so matches resolving to that slug stay as
    raw text. Empty *content* is returned as-is.
    """
    if not content:
        return content

    # keepends=True keeps each line's own terminator attached, so CRLF
    # files (and any other separator ``str.splitlines`` recognises) come
    # back with the same line endings instead of being normalised to "\n".
    lines = content.splitlines(keepends=True)
    return "".join(
        _rewrite_line(line, rewriter.pattern, rewriter.lookup, skip_slug) if writable else line
        for line, writable in _classify_lines(lines)
    )
102def _compile_surface_pattern(surface_to_slug: dict[str, str]) -> re.Pattern[str]:
103 """Compile one alternation regex ordered longest-first.
105 Longest-first matters so ``"ford motor company"`` beats ``"ford"``
106 when both are in the slug set. The lookbehind blocks matching
107 inside an existing ``[[...]]`` link by rejecting a preceding ``[``,
108 and the lookahead blocks matching inside the closing ``]]``.
109 """
110 sorted_surfaces = sorted(surface_to_slug, key=len, reverse=True)
111 alternation = "|".join(re.escape(s) for s in sorted_surfaces if s)
112 return re.compile(
113 r"(?<![\w\[])(" + alternation + r")(?![\w\]])",
114 re.IGNORECASE,
115 )
118def _rewrite_line(
119 line: str,
120 pattern: re.Pattern[str],
121 lookup: Mapping[str, str],
122 skip_slug: str | None = None,
123) -> str:
124 def replace(match: re.Match[str]) -> str:
125 slug = lookup[match.group(0).lower()]
126 if slug == skip_slug:
127 return match.group(0)
128 return f"[[{slug}]]"
130 return pattern.sub(replace, line)
133def _classify_lines(lines: list[str]) -> list[tuple[str, bool]]:
134 """Tag each line with whether it's part of a rewritable body region."""
135 tagged: list[tuple[str, bool]] = []
136 in_frontmatter = False
137 in_code_fence = False
138 in_citation = False
140 for idx, line in enumerate(lines):
141 stripped = line.strip()
143 if idx == 0 and stripped == "---":
144 in_frontmatter = True
145 tagged.append((line, False))
146 continue
147 if in_frontmatter:
148 tagged.append((line, False))
149 if stripped == "---":
150 in_frontmatter = False
151 continue
153 # The citation block is terminal: once its comment marker appears
154 # every following line is citation, so ``in_citation`` never resets.
155 if stripped == CITATION_BLOCK_COMMENT:
156 in_citation = True
157 if in_citation:
158 tagged.append((line, False))
159 continue
161 if CODE_FENCE_RE.match(stripped):
162 tagged.append((line, False))
163 in_code_fence = not in_code_fence
164 continue
165 if in_code_fence:
166 tagged.append((line, False))
167 continue
169 tagged.append((line, True))
170 return tagged