Coverage for src / lilbee / wiki / links.py: 100%

70 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Wiki ``[[link]]`` rewriter. 

2 

3Post-processing pass that rewrites concept and entity surface forms to 

4Obsidian-style ``[[slug]]`` links in the body of a page. Code fences, 

5YAML frontmatter, and the citation block are left untouched so the 

6rewriter can run repeatedly over the same file without corrupting 

7either the provenance trail or illustrative code blocks. 

8""" 

9 

10from __future__ import annotations 

11 

12import re 

13from collections.abc import Mapping 

14from dataclasses import dataclass 

15from types import MappingProxyType 

16 

17from lilbee.wiki.grammar import CITATION_BLOCK_COMMENT, CODE_FENCE_RE 

18 

19 

@dataclass(frozen=True)
class CompiledRewriter:
    """Immutable bundle of the artifacts one link-rewrite pass needs.

    Building the alternation regex, including its longest-first sort, is
    O(M log M) in the number of surface forms. A build that rewrites many
    pages should create this object once and hand it to every page so
    each page only pays for a single ``pattern.sub`` pass.

    ``lookup`` is typed as a read-only ``Mapping``; :func:`compile_rewriter`
    supplies it as a ``MappingProxyType`` so code sharing the rewriter
    cannot mutate it part-way through a batch.
    """

    # Alternation regex over every known surface form.
    pattern: re.Pattern[str]
    # Lowercased surface form -> slug.
    lookup: Mapping[str, str]

34 

35 

def compile_rewriter(surface_to_slug: dict[str, str]) -> CompiledRewriter | None:
    """Build the reusable regex and lowercase lookup for a surface map.

    An empty map yields ``None`` so callers can skip rewriting
    altogether. Otherwise the result holds the compiled alternation
    pattern and a read-only mapping from lowercased surface form to slug.
    """
    if not surface_to_slug:
        return None

    compiled = _compile_surface_pattern(surface_to_slug)

    # Keys are lowercased so a case-insensitive match can be resolved by
    # lowercasing the matched text.
    lowered: dict[str, str] = {}
    for surface, slug in surface_to_slug.items():
        lowered[surface.lower()] = slug

    # The proxy gives shared users a view they cannot mutate.
    return CompiledRewriter(pattern=compiled, lookup=MappingProxyType(lowered))

49 

50 

def rewrite_wiki_links(
    content: str,
    surface_to_slug: dict[str, str],
    skip_slug: str | None = None,
) -> str:
    """Return *content* with known surface forms rewritten to ``[[slug]]``.

    *surface_to_slug* maps a human-readable surface form (for example
    ``"tire pressure"``) to its slug (``"tire-pressure"``). Matching is
    case-insensitive, respects word boundaries, and skips occurrences
    already wrapped in ``[[...]]``. When two surface forms overlap
    (``"ford"`` and ``"ford motor company"``) the longer one wins because
    the alternation regex is ordered longest-first.

    *skip_slug* suppresses self-links: a match that resolves to this slug
    is left as plain text. Callers pass the owning page's slug so that
    ``braking.md`` does not gain a ``[[braking]]`` link to itself. The
    filter runs inside the replace callback, which is O(1) per match;
    pre-filtering the dict would cost O(M) per page.

    Empty *content* or an empty *surface_to_slug* is returned unchanged
    without compiling anything.

    For batch work over many pages, call :func:`compile_rewriter` once
    and pass the result to :func:`apply_rewriter` to avoid repeating the
    compile + sort for every page.
    """
    # Check the cheap conditions first: compiling is O(M log M) in the
    # map size and pointless when there is no text to rewrite.
    if not content or not surface_to_slug:
        return content

    rewriter = compile_rewriter(surface_to_slug)
    if rewriter is None:
        return content
    return apply_rewriter(content, rewriter, skip_slug)

79 

80 

def apply_rewriter(
    content: str,
    rewriter: CompiledRewriter,
    skip_slug: str | None = None,
) -> str:
    """Apply a precompiled rewriter to *content* and return the new text.

    Lines are classified by :func:`_classify_lines`; only lines tagged as
    rewritable body text go through :func:`_rewrite_line`, all others are
    copied verbatim. *skip_slug* is forwarded so matches resolving to
    that slug stay as plain text.

    Every line keeps its original terminator (``\\n``, ``\\r\\n``, ``\\r``
    and the other boundaries recognised by ``str.splitlines``), so the
    output differs from the input only where a link was inserted. Empty
    *content* is returned unchanged.
    """
    if not content:
        return content

    # keepends=True leaves each terminator attached to its line, so the
    # pieces can be glued back with "" and round-trip exactly. Joining
    # bare lines with "\n" would turn CRLF files into LF and drop a
    # trailing lone "\r".
    lines = content.splitlines(keepends=True)
    rewritten = [
        _rewrite_line(line, rewriter.pattern, rewriter.lookup, skip_slug) if writable else line
        for line, writable in _classify_lines(lines)
    ]
    return "".join(rewritten)

100 

101 

def _compile_surface_pattern(surface_to_slug: dict[str, str]) -> re.Pattern[str]:
    """Compile one case-insensitive alternation regex, longest form first.

    Longest-first ordering makes ``"ford motor company"`` beat ``"ford"``
    when both are keys. The lookbehind rejects a preceding word character
    or ``[`` and the lookahead rejects a following word character or
    ``]``, so matches respect word boundaries and never land inside an
    existing ``[[...]]`` link.

    Empty-string keys are ignored. If no non-empty key remains, the
    returned pattern never matches anything.
    """
    surfaces = sorted((s for s in surface_to_slug if s), key=len, reverse=True)
    if not surfaces:
        # An empty alternation would match the empty string wherever the
        # lookarounds allow it (for example between "," and " "), which
        # would inject links into plain punctuation. "(?!)" cannot match.
        return re.compile(r"(?!)")

    alternation = "|".join(re.escape(surface) for surface in surfaces)
    return re.compile(
        r"(?<![\w\[])(" + alternation + r")(?![\w\]])",
        re.IGNORECASE,
    )

116 

117 

def _rewrite_line(
    line: str,
    pattern: re.Pattern[str],
    lookup: Mapping[str, str],
    skip_slug: str | None = None,
) -> str:
    """Return *line* with every *pattern* match replaced by ``[[slug]]``.

    The slug is found by looking up the lowercased match text in
    *lookup*, falling back to its casefolded form. A match is left as
    plain text when it resolves to *skip_slug* or to no slug at all.
    """

    def replace(match: re.Match[str]) -> str:
        surface = match.group(0)
        slug = lookup.get(surface.lower())
        if slug is None:
            # A case-insensitive regex can accept text whose lower() is
            # not the stored key (e.g. "ſ" matches "s" but lowercases to
            # itself). casefold() covers such characters.
            slug = lookup.get(surface.casefold())
        if slug is None or slug == skip_slug:
            # Unknown surface or a self-link: keep the original text
            # rather than raising or linking.
            return surface
        return f"[[{slug}]]"

    return pattern.sub(replace, line)

131 

132 

def _classify_lines(lines: list[str]) -> list[tuple[str, bool]]:
    """Pair each line with a flag saying whether links may be rewritten in it.

    A line is flagged ``False`` when it belongs to the leading YAML
    frontmatter (opened by ``---`` on the very first line), to the
    citation block, or to a fenced code block, delimiters included.
    Every other line is flagged ``True``. Lines are returned unmodified
    and in their original order.
    """
    result: list[tuple[str, bool]] = []
    frontmatter_open = False
    fence_open = False
    citation_reached = False

    for position, raw in enumerate(lines):
        text = raw.strip()
        writable = False

        if position == 0 and text == "---":
            # Frontmatter can only start on the first line of the page.
            frontmatter_open = True
        elif frontmatter_open:
            # The closing "---" is still frontmatter; it just ends the region.
            frontmatter_open = text != "---"
        elif citation_reached or text == CITATION_BLOCK_COMMENT:
            # The citation block is terminal: once its marker is seen,
            # everything up to the end of the page belongs to it.
            citation_reached = True
        elif CODE_FENCE_RE.match(text):
            # A fence delimiter flips the state and is itself protected.
            fence_open = not fence_open
        elif not fence_open:
            writable = True

        result.append((raw, writable))
    return result