Coverage for src / lilbee / code_chunker.py: 100%

91 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Code chunking via tree-sitter AST analysis. 

2 

3Extracts structured symbol information (functions, classes, imports) 

4and builds enriched chunk headers with symbol metadata. 

5""" 

6 

7import logging 

8from dataclasses import dataclass 

9from pathlib import Path 

10from typing import Any 

11 

12from tree_sitter_language_pack import ( 

13 ProcessConfig, 

14 detect_language, # TODO: use public API once tree-sitter-language-pack >= 1.3.4 

15 has_language, 

16 init, 

17 process, 

18) 

19 

20from lilbee.chunk import chunk_text 

21from lilbee.config import cfg 

22 

23log = logging.getLogger(__name__) 

24 

25 

@dataclass
class SymbolInfo:
    """Extracted symbol metadata from tree-sitter process()."""

    name: str        # symbol identifier; "" when the entry had no name
    kind: str        # symbol kind, lowercased by the extractor (e.g. "function")
    line_start: int  # 1-based first line of the symbol's span
    line_end: int    # 1-based last line of the symbol's span
    text: str        # raw source text sliced from the symbol's byte span

35 

36 

@dataclass
class CodeChunk:
    """A chunk of source code with line location metadata."""

    chunk: str        # chunk text (may be prefixed with a metadata header)
    line_start: int   # 1-based first source line covered by the chunk
    line_end: int     # 1-based last source line covered by the chunk
    chunk_index: int  # position of this chunk within the file's chunk list

45 

46 

def _detect_language(file_path: Path) -> str | None:
    """Map *file_path* to a tree-sitter language identifier.

    Returns None when tree-sitter-language-pack does not recognize
    the file's extension.
    """
    lang: str | None = detect_language(str(file_path))
    return lang

51 

52 

def _ensure_language(lang: str) -> bool:
    """Make sure the parser for *lang* is installed, downloading on demand.

    Returns True when the parser is available afterwards. Any failure
    (network error, unknown language, ...) is logged at debug level and
    reported as False rather than raised.
    """
    try:
        available = has_language(lang)
        if not available:
            init({"languages": [lang]})
            available = has_language(lang)
        return available
    except Exception:
        log.debug("Failed to download tree-sitter language: %s", lang)
        return False

63 

64 

def find_line(needle: str, lines: list[str], start: int) -> int:
    """Return the 1-based number of the first line at or after *start*
    (a 0-based index) that contains *needle*.

    An empty needle never matches. When nothing matches, ``start + 1``
    is returned as a best-effort approximation.
    """
    if not needle:
        return start + 1
    for offset, line in enumerate(lines[start:]):
        if needle in line:
            return start + offset + 1
    return start + 1

71 

72 

def _fallback_chunks(text: str) -> list[CodeChunk]:
    """Token-based chunking used when AST chunking is unavailable.

    Line positions are recovered by searching for each chunk's first
    line (truncated to 80 chars) in the original text, so the reported
    ranges are approximate.
    """
    lines = text.split("\n")
    chunks: list[CodeChunk] = []
    cursor = 0  # line index to resume searching from, keeps matches monotonic

    for index, piece in enumerate(chunk_text(text)):
        needle = piece.split("\n")[0][:80]
        start = find_line(needle, lines, cursor)
        end = min(start + piece.count("\n"), len(lines))
        chunks.append(
            CodeChunk(
                chunk=piece,
                line_start=start,
                line_end=end,
                chunk_index=index,
            )
        )
        cursor = start

    return chunks

95 

96 

def _extract_symbols(result: Any, source_text: str) -> list[SymbolInfo]:
    """Convert the raw ``structure`` entries of a process() result into
    typed SymbolInfo records.

    Non-dict entries are skipped. Missing span fields default to the
    whole source; tree-sitter line numbers are 0-based and are shifted
    to 1-based here.
    """
    structure = result.get("structure", [])
    if not isinstance(structure, list):
        return []

    extracted: list[SymbolInfo] = []
    for item in structure:
        if not isinstance(item, dict):
            continue
        span = item.get("span", {})
        begin = span.get("start_byte", 0)
        finish = span.get("end_byte", len(source_text))
        extracted.append(
            SymbolInfo(
                name=str(item.get("name", "")),
                kind=str(item.get("kind", "")).lower(),
                line_start=int(span.get("start_line", 0)) + 1,
                line_end=int(span.get("end_line", 0)) + 1,
                text=source_text[begin:finish],
            )
        )
    return extracted

119 

120 

def chunk_code(file_path: Path) -> list[CodeChunk]:
    """Chunk a source file using tree-sitter-language-pack's process() API.

    Structural symbols (functions, classes) become individual chunks,
    each prefixed with a metadata header naming the file, the symbol,
    and its line range. Token-based chunking is the fallback whenever
    the language is unsupported or parsing fails.
    """
    source_text = file_path.read_text(encoding="utf-8", errors="replace")
    if not source_text.strip():
        return []

    lang = _detect_language(file_path)
    if not lang:
        return _fallback_chunks(source_text)

    try:
        if not _ensure_language(lang):
            return _fallback_chunks(source_text)
        result = process(
            source_text,
            ProcessConfig(
                lang,
                structure=True,
                symbols=True,
                docstrings=True,
                chunk_max_size=cfg.chunk_size,
            ),
        )
    except Exception:
        log.debug("tree-sitter process() failed for %s", file_path, exc_info=True)
        return _fallback_chunks(source_text)

    symbols = _extract_symbols(result, source_text)
    if not symbols:
        return _fallback_chunks(source_text)

    chunks: list[CodeChunk] = []
    for index, sym in enumerate(symbols):
        # Header example: "# File: a.py | function: foo (lines 3-9)"
        parts = [f"# File: {file_path}"]
        if sym.name and sym.kind:
            parts.append(f"{sym.kind}: {sym.name}")
        header = " | ".join(parts) + f" (lines {sym.line_start}-{sym.line_end})"

        chunks.append(
            CodeChunk(
                chunk=f"{header}\n\n{sym.text}",
                line_start=sym.line_start,
                line_end=sym.line_end,
                chunk_index=index,
            )
        )

    return chunks

171 

172 

def is_code_file(file_path: Path) -> bool:
    """Return True when tree-sitter chunking supports *file_path*."""
    lang = detect_language(str(file_path))
    return lang is not None