Coverage for src / lilbee / code_chunker.py: 100%

91 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Code chunking via tree-sitter AST analysis. 

2 

3Extracts structured symbol information (functions, classes, imports) 

4and builds enriched chunk headers with symbol metadata. 

5""" 

6 

7import logging 

8from dataclasses import dataclass 

9from pathlib import Path 

10from typing import Any 

11 

12from tree_sitter_language_pack import ( 

13 ProcessConfig, 

14 detect_language, # TODO: use public API once tree-sitter-language-pack >= 1.3.4 

15 has_language, 

16 init, 

17 process, 

18) 

19 

20from lilbee.chunk import chunk_text 

21from lilbee.config import cfg 

22 

23log = logging.getLogger(__name__) 

24 

25 

@dataclass
class SymbolInfo:
    """Extracted symbol metadata from tree-sitter process()."""

    name: str        # symbol identifier; "" when the entry had no name
    kind: str        # symbol kind, lowercased by the extractor (e.g. "function")
    line_start: int  # 1-based first line of the symbol's span
    line_end: int    # 1-based last line of the symbol's span
    text: str        # raw source text sliced from the symbol's byte span

35 

36 

@dataclass
class CodeChunk:
    """A chunk of source code with line location metadata."""

    chunk: str        # chunk text (may be prefixed with a metadata header)
    line_start: int   # 1-based first source line covered by the chunk
    line_end: int     # 1-based last source line covered by the chunk
    chunk_index: int  # position of this chunk within the file's chunk list

45 

46 

def _detect_language(file_path: Path) -> str | None:
    """Map *file_path* to a tree-sitter language identifier.

    Returns None when tree-sitter-language-pack does not recognize
    the file's extension.
    """
    lang: str | None = detect_language(str(file_path))
    return lang

51 

52 

def _ensure_language(lang: str) -> bool:
    """Make sure the parser for *lang* is installed, downloading on demand.

    Returns True when the parser is available afterwards. Any failure
    (network error, unknown language, ...) is logged at debug level and
    reported as False rather than raised.
    """
    try:
        available = has_language(lang)
        if not available:
            init({"languages": [lang]})
            available = has_language(lang)
        return available
    except Exception:
        log.debug("Failed to download tree-sitter language: %s", lang)
        return False

63 

64 

def find_line(needle: str, lines: list[str], start: int) -> int:
    """Return the 1-based number of the first line at or after *start*
    (a 0-based index) that contains *needle*.

    An empty needle never matches. When nothing matches, ``start + 1``
    is returned as a best-effort approximation.
    """
    if not needle:
        return start + 1
    for offset, line in enumerate(lines[start:]):
        if needle in line:
            return start + offset + 1
    return start + 1

71 

72 

def _fallback_chunks(text: str) -> list[CodeChunk]:
    """Token-based chunking used when AST chunking is unavailable.

    Line positions are recovered by searching for each chunk's first
    line (truncated to 80 chars) in the original text, so the reported
    ranges are approximate.
    """
    lines = text.split("\n")
    chunks: list[CodeChunk] = []
    cursor = 0  # line index to resume searching from, keeps matches monotonic

    for index, piece in enumerate(chunk_text(text)):
        needle = piece.split("\n")[0][:80]
        start = find_line(needle, lines, cursor)
        end = min(start + piece.count("\n"), len(lines))
        chunks.append(
            CodeChunk(
                chunk=piece,
                line_start=start,
                line_end=end,
                chunk_index=index,
            )
        )
        cursor = start

    return chunks

95 

96 

def _extract_symbols(result: Any, source_text: str) -> list[SymbolInfo]:
    """Convert the raw ``structure`` entries of a process() result into
    typed SymbolInfo records.

    Non-dict entries are skipped. Missing span fields default to the
    whole source; tree-sitter line numbers are 0-based and are shifted
    to 1-based here.
    """
    structure = result.get("structure", [])
    if not isinstance(structure, list):
        return []

    extracted: list[SymbolInfo] = []
    for item in structure:
        if not isinstance(item, dict):
            continue
        span = item.get("span", {})
        begin = span.get("start_byte", 0)
        finish = span.get("end_byte", len(source_text))
        extracted.append(
            SymbolInfo(
                name=str(item.get("name", "")),
                kind=str(item.get("kind", "")).lower(),
                line_start=int(span.get("start_line", 0)) + 1,
                line_end=int(span.get("end_line", 0)) + 1,
                text=source_text[begin:finish],
            )
        )
    return extracted

119 

120 

def chunk_code(file_path: Path) -> list[CodeChunk]:
    """Chunk a source file using tree-sitter-language-pack's process() API.

    Structural symbols (functions, classes) become individual chunks,
    each prefixed with a metadata header naming the file, the symbol,
    and its line range. Token-based chunking is the fallback whenever
    the language is unsupported or parsing fails.
    """
    source_text = file_path.read_text(encoding="utf-8", errors="replace")
    if not source_text.strip():
        return []

    lang = _detect_language(file_path)
    if not lang:
        return _fallback_chunks(source_text)

    try:
        if not _ensure_language(lang):
            return _fallback_chunks(source_text)
        result = process(
            source_text,
            ProcessConfig(
                lang,
                structure=True,
                symbols=True,
                docstrings=True,
                chunk_max_size=cfg.chunk_size,
            ),
        )
    except Exception:
        log.debug("tree-sitter process() failed for %s", file_path, exc_info=True)
        return _fallback_chunks(source_text)

    symbols = _extract_symbols(result, source_text)
    if not symbols:
        return _fallback_chunks(source_text)

    chunks: list[CodeChunk] = []
    for index, sym in enumerate(symbols):
        # Header example: "# File: a.py | function: foo (lines 3-9)"
        parts = [f"# File: {file_path}"]
        if sym.name and sym.kind:
            parts.append(f"{sym.kind}: {sym.name}")
        header = " | ".join(parts) + f" (lines {sym.line_start}-{sym.line_end})"

        chunks.append(
            CodeChunk(
                chunk=f"{header}\n\n{sym.text}",
                line_start=sym.line_start,
                line_end=sym.line_end,
                chunk_index=index,
            )
        )

    return chunks

171 

172 

def is_code_file(file_path: Path) -> bool:
    """Return True when tree-sitter chunking supports *file_path*."""
    lang = detect_language(str(file_path))
    return lang is not None