Coverage for src / lilbee / code_chunker.py: 100%
91 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
"""Code chunking via tree-sitter AST analysis.

Extracts structured symbol information (functions, classes, imports)
and builds enriched chunk headers with symbol metadata.
"""
7import logging
8from dataclasses import dataclass
9from pathlib import Path
10from typing import Any
12from tree_sitter_language_pack import (
13 ProcessConfig,
14 detect_language, # TODO: use public API once tree-sitter-language-pack >= 1.3.4
15 has_language,
16 init,
17 process,
18)
20from lilbee.chunk import chunk_text
21from lilbee.config import cfg
23log = logging.getLogger(__name__)
@dataclass
class SymbolInfo:
    """Extracted symbol metadata from tree-sitter process().

    Built by ``_extract_symbols`` from one entry of the ``structure``
    list returned by tree-sitter-language-pack's ``process()``.
    """

    # Symbol identifier; empty string when the entry carries no name.
    name: str
    # Symbol kind, lower-cased by the extractor (e.g. "function", "class" —
    # exact vocabulary comes from tree-sitter-language-pack; verify there).
    kind: str
    # 1-based start line (converted from the parser's 0-based span).
    line_start: int
    # 1-based end line, inclusive.
    line_end: int
    # Exact source slice covering the symbol (byte-span of the original text).
    text: str
@dataclass
class CodeChunk:
    """A chunk of source code with line location metadata."""

    # Chunk payload; for symbol-based chunks this includes a "# File: ..."
    # header line prepended by chunk_code().
    chunk: str
    # 1-based first source line covered by this chunk.
    line_start: int
    # 1-based last source line; approximate when produced by the fallback path.
    line_end: int
    # 0-based position of this chunk within the file's chunk list.
    chunk_index: int
def _detect_language(file_path: Path) -> str | None:
    """Detect language from file path using tree-sitter-language-pack.

    Returns the language identifier, or None when the extension is
    not recognized by the pack.
    """
    return detect_language(str(file_path))
def _ensure_language(lang: str) -> bool:
    """Download language parser if not already available.

    Returns True when the parser for ``lang`` is usable after an
    optional on-demand download; False on any failure (best-effort).
    """
    try:
        available = has_language(lang)
        if not available:
            # Not installed yet — ask the pack to fetch it, then re-probe.
            init({"languages": [lang]})
            available = has_language(lang)
        return available
    except Exception:
        # Best-effort: network/install problems degrade to text chunking.
        log.debug("Failed to download tree-sitter language: %s", lang)
        return False
def find_line(needle: str, lines: list[str], start: int) -> int:
    """Find the first line index (1-based) containing needle, from start.

    An empty needle never matches; when nothing matches, ``start + 1``
    is returned as a best-guess position.
    """
    if needle:
        for offset, line in enumerate(lines[start:], start=start):
            if needle in line:
                return offset + 1
    return start + 1
def _fallback_chunks(text: str) -> list[CodeChunk]:
    """Fallback text chunking with approximate line tracking.

    Runs the generic token-based chunker and re-locates each chunk in
    the original text by searching for its (truncated) first line.
    """
    all_lines = text.split("\n")
    chunks: list[CodeChunk] = []
    cursor = 0  # 0-based index where the next first-line search begins

    for index, piece in enumerate(chunk_text(text)):
        # Anchor on the chunk's first line, capped at 80 chars to keep the
        # substring search cheap and tolerant of trailing differences.
        anchor = piece.split("\n", 1)[0][:80]
        start = find_line(anchor, all_lines, cursor)
        end = min(start + piece.count("\n"), len(all_lines))
        chunks.append(
            CodeChunk(
                chunk=piece,
                line_start=start,
                line_end=end,
                chunk_index=index,
            )
        )
        # Advance past this chunk's start so overlapping chunks keep moving forward.
        cursor = start

    return chunks
def _symbol_from_entry(entry: dict, source_text: str) -> SymbolInfo:
    """Build one SymbolInfo from a raw process() structure entry."""
    span = entry.get("span", {})
    begin = span.get("start_byte", 0)
    finish = span.get("end_byte", len(source_text))
    return SymbolInfo(
        name=str(entry.get("name", "")),
        kind=str(entry.get("kind", "")).lower(),
        # tree-sitter spans are 0-based; normalize to 1-based lines.
        line_start=int(span.get("start_line", 0)) + 1,
        line_end=int(span.get("end_line", 0)) + 1,
        text=source_text[begin:finish],
    )


def _extract_symbols(result: Any, source_text: str) -> list[SymbolInfo]:
    """Parse process() result into typed SymbolInfo objects.

    Returns an empty list when the result has no usable ``structure``
    list; non-dict entries are silently skipped.
    """
    raw = result.get("structure", [])
    if not isinstance(raw, list):
        return []
    return [
        _symbol_from_entry(entry, source_text)
        for entry in raw
        if isinstance(entry, dict)
    ]
def chunk_code(file_path: Path) -> list[CodeChunk]:
    """Chunk a source file using tree-sitter-language-pack's process() API.

    Extracts structural symbols (functions, classes) and builds enriched
    chunks with metadata headers. Falls back to token-based chunking
    if the language isn't supported or parsing fails.

    Returns an empty list for files that are empty or whitespace-only.
    """
    # errors="replace" so undecodable bytes never abort indexing.
    source_text = file_path.read_text(encoding="utf-8", errors="replace")
    if not source_text.strip():
        return []

    # Fallback chain: unknown language -> plain text chunking.
    lang = _detect_language(file_path)
    if not lang:
        return _fallback_chunks(source_text)

    try:
        # Parser may need an on-demand download; if that fails, fall back.
        if not _ensure_language(lang):
            return _fallback_chunks(source_text)
        config = ProcessConfig(
            lang,
            structure=True,
            symbols=True,
            docstrings=True,
            chunk_max_size=cfg.chunk_size,
        )
        result = process(source_text, config)
    except Exception:
        # Any parse/config failure degrades to text chunking rather than raising.
        log.debug("tree-sitter process() failed for %s", file_path, exc_info=True)
        return _fallback_chunks(source_text)

    symbols = _extract_symbols(result, source_text)
    if not symbols:
        # Parsed fine but produced no structure -> still fall back.
        return _fallback_chunks(source_text)

    chunks: list[CodeChunk] = []
    for i, sym in enumerate(symbols):
        # Header embeds file path + symbol identity so each chunk is
        # self-describing when retrieved in isolation.
        header = f"# File: {file_path}"
        if sym.name and sym.kind:
            header += f" | {sym.kind}: {sym.name}"
        header += f" (lines {sym.line_start}-{sym.line_end})"

        chunks.append(
            CodeChunk(
                chunk=f"{header}\n\n{sym.text}",
                line_start=sym.line_start,
                line_end=sym.line_end,
                chunk_index=i,
            )
        )

    return chunks
def is_code_file(file_path: Path) -> bool:
    """Check if a file is supported by tree-sitter chunking."""
    lang = detect_language(str(file_path))
    return lang is not None