Coverage for src / lilbee / chunk.py: 100%
28 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Text chunking with optional heading-aware and topic-aware splitting."""
3from __future__ import annotations
5from typing import TYPE_CHECKING
7from lilbee.config import cfg
9if TYPE_CHECKING:
10 from kreuzberg import ChunkingConfig
# Multiplier applied to cfg.chunk_size / cfg.chunk_overlap to obtain the
# character limits handed to kreuzberg.
# NOTE(review): presumably those settings are token counts -- confirm
# against lilbee.config.
CHARS_PER_TOKEN = 4

# Values passed as ChunkingConfig(chunker_type=...).
_SEMANTIC_CHUNKER = "semantic"
_MARKDOWN_CHUNKER = "markdown"

# Kreuzberg silently falls back to a non-semantic path when embedding is None,
# so the semantic path always names an embedding preset explicitly.
_SEMANTIC_EMBEDDING_PRESET = "fast"
def build_chunking_config(*, use_semantic: bool = True) -> ChunkingConfig:
    """Return the kreuzberg ``ChunkingConfig`` described by the current ``cfg``.

    The character limit is ``cfg.chunk_size * CHARS_PER_TOKEN``. The overlap
    is ``cfg.chunk_overlap * CHARS_PER_TOKEN``, capped at half of the
    character limit.

    The semantic chunker is selected only when both ``use_semantic`` and
    ``cfg.semantic_chunking`` are truthy. In every other case the returned
    config carries just the two character limits.
    """
    # kreuzberg is imported at call time rather than at module import.
    from kreuzberg import ChunkingConfig, EmbeddingConfig, EmbeddingModelType

    char_limit = cfg.chunk_size * CHARS_PER_TOKEN
    overlap_limit = min(cfg.chunk_overlap * CHARS_PER_TOKEN, char_limit // 2)

    # Plain character-budget config unless semantic chunking is requested by
    # the caller AND enabled in cfg.
    if not (use_semantic and cfg.semantic_chunking):
        return ChunkingConfig(max_chars=char_limit, max_overlap=overlap_limit)

    # The embedding is passed explicitly: per the note on
    # _SEMANTIC_EMBEDDING_PRESET, kreuzberg falls back to a non-semantic path
    # when embedding is None.
    embedding = EmbeddingConfig(
        model=EmbeddingModelType.preset(_SEMANTIC_EMBEDDING_PRESET),
        show_download_progress=True,
    )
    return ChunkingConfig(
        chunker_type=_SEMANTIC_CHUNKER,
        embedding=embedding,
        topic_threshold=cfg.topic_threshold,
        max_chars=char_limit,
        max_overlap=overlap_limit,
    )
def chunk_text(
    text: str,
    *,
    mime_type: str = "text/plain",
    heading_context: bool = False,
    use_semantic: bool = True,
) -> list[str]:
    """Split ``text`` into a list of chunk strings.

    Strategy precedence: ``heading_context`` wins over ``use_semantic``, which
    wins over the plain character-budget chunker.

    Args:
        text: The text to split. Empty or whitespace-only input yields ``[]``.
        mime_type: MIME type passed to kreuzberg together with the
            UTF-8-encoded text.
        heading_context: When true, use the markdown chunker with
            ``prepend_heading_context=True``; ``use_semantic`` is then ignored.
        use_semantic: Forwarded to ``build_chunking_config`` when
            ``heading_context`` is false.

    Returns:
        The ``content`` of each chunk kreuzberg produced, or ``[]`` when the
        extraction result has no chunks.
    """
    # Nothing to chunk: bail out before kreuzberg is even imported.
    if not (text and text.strip()):
        return []

    from kreuzberg import ChunkingConfig, ExtractionConfig, extract_bytes_sync

    if not heading_context:
        chunk_cfg = build_chunking_config(use_semantic=use_semantic)
    else:
        # Same character-limit arithmetic as build_chunking_config: overlap is
        # capped at half of the character limit.
        char_limit = cfg.chunk_size * CHARS_PER_TOKEN
        overlap_limit = min(cfg.chunk_overlap * CHARS_PER_TOKEN, char_limit // 2)
        chunk_cfg = ChunkingConfig(
            max_chars=char_limit,
            max_overlap=overlap_limit,
            chunker_type=_MARKDOWN_CHUNKER,
            prepend_heading_context=True,  # type: ignore[call-arg]
        )

    extraction = extract_bytes_sync(
        text.encode("utf-8"),
        mime_type,
        config=ExtractionConfig(chunking=chunk_cfg),
    )
    if not extraction.chunks:
        return []
    return [piece.content for piece in extraction.chunks]