Coverage for src/lilbee/chunk.py: 100%

28 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Text chunking with optional heading-aware and topic-aware splitting.""" 

2 

3from __future__ import annotations 

4 

5from typing import TYPE_CHECKING 

6 

7from lilbee.config import cfg 

8 

9if TYPE_CHECKING: 

10 from kreuzberg import ChunkingConfig 

11 

# Multiplier for converting cfg's token-based sizes into character budgets
# (used by build_chunking_config and chunk_text below).
CHARS_PER_TOKEN = 4

# kreuzberg chunker_type identifiers.
_SEMANTIC_CHUNKER = "semantic"
_MARKDOWN_CHUNKER = "markdown"
# Kreuzberg silently falls back to a non-semantic path when embedding is None.
_SEMANTIC_EMBEDDING_PRESET = "fast"

18 

19 

def build_chunking_config(*, use_semantic: bool = True) -> ChunkingConfig:
    """Create a kreuzberg ChunkingConfig derived from the current cfg.

    Returns a semantic (topic-aware) configuration when both *use_semantic*
    and ``cfg.semantic_chunking`` are true; otherwise a plain character-budget
    configuration.
    """
    from kreuzberg import ChunkingConfig, EmbeddingConfig, EmbeddingModelType

    char_budget = cfg.chunk_size * CHARS_PER_TOKEN
    # Overlap is capped at half the chunk budget so chunks always advance.
    overlap_budget = min(cfg.chunk_overlap * CHARS_PER_TOKEN, char_budget // 2)

    if not (use_semantic and cfg.semantic_chunking):
        return ChunkingConfig(max_chars=char_budget, max_overlap=overlap_budget)

    # The embedding config is required here: kreuzberg silently falls back to
    # a non-semantic path when embedding is None.
    embedding = EmbeddingConfig(
        model=EmbeddingModelType.preset(_SEMANTIC_EMBEDDING_PRESET),
        show_download_progress=True,
    )
    return ChunkingConfig(
        chunker_type=_SEMANTIC_CHUNKER,
        embedding=embedding,
        topic_threshold=cfg.topic_threshold,
        max_chars=char_budget,
        max_overlap=overlap_budget,
    )

39 

40 

def chunk_text(
    text: str,
    *,
    mime_type: str = "text/plain",
    heading_context: bool = False,
    use_semantic: bool = True,
) -> list[str]:
    """Split text into chunks; heading_context wins over use_semantic wins over char-budget."""
    # Nothing to chunk for empty or whitespace-only input.
    if not text or not text.strip():
        return []

    from kreuzberg import ChunkingConfig, ExtractionConfig, extract_bytes_sync

    if not heading_context:
        chunking = build_chunking_config(use_semantic=use_semantic)
    else:
        # Markdown chunker with heading context prepended to each chunk;
        # overlap is capped at half the chunk budget.
        budget = cfg.chunk_size * CHARS_PER_TOKEN
        chunking = ChunkingConfig(
            max_chars=budget,
            max_overlap=min(cfg.chunk_overlap * CHARS_PER_TOKEN, budget // 2),
            chunker_type=_MARKDOWN_CHUNKER,
            prepend_heading_context=True,  # type: ignore[call-arg]
        )

    extraction = ExtractionConfig(chunking=chunking)
    outcome = extract_bytes_sync(text.encode("utf-8"), mime_type, config=extraction)
    return [piece.content for piece in outcome.chunks] if outcome.chunks else []