Coverage for src/lilbee/chunk.py: 100%

28 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Text chunking with optional heading-aware and topic-aware splitting.""" 

2 

3from __future__ import annotations 

4 

5from typing import TYPE_CHECKING 

6 

7from lilbee.config import cfg 

8 

9if TYPE_CHECKING: 

10 from kreuzberg import ChunkingConfig 

11 

# Multiplier for converting cfg's token-based sizes into character budgets
# (used by build_chunking_config and chunk_text below).
CHARS_PER_TOKEN = 4

# kreuzberg chunker_type identifiers.
_SEMANTIC_CHUNKER = "semantic"
_MARKDOWN_CHUNKER = "markdown"
# Kreuzberg silently falls back to a non-semantic path when embedding is None.
_SEMANTIC_EMBEDDING_PRESET = "fast"

18 

19 

def build_chunking_config(*, use_semantic: bool = True) -> ChunkingConfig:
    """Create a kreuzberg ChunkingConfig derived from the current cfg.

    Returns a semantic (topic-aware) configuration when both *use_semantic*
    and ``cfg.semantic_chunking`` are true; otherwise a plain character-budget
    configuration.
    """
    from kreuzberg import ChunkingConfig, EmbeddingConfig, EmbeddingModelType

    char_budget = cfg.chunk_size * CHARS_PER_TOKEN
    # Overlap is capped at half the chunk budget so chunks always advance.
    overlap_budget = min(cfg.chunk_overlap * CHARS_PER_TOKEN, char_budget // 2)

    if not (use_semantic and cfg.semantic_chunking):
        return ChunkingConfig(max_chars=char_budget, max_overlap=overlap_budget)

    # The embedding config is required here: kreuzberg silently falls back to
    # a non-semantic path when embedding is None.
    embedding = EmbeddingConfig(
        model=EmbeddingModelType.preset(_SEMANTIC_EMBEDDING_PRESET),
        show_download_progress=True,
    )
    return ChunkingConfig(
        chunker_type=_SEMANTIC_CHUNKER,
        embedding=embedding,
        topic_threshold=cfg.topic_threshold,
        max_chars=char_budget,
        max_overlap=overlap_budget,
    )

39 

40 

def chunk_text(
    text: str,
    *,
    mime_type: str = "text/plain",
    heading_context: bool = False,
    use_semantic: bool = True,
) -> list[str]:
    """Split text into chunks; heading_context wins over use_semantic wins over char-budget."""
    # Nothing to chunk for empty or whitespace-only input.
    if not text or not text.strip():
        return []

    from kreuzberg import ChunkingConfig, ExtractionConfig, extract_bytes_sync

    if not heading_context:
        chunking = build_chunking_config(use_semantic=use_semantic)
    else:
        # Markdown chunker with heading context prepended to each chunk;
        # overlap is capped at half the chunk budget.
        budget = cfg.chunk_size * CHARS_PER_TOKEN
        chunking = ChunkingConfig(
            max_chars=budget,
            max_overlap=min(cfg.chunk_overlap * CHARS_PER_TOKEN, budget // 2),
            chunker_type=_MARKDOWN_CHUNKER,
            prepend_heading_context=True,  # type: ignore[call-arg]
        )

    extraction = ExtractionConfig(chunking=chunking)
    outcome = extract_bytes_sync(text.encode("utf-8"), mime_type, config=extraction)
    return [piece.content for piece in outcome.chunks] if outcome.chunks else []