Coverage for src/lilbee/results.py: 100%

34 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-16 08:27 +0000

1from __future__ import annotations 

2 

3from pydantic import BaseModel 

4 

5from lilbee.store import SearchChunk 

6 

7 

class Excerpt(BaseModel):
    """One matched passage of a document, with its position and match score."""

    # Text of the matched chunk.
    content: str

    # Page span of the passage; None when the stored value was 0.
    page_start: int | None
    page_end: int | None

    # Line span of the passage; None when the stored value was 0.
    line_start: int | None
    line_end: int | None

    # Match quality on a 0.0-1.0 scale (1 = best match).
    relevance: float

15 

16 

class DocumentResult(BaseModel):
    """The excerpts matched within a single source document."""

    # Identifier of the document the excerpts belong to.
    source: str

    # Content type of the document, as reported by its chunks.
    content_type: str

    # Matched passages from this document.
    excerpts: list[Excerpt]

    # Relevance of the strongest excerpt for this document.
    best_relevance: float

22 

23 

24def _zero_to_none(val: int) -> int | None: 

25 return None if val == 0 else val 

26 

27 

def _to_excerpt(chunk: SearchChunk) -> Excerpt:
    """Build an Excerpt from one raw search chunk.

    The chunk's ``_distance`` is turned into a relevance score with
    ``1 / (1 + distance)``, so a distance of 0 yields 1.0 and larger
    distances yield smaller scores. Page and line positions equal to 0
    are reported as None.
    """
    score = 1.0 / (1.0 + float(chunk["_distance"]))

    text = str(chunk["chunk"])
    first_page = _zero_to_none(chunk["page_start"])
    last_page = _zero_to_none(chunk["page_end"])
    first_line = _zero_to_none(chunk["line_start"])
    last_line = _zero_to_none(chunk["line_end"])

    return Excerpt(
        content=text,
        page_start=first_page,
        page_end=last_page,
        line_start=first_line,
        line_end=last_line,
        relevance=score,
    )

39 

40 

def group(chunks: list[SearchChunk]) -> list[DocumentResult]:
    """Collect raw search chunks into one result per source document.

    Chunks are bucketed by their ``source`` value. Within each document the
    excerpts are ordered from most to least relevant, and the documents
    themselves are ordered by their best excerpt's relevance, highest first.
    An empty input produces an empty list.
    """
    # Bucket chunks per source, keeping first-seen order of sources.
    buckets: dict[str, list[SearchChunk]] = {}
    for item in chunks:
        key = str(item["source"])
        if key not in buckets:
            buckets[key] = []
        buckets[key].append(item)

    documents: list[DocumentResult] = []
    for key, members in buckets.items():
        ranked = [_to_excerpt(member) for member in members]
        ranked.sort(key=lambda excerpt: excerpt.relevance, reverse=True)
        # Every bucket holds at least one chunk, so ranked[0] always exists.
        document = DocumentResult(
            source=key,
            content_type=str(members[0]["content_type"]),
            excerpts=ranked,
            best_relevance=ranked[0].relevance,
        )
        documents.append(document)

    return sorted(documents, key=lambda doc: doc.best_relevance, reverse=True)

66 

67 

def to_dicts(results: list[DocumentResult]) -> list[dict[str, object]]:
    """Convert each DocumentResult into a plain dict via ``model_dump()``.

    The output list has one dict per input result, in the same order.
    """
    dumped: list[dict[str, object]] = []
    for result in results:
        dumped.append(result.model_dump())
    return dumped