Coverage for src / lilbee / results.py: 100%
34 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-16 08:27 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-16 08:27 +0000
1from __future__ import annotations
3from pydantic import BaseModel
5from lilbee.store import SearchChunk
class Excerpt(BaseModel):
    """One matched chunk of a source document with its location and score."""

    content: str  # the matched chunk text
    page_start: int | None  # first page of the excerpt; None when absent (stored as 0 upstream)
    page_end: int | None  # last page of the excerpt; None when absent
    line_start: int | None  # first line of the excerpt; None when absent
    line_end: int | None  # last line of the excerpt; None when absent
    relevance: float  # 0.0-1.0 (1 = best match)
class DocumentResult(BaseModel):
    """All excerpts matched within a single source document."""

    source: str  # identifier of the source document
    content_type: str  # content type shared by the document's chunks
    excerpts: list[Excerpt]  # sorted best-first by relevance when built via group()
    best_relevance: float  # relevance of the top excerpt
24def _zero_to_none(val: int) -> int | None:
25 return None if val == 0 else val
def _to_excerpt(chunk: SearchChunk) -> Excerpt:
    """Build an Excerpt model from a raw search row.

    The row's ``_distance`` (smaller = closer match) is converted into a
    relevance score in (0, 1], where a distance of 0 maps to 1.0.
    Zero-valued page/line fields are treated as "unknown" and become None.
    """
    dist = float(chunk["_distance"])
    return Excerpt(
        content=str(chunk["chunk"]),
        page_start=_zero_to_none(chunk["page_start"]),
        page_end=_zero_to_none(chunk["page_end"]),
        line_start=_zero_to_none(chunk["line_start"]),
        line_end=_zero_to_none(chunk["line_end"]),
        relevance=1.0 / (1.0 + dist),
    )
def group(chunks: list[SearchChunk]) -> list[DocumentResult]:
    """Group raw LanceDB chunks into document-centric results.

    Chunks are bucketed by their ``source`` field; within each document the
    excerpts are ordered best-first by relevance, and the documents themselves
    are ordered best-first by their top excerpt's relevance.
    """
    buckets: dict[str, list[SearchChunk]] = {}
    for chunk in chunks:
        buckets.setdefault(str(chunk["source"]), []).append(chunk)

    docs: list[DocumentResult] = []
    for src, rows in buckets.items():
        # Rank this document's excerpts, highest relevance first.
        ranked = sorted(
            (_to_excerpt(row) for row in rows),
            key=lambda e: e.relevance,
            reverse=True,
        )
        docs.append(
            DocumentResult(
                source=src,
                # All chunks of one source share a content type; take the first.
                content_type=str(rows[0]["content_type"]),
                excerpts=ranked,
                best_relevance=ranked[0].relevance,
            )
        )

    # Most relevant documents first.
    return sorted(docs, key=lambda d: d.best_relevance, reverse=True)
def to_dicts(results: list[DocumentResult]) -> list[dict[str, object]]:
    """Serialize each DocumentResult into a JSON-safe plain dict.

    Delegates to pydantic's ``model_dump`` so nested Excerpt models are
    converted recursively.
    """
    return [doc.model_dump() for doc in results]