Coverage for src / lilbee / wiki / entity_extractor / base.py: 100%

18 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Protocol and record types for entity/concept extractors.""" 

2 

3from __future__ import annotations 

4 

5from dataclasses import dataclass 

6from enum import StrEnum 

7from typing import TYPE_CHECKING, Protocol, runtime_checkable 

8 

9if TYPE_CHECKING: 

10 from lilbee.store import SearchChunk 

11 

12 

13class EntityKind(StrEnum): 

14 """Whether an ``ExtractedEntity`` is a concept or a proper-noun entity.""" 

15 

16 CONCEPT = "concept" 

17 ENTITY = "entity" 

18 

19 

@dataclass(frozen=True)
class ChunkRef:
    """Stable identifier for a chunk inside the store.

    Instances are immutable (``frozen=True``) and therefore hashable, so
    they can be placed in sets or used as dictionary keys.
    """

    # NOTE(review): presumably names the document the chunk came from;
    # confirm against the store's schema.
    source: str
    # NOTE(review): presumably the chunk's position within ``source``;
    # confirm whether it is 0- or 1-based.
    chunk_index: int

26 

27 

@dataclass(frozen=True)
class ExtractedEntity:
    """One concept or entity discovered in the corpus.

    All fields are populated by the extractor regardless of strategy, so
    downstream page generation, [[link]] rewriting, and index building
    never branch on which extractor ran.

    Instances are immutable (``frozen=True``); ``chunk_refs`` is a tuple so
    the whole record stays hashable.
    """

    # NOTE(review): presumably the page/link identifier for this record;
    # confirm the slug format with the page generator.
    slug: str
    # Discriminates concepts from proper-noun entities.
    kind: EntityKind
    # NOTE(review): presumably the human-readable display name; confirm.
    label: str
    # NOTE(review): free-form type description; its vocabulary is not
    # visible from this module.
    type_hint: str
    # References to the chunks associated with this record.
    chunk_refs: tuple[ChunkRef, ...]

42 

43 

@runtime_checkable
class EntityExtractor(Protocol):
    """Strategy that turns a chunk corpus into ``ExtractedEntity`` records.

    The protocol is structural: any object with a compatible ``extract``
    method satisfies it without inheriting from this class. Because it is
    ``runtime_checkable``, ``isinstance(obj, EntityExtractor)`` works, but
    that check only verifies that an ``extract`` attribute exists, not its
    signature or return type.
    """

    def extract(self, chunks: list[SearchChunk]) -> list[ExtractedEntity]:
        """Return the deduplicated set of concepts and entities in *chunks*."""
        ...