Coverage for src / lilbee / clustering.py: 100%

33 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Source clustering abstraction for wiki synthesis pages. 

2 

3Defines the :class:`SourceClusterer` protocol and the :class:`Clusterer` facade; the 

4:class:`ClustererBackend` enum of backend identifiers comes from ``lilbee.config``. The 

5facade is the single class the services container constructs and it picks 

6the right backend from ``config.wiki_clusterer`` so callers never need to 

7know which implementation they got. 

8""" 

9 

10from __future__ import annotations 

11 

12import logging 

13from dataclasses import dataclass 

14from typing import TYPE_CHECKING, Protocol, runtime_checkable 

15 

16from lilbee.config import ClustererBackend 

17 

18if TYPE_CHECKING: 

19 from lilbee.config import Config 

20 from lilbee.store import Store 

21 

22log = logging.getLogger(__name__) 

23 

24 

@dataclass(frozen=True)
class SourceCluster:
    """Immutable record for one group of related documents.

    Instances are produced by a clustering strategy.

    Attributes:
        cluster_id: Opaque stable identifier, used for filesystem slugs.
        label: Human-readable topic label for the cluster.
        sources: Set of source document filenames in the cluster.
    """

    cluster_id: str
    label: str
    sources: frozenset[str]

37 

38 

@runtime_checkable
class SourceClusterer(Protocol):
    """Structural interface for strategies that group related source documents.

    The resulting clusters feed cross-source synthesis. Because the protocol
    is ``runtime_checkable``, ``isinstance`` checks only verify that both
    methods exist, not their signatures.
    """

    def available(self) -> bool:
        """Report whether this clusterer can produce clusters in the current env."""
        ...

    def get_clusters(self, min_sources: int = 3) -> list[SourceCluster]:
        """Return the clusters that span at least ``min_sources`` distinct documents."""
        ...

50 

51 

def _select_backend(config: Config, store: Store) -> SourceClusterer:
    """Choose the clustering backend named by ``config.wiki_clusterer``.

    When the concepts backend is requested but reports itself unavailable,
    a warning is logged and the embedding clusterer is returned instead.
    Every other setting yields the embedding clusterer directly.

    The concrete backends are imported here rather than at module level
    because of a circular dependency: ``clustering_embedding`` re-exports
    :class:`SourceCluster` from this module, so a top-level import would
    fail during package initialization.
    """
    from lilbee.clustering_embedding import EmbeddingClusterer
    from lilbee.concepts import ConceptGraphClusterer

    wants_concepts = config.wiki_clusterer == ClustererBackend.CONCEPTS
    if wants_concepts:
        candidate = ConceptGraphClusterer(config, store)
        if candidate.available():
            return candidate
        # Requested backend cannot run here; tell the user before falling back.
        log.warning(
            "wiki_clusterer=concepts but the [graph] extra is not installed or "
            "the concept graph has not been built. Falling back to the "
            "embedding clusterer."
        )

    return EmbeddingClusterer(config, store)

73 

74 

class Clusterer:
    """Facade over the wiki synthesis clustering backends.

    The backend is chosen once, at construction time, and every call is
    forwarded to it.
    """

    def __init__(self, config: Config, store: Store) -> None:
        """Select the backend for ``config`` and ``store`` and keep it."""
        chosen: SourceClusterer = _select_backend(config, store)
        self._backend = chosen

    @property
    def backend(self) -> SourceClusterer:
        """Expose the selected backend (useful for tests and introspection)."""
        return self._backend

    def available(self) -> bool:
        """Forward the availability check to the selected backend."""
        delegate = self._backend
        return delegate.available()

    def get_clusters(self, min_sources: int = 3) -> list[SourceCluster]:
        """Forward to the backend, passing ``min_sources`` by keyword."""
        delegate = self._backend
        return delegate.get_clusters(min_sources=min_sources)