Coverage for src / lilbee / wiki / entity_extractor / ner_concepts.py: 100%

85 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""spaCy NER entity extractor (default strategy). 

2 

3Phase D removed the noun-chunk "concept" path from this extractor. The 

4per-source batched call in :mod:`lilbee.wiki.gen` now proposes concept 

5pages through the LLM. This module produces typed NER entities only. 

6""" 

7 

8from __future__ import annotations 

9 

10import logging 

11import re 

12from typing import TYPE_CHECKING, Any 

13 

14from lilbee.wiki.entity_extractor.base import ( 

15 ChunkRef, 

16 EntityKind, 

17 ExtractedEntity, 

18) 

19from lilbee.wiki.shared import is_valid_label, make_slug 

20 

21if TYPE_CHECKING: 

22 from lilbee.config import Config 

23 from lilbee.providers.base import LLMProvider 

24 from lilbee.store import SearchChunk 

25 

26log = logging.getLogger(__name__) 

27 

28_WHITESPACE_RE = re.compile(r"\s+") 

29 

30# Pre-spaCy markdown-noise strippers. Compiled once at module scope so 

31# the extractor's hot path does not recompile them per chunk. Match on 

32# line boundaries via re.MULTILINE; each sub() empties the matched 

33# line so downstream line-joins collapse the hole to a single newline. 

34_TABLE_ROW_RE = re.compile(r"^\|.*\|\s*$", re.MULTILINE) 

35_PAGE_NUMBER_RE = re.compile(r"^\s*\d{1,4}\s*$", re.MULTILINE) 

36_NAV_CHROME_RE = re.compile( 

37 r"^\s*(?:Home|Menu|Navigation|Edit this page|Jump to navigation|Jump to search)\s*$", 

38 re.MULTILINE, 

39) 

40 

41 

42def _normalize(text: str) -> str: 

43 """Lowercase, strip, and collapse internal whitespace for dedup keys.""" 

44 return _WHITESPACE_RE.sub(" ", text.strip().lower()) 

45 

46 

47def pre_clean_for_ner(text: str) -> str: 

48 """Strip markdown-structural noise before handing text to spaCy. 

49 

50 Removes whole-line markdown-table rows (``| Designer | Irv ... |``), 

51 standalone page-number lines from PDF extraction (``42``), and 

52 Wikipedia / CMS navigation chrome (``Edit this page``). Leaves 

53 prose untouched: every regex anchors to a full line and emits an 

54 empty line in place of the match, which spaCy treats as a sentence 

55 break. 

56 

57 Only targets the noise patterns actually observed in the bb-8b7s 

58 QA corpus. Fuller markdown parsing is deferred; a regex pre-clean 

59 is sufficient for the current signal-to-noise ratio. 

60 """ 

61 text = _TABLE_ROW_RE.sub("", text) 

62 text = _PAGE_NUMBER_RE.sub("", text) 

63 return _NAV_CHROME_RE.sub("", text) 

64 

65 

class NerConceptsExtractor:
    """Produce typed NER entities (``EntityKind.ENTITY`` records only).

    Phase D dropped the noun-chunk concept loop: LLM-curated concept
    pages now come from the per-source batched call in
    :mod:`lilbee.wiki.gen`. The class name survives for the factory
    dispatch site's benefit, but the implementation emits only
    ``EntityKind.ENTITY`` records.
    """

    def __init__(self, provider: LLMProvider, config: Config) -> None:
        self._provider = provider
        self._config = config

    def extract(self, chunks: list[SearchChunk]) -> list[ExtractedEntity]:
        """Run spaCy NER over *chunks* and fold hits into deduped entities.

        Returns an empty list when there is no input or when the spaCy
        pipeline cannot be loaded; otherwise a list of records sorted by
        (kind, slug) for deterministic output.
        """
        if not chunks:
            return []
        nlp = _load_spacy()
        if nlp is None:
            return []

        allowed_types = self._config.concept_allowed_ent_types
        debug_on = log.isEnabledFor(logging.DEBUG)
        aggregates: dict[str, _Aggregate] = {}
        # Corpus-wide funnel counters, logged once after the loop so the
        # DEBUG trace is a single line instead of one per chunk.
        funnel = dict.fromkeys(
            (
                "raw_ents",
                "type_filter_dropped",
                "label_sanity_dropped_entities",
                "kept_entity_surfaces",
            ),
            0,
        )

        docs = nlp.pipe(pre_clean_for_ner(c.chunk) for c in chunks)
        for chunk, doc in zip(chunks, docs, strict=True):
            ref = ChunkRef(source=chunk.source, chunk_index=chunk.chunk_index)
            for ent in doc.ents:
                funnel["raw_ents"] += 1
                if ent.label_ not in allowed_types:
                    funnel["type_filter_dropped"] += 1
                    continue
                surface = ent.text.strip()
                if not is_valid_label(surface):
                    funnel["label_sanity_dropped_entities"] += 1
                    if debug_on:
                        log.debug("label-sanity: rejected entity %r", surface)
                    continue
                agg = aggregates.setdefault(
                    _normalize(surface),
                    _Aggregate(label=surface, type_hint=ent.label_),
                )
                agg.refs.add(ref)
                funnel["kept_entity_surfaces"] += 1

        if debug_on:
            log.debug(
                "ner funnel: raw_ents=%(raw_ents)d "
                "type_filter_dropped=%(type_filter_dropped)d "
                "label_sanity_dropped_entities=%(label_sanity_dropped_entities)d "
                "kept_entity_surfaces=%(kept_entity_surfaces)d",
                funnel,
            )

        min_mentions = self._config.wiki_entity_min_mentions
        results = [
            record
            for agg in aggregates.values()
            if (record := _make_record(agg, EntityKind.ENTITY, min_mentions))
            is not None
        ]
        results.sort(key=lambda e: (e.kind.value, e.slug))
        return results

138 

139 

class _Aggregate:
    """Working state for one deduped surface form while chunks are folded.

    Lives only for the duration of a single ``extract`` pass; slotted to
    keep the many short-lived instances cheap.
    """

    __slots__ = ("label", "refs", "type_hint")

    def __init__(self, label: str, type_hint: str) -> None:
        self.refs: set[ChunkRef] = set()
        self.label = label
        self.type_hint = type_hint

149 

150 

def _sorted_refs(refs: set[ChunkRef]) -> tuple[ChunkRef, ...]:
    """Order refs deterministically: by source name, then chunk index."""
    ordered = sorted(refs, key=lambda ref: (ref.source, ref.chunk_index))
    return tuple(ordered)

153 

154 

def _make_record(agg: _Aggregate, kind: EntityKind, min_mentions: int) -> ExtractedEntity | None:
    """Convert an aggregate into an ``ExtractedEntity``, or discard it.

    Two drop conditions: fewer chunk references than *min_mentions*, and
    labels whose slug cleans down to nothing (e.g. punctuation-only
    labels) — the empty-slug guard prevents attempting to write a file
    literally named ``.md`` on disk.
    """
    ref_count = len(agg.refs)
    if ref_count < min_mentions:
        return None
    slug = make_slug(agg.label)
    if not slug:
        return None
    return ExtractedEntity(
        kind=kind,
        slug=slug,
        label=agg.label,
        type_hint=agg.type_hint,
        chunk_refs=_sorted_refs(agg.refs),
    )

175 

176 

def _load_spacy() -> Any | None:
    """Fetch the shared spaCy pipeline, degrading to ``None`` on failure.

    Each failure mode gets its own warning so logs distinguish a missing
    optional dependency from a missing spaCy model.
    """
    try:
        from lilbee.concepts import load_spacy_pipeline
    except ImportError:
        log.warning("Entity extraction disabled: lilbee.concepts unavailable")
        return None
    try:
        pipeline = load_spacy_pipeline()
    except ImportError:
        # NOTE(review): spaCy itself raises OSError for a missing model;
        # this assumes load_spacy_pipeline surfaces that as ImportError —
        # confirm against lilbee.concepts.
        log.warning("Entity extraction disabled: spaCy model unavailable")
        return None
    return pipeline