Coverage for src / lilbee / wiki / entity_extractor / ner_concepts.py: 100%

85 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""spaCy NER entity extractor (default strategy). 

2 

3Phase D removed the noun-chunk "concept" path from this extractor. The 

4per-source batched call in :mod:`lilbee.wiki.gen` now proposes concept 

5pages through the LLM. This module produces typed NER entities only. 

6""" 

7 

8from __future__ import annotations 

9 

10import logging 

11import re 

12from typing import TYPE_CHECKING, Any 

13 

14from lilbee.wiki.entity_extractor.base import ( 

15 ChunkRef, 

16 EntityKind, 

17 ExtractedEntity, 

18) 

19from lilbee.wiki.shared import is_valid_label, make_slug 

20 

21if TYPE_CHECKING: 

22 from lilbee.config import Config 

23 from lilbee.providers.base import LLMProvider 

24 from lilbee.store import SearchChunk 

25 

26log = logging.getLogger(__name__) 

27 

28_WHITESPACE_RE = re.compile(r"\s+") 

29 

30# Pre-spaCy markdown-noise strippers. Compiled once at module scope so 

31# the extractor's hot path does not recompile them per chunk. Match on 

32# line boundaries via re.MULTILINE; each sub() empties the matched 

33# line so downstream line-joins collapse the hole to a single newline. 

34_TABLE_ROW_RE = re.compile(r"^\|.*\|\s*$", re.MULTILINE) 

35_PAGE_NUMBER_RE = re.compile(r"^\s*\d{1,4}\s*$", re.MULTILINE) 

36_NAV_CHROME_RE = re.compile( 

37 r"^\s*(?:Home|Menu|Navigation|Edit this page|Jump to navigation|Jump to search)\s*$", 

38 re.MULTILINE, 

39) 

40 

41 

42def _normalize(text: str) -> str: 

43 """Lowercase, strip, and collapse internal whitespace for dedup keys.""" 

44 return _WHITESPACE_RE.sub(" ", text.strip().lower()) 

45 

46 

47def pre_clean_for_ner(text: str) -> str: 

48 """Strip markdown-structural noise before handing text to spaCy. 

49 

50 Removes whole-line markdown-table rows (``| Designer | Irv ... |``), 

51 standalone page-number lines from PDF extraction (``42``), and 

52 Wikipedia / CMS navigation chrome (``Edit this page``). Leaves 

53 prose untouched: every regex anchors to a full line and emits an 

54 empty line in place of the match, which spaCy treats as a sentence 

55 break. 

56 

57 Only targets the noise patterns actually observed in the bb-8b7s 

58 QA corpus. Fuller markdown parsing is deferred; a regex pre-clean 

59 is sufficient for the current signal-to-noise ratio. 

60 """ 

61 text = _TABLE_ROW_RE.sub("", text) 

62 text = _PAGE_NUMBER_RE.sub("", text) 

63 return _NAV_CHROME_RE.sub("", text) 

64 

65 

class NerConceptsExtractor:
    """Produce typed NER entities (``EntityKind.ENTITY`` records only).

    Phase D dropped the noun-chunk concept loop: LLM-curated concept
    pages now come from the per-source batched call in
    :mod:`lilbee.wiki.gen`. The class name survives for the factory
    dispatch site's benefit, but the implementation emits only
    ``EntityKind.ENTITY`` records.
    """

    def __init__(self, provider: LLMProvider, config: Config) -> None:
        self._provider = provider
        self._config = config

    def extract(self, chunks: list[SearchChunk]) -> list[ExtractedEntity]:
        """Run spaCy NER over *chunks* and fold hits into deduped entities.

        Returns an empty list when there is no input or when the spaCy
        pipeline cannot be loaded; otherwise a list of records sorted by
        (kind, slug) for deterministic output.
        """
        if not chunks:
            return []
        nlp = _load_spacy()
        if nlp is None:
            return []

        allowed_types = self._config.concept_allowed_ent_types
        debug_on = log.isEnabledFor(logging.DEBUG)
        aggregates: dict[str, _Aggregate] = {}
        # Corpus-wide funnel counters, logged once after the loop so the
        # DEBUG trace is a single line instead of one per chunk.
        funnel = dict.fromkeys(
            (
                "raw_ents",
                "type_filter_dropped",
                "label_sanity_dropped_entities",
                "kept_entity_surfaces",
            ),
            0,
        )

        docs = nlp.pipe(pre_clean_for_ner(c.chunk) for c in chunks)
        for chunk, doc in zip(chunks, docs, strict=True):
            ref = ChunkRef(source=chunk.source, chunk_index=chunk.chunk_index)
            for ent in doc.ents:
                funnel["raw_ents"] += 1
                if ent.label_ not in allowed_types:
                    funnel["type_filter_dropped"] += 1
                    continue
                surface = ent.text.strip()
                if not is_valid_label(surface):
                    funnel["label_sanity_dropped_entities"] += 1
                    if debug_on:
                        log.debug("label-sanity: rejected entity %r", surface)
                    continue
                agg = aggregates.setdefault(
                    _normalize(surface),
                    _Aggregate(label=surface, type_hint=ent.label_),
                )
                agg.refs.add(ref)
                funnel["kept_entity_surfaces"] += 1

        if debug_on:
            log.debug(
                "ner funnel: raw_ents=%(raw_ents)d "
                "type_filter_dropped=%(type_filter_dropped)d "
                "label_sanity_dropped_entities=%(label_sanity_dropped_entities)d "
                "kept_entity_surfaces=%(kept_entity_surfaces)d",
                funnel,
            )

        min_mentions = self._config.wiki_entity_min_mentions
        results = [
            record
            for agg in aggregates.values()
            if (record := _make_record(agg, EntityKind.ENTITY, min_mentions))
            is not None
        ]
        results.sort(key=lambda e: (e.kind.value, e.slug))
        return results

138 

139 

class _Aggregate:
    """Working state for one deduped surface form while chunks are folded.

    Lives only for the duration of a single ``extract`` pass; slotted to
    keep the many short-lived instances cheap.
    """

    __slots__ = ("label", "refs", "type_hint")

    def __init__(self, label: str, type_hint: str) -> None:
        self.refs: set[ChunkRef] = set()
        self.label = label
        self.type_hint = type_hint

149 

150 

def _sorted_refs(refs: set[ChunkRef]) -> tuple[ChunkRef, ...]:
    """Order refs deterministically: by source name, then chunk index."""
    ordered = sorted(refs, key=lambda ref: (ref.source, ref.chunk_index))
    return tuple(ordered)

153 

154 

def _make_record(agg: _Aggregate, kind: EntityKind, min_mentions: int) -> ExtractedEntity | None:
    """Convert an aggregate into an ``ExtractedEntity``, or discard it.

    Two drop conditions: fewer chunk references than *min_mentions*, and
    labels whose slug cleans down to nothing (e.g. punctuation-only
    labels) — the empty-slug guard prevents attempting to write a file
    literally named ``.md`` on disk.
    """
    ref_count = len(agg.refs)
    if ref_count < min_mentions:
        return None
    slug = make_slug(agg.label)
    if not slug:
        return None
    return ExtractedEntity(
        kind=kind,
        slug=slug,
        label=agg.label,
        type_hint=agg.type_hint,
        chunk_refs=_sorted_refs(agg.refs),
    )

175 

176 

def _load_spacy() -> Any | None:
    """Fetch the shared spaCy pipeline, degrading to ``None`` on failure.

    Each failure mode gets its own warning so logs distinguish a missing
    optional dependency from a missing spaCy model.
    """
    try:
        from lilbee.concepts import load_spacy_pipeline
    except ImportError:
        log.warning("Entity extraction disabled: lilbee.concepts unavailable")
        return None
    try:
        pipeline = load_spacy_pipeline()
    except ImportError:
        # NOTE(review): spaCy itself raises OSError for a missing model;
        # this assumes load_spacy_pipeline surfaces that as ImportError —
        # confirm against lilbee.concepts.
        log.warning("Entity extraction disabled: spaCy model unavailable")
        return None
    return pipeline