Coverage for src / lilbee / wiki / entity_extractor / ner_concepts.py: 100%
85 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""spaCy NER entity extractor (default strategy).
3Phase D removed the noun-chunk "concept" path from this extractor. The
4per-source batched call in :mod:`lilbee.wiki.gen` now proposes concept
5pages through the LLM. This module produces typed NER entities only.
6"""
8from __future__ import annotations
10import logging
11import re
12from typing import TYPE_CHECKING, Any
14from lilbee.wiki.entity_extractor.base import (
15 ChunkRef,
16 EntityKind,
17 ExtractedEntity,
18)
19from lilbee.wiki.shared import is_valid_label, make_slug
21if TYPE_CHECKING:
22 from lilbee.config import Config
23 from lilbee.providers.base import LLMProvider
24 from lilbee.store import SearchChunk
26log = logging.getLogger(__name__)
# Collapses any run of whitespace to a single space (see _normalize).
_WHITESPACE_RE = re.compile(r"\s+")

# Pre-spaCy markdown-noise strippers. Compiled once at module scope so
# the extractor's hot path does not recompile them per chunk. Match on
# line boundaries via re.MULTILINE; each sub() empties the matched
# line so downstream line-joins collapse the hole to a single newline.
# Whole-line markdown table rows, e.g. "| Designer | Irv ... |".
_TABLE_ROW_RE = re.compile(r"^\|.*\|\s*$", re.MULTILINE)
# Standalone page-number lines left behind by PDF extraction, e.g. "42".
_PAGE_NUMBER_RE = re.compile(r"^\s*\d{1,4}\s*$", re.MULTILINE)
# Wikipedia / CMS navigation chrome lines, e.g. "Edit this page".
_NAV_CHROME_RE = re.compile(
    r"^\s*(?:Home|Menu|Navigation|Edit this page|Jump to navigation|Jump to search)\s*$",
    re.MULTILINE,
)
42def _normalize(text: str) -> str:
43 """Lowercase, strip, and collapse internal whitespace for dedup keys."""
44 return _WHITESPACE_RE.sub(" ", text.strip().lower())
def pre_clean_for_ner(text: str) -> str:
    """Strip markdown-structural noise before handing text to spaCy.

    Blanks out three kinds of whole-line noise: markdown-table rows
    (``| Designer | Irv ... |``), standalone page-number lines from PDF
    extraction (``42``), and Wikipedia / CMS navigation chrome (``Edit
    this page``). Prose is untouched: each regex is anchored to a full
    line and the match is replaced with an empty line, which spaCy
    treats as a sentence break.

    Only targets the noise patterns actually observed in the bb-8b7s
    QA corpus. Fuller markdown parsing is deferred; a regex pre-clean
    is sufficient for the current signal-to-noise ratio.
    """
    for noise_pattern in (_TABLE_ROW_RE, _PAGE_NUMBER_RE, _NAV_CHROME_RE):
        text = noise_pattern.sub("", text)
    return text
class NerConceptsExtractor:
    """Emit typed NER entities (``EntityKind.ENTITY`` only).

    The noun-chunk concept loop was removed in Phase D: concept pages
    are now curated downstream by the LLM via the per-source batched
    call in :mod:`lilbee.wiki.gen`. The class keeps its historical name
    so the factory dispatch site stays backwards compatible, but the
    implementation emits only ``EntityKind.ENTITY`` records.
    """

    def __init__(self, provider: LLMProvider, config: Config) -> None:
        self._provider = provider
        self._config = config

    def extract(self, chunks: list[SearchChunk]) -> list[ExtractedEntity]:
        """Run spaCy NER over *chunks* and fold hits into deduped entities.

        Returns an empty list when there is no input or when the spaCy
        pipeline cannot be loaded.
        """
        if not chunks:
            return []
        nlp = _load_spacy()
        if nlp is None:
            return []

        allowed_types = self._config.concept_allowed_ent_types
        debug_on = log.isEnabledFor(logging.DEBUG)
        aggregates: dict[str, _Aggregate] = {}
        # Per-pass funnel counters; emitted once after the loop so the
        # DEBUG trace captures the whole corpus in one line instead of
        # one per chunk.
        counters = {
            "raw_ents": 0,
            "type_filter_dropped": 0,
            "label_sanity_dropped_entities": 0,
            "kept_entity_surfaces": 0,
        }
        docs = nlp.pipe(pre_clean_for_ner(c.chunk) for c in chunks)
        for chunk, doc in zip(chunks, docs, strict=True):
            ref = ChunkRef(source=chunk.source, chunk_index=chunk.chunk_index)
            for span in doc.ents:
                counters["raw_ents"] += 1
                if span.label_ not in allowed_types:
                    counters["type_filter_dropped"] += 1
                    continue
                surface = span.text.strip()
                if not is_valid_label(surface):
                    counters["label_sanity_dropped_entities"] += 1
                    if debug_on:
                        log.debug("label-sanity: rejected entity %r", surface)
                    continue
                # Dedup key is the whitespace/case-normalized surface;
                # the first-seen surface form becomes the label.
                agg = aggregates.setdefault(
                    _normalize(surface),
                    _Aggregate(label=surface, type_hint=span.label_),
                )
                agg.refs.add(ref)
                counters["kept_entity_surfaces"] += 1

        if debug_on:
            log.debug(
                "ner funnel: raw_ents=%(raw_ents)d "
                "type_filter_dropped=%(type_filter_dropped)d "
                "label_sanity_dropped_entities=%(label_sanity_dropped_entities)d "
                "kept_entity_surfaces=%(kept_entity_surfaces)d",
                counters,
            )

        threshold = self._config.wiki_entity_min_mentions
        results = [
            record
            for agg in aggregates.values()
            if (record := _make_record(agg, EntityKind.ENTITY, threshold))
            is not None
        ]
        results.sort(key=lambda e: (e.kind.value, e.slug))
        return results
140class _Aggregate:
141 """Mutable accumulator used only while folding per-chunk hits."""
143 __slots__ = ("label", "refs", "type_hint")
145 def __init__(self, label: str, type_hint: str) -> None:
146 self.label = label
147 self.type_hint = type_hint
148 self.refs: set[ChunkRef] = set()
151def _sorted_refs(refs: set[ChunkRef]) -> tuple[ChunkRef, ...]:
152 return tuple(sorted(refs, key=lambda r: (r.source, r.chunk_index)))
def _make_record(agg: _Aggregate, kind: EntityKind, min_mentions: int) -> ExtractedEntity | None:
    """Promote an aggregate to an ``ExtractedEntity``, or drop it.

    Two gates, both folded into the slug computation below: aggregates
    mentioned in fewer than *min_mentions* chunks are discarded, and so
    are labels whose slug cleans to an empty string (e.g. labels made
    only of punctuation) — without the empty-slug guard those would try
    to write files named just ``.md`` on disk.
    """
    slug = make_slug(agg.label) if len(agg.refs) >= min_mentions else ""
    if not slug:
        return None
    return ExtractedEntity(
        slug=slug,
        kind=kind,
        label=agg.label,
        type_hint=agg.type_hint,
        chunk_refs=_sorted_refs(agg.refs),
    )
def _load_spacy() -> Any | None:
    """Return the shared spaCy pipeline, or ``None`` when unavailable.

    Degrades gracefully: a missing :mod:`lilbee.concepts` module or a
    missing spaCy model each log a warning and disable extraction
    instead of raising.
    """
    try:
        from lilbee.concepts import load_spacy_pipeline as loader
    except ImportError:
        log.warning("Entity extraction disabled: lilbee.concepts unavailable")
        return None
    try:
        return loader()
    except ImportError:
        log.warning("Entity extraction disabled: spaCy model unavailable")
        return None