Coverage for src/lilbee/providers/mtmd_backend.py: 100% (62 statements)
1"""Vision OCR loader that drives llama.cpp's mtmd pipeline with the GGUF's
2own chat template, so there's no projector-type-to-handler lookup table.
3"""
5from __future__ import annotations
7import logging
8from pathlib import Path
9from typing import Any
11from gguf import GGUFReader
13from lilbee.config import cfg
14from lilbee.providers.llama_cpp_provider import (
15 find_mmproj_for_model,
16 install_llama_log_handler,
17 read_gguf_metadata,
18 suppress_native_stderr,
19)
21log = logging.getLogger(__name__)


# Image-placeholder tokens seen in GGUF chat templates. The upstream
# mtmd pipeline substitutes image URLs with mtmd's media marker, so
# these get rewritten to {{ content.image_url.url }} before rendering.
# Case matters: GGUF templates are machine-emitted and stable, so a
# case-insensitive replace would risk corrupting unrelated Jinja
# identifiers.
_GGUF_IMAGE_TOKENS: tuple[str, ...] = (
    "<|image_pad|>",
    "<image>",
    "<IMAGE>",
    "<__media__>",
    "<__image__>",
)
_IMAGE_URL_JINJA = "{{ content.image_url.url }}"

_TOKENIZER_CHAT_TEMPLATE_KEY = "tokenizer.chat_template"


def read_chat_template(model_path: Path) -> str | None:
    """Return the Jinja chat template embedded in a GGUF model, or None."""
    try:
        reader = GGUFReader(str(model_path))
        field = reader.get_field(_TOKENIZER_CHAT_TEMPLATE_KEY)
    except (OSError, ValueError, IndexError, KeyError):
        log.debug("Failed to read chat template from %s", model_path, exc_info=True)
        return None
    if field is None:
        return None
    # The template is a single string field; its bytes live in the part indexed
    # by the field's first data entry.
    return bytes(field.parts[field.data[0]]).decode("utf-8", errors="replace")


def adapt_gguf_template_for_mtmd(template: str) -> str:
    """Rewrite known image-placeholder tokens to ``{{ content.image_url.url }}``."""
    for token in _GGUF_IMAGE_TOKENS:
        if token in template:
            template = template.replace(token, _IMAGE_URL_JINJA)
    return template
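

# Worked example of the rewrite above; the template fragment is made up for
# illustration and not taken from any particular GGUF:
#
#     >>> adapt_gguf_template_for_mtmd("<|vision_start|><|image_pad|><|vision_end|>")
#     '<|vision_start|>{{ content.image_url.url }}<|vision_end|>'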


def build_vision_chat_handler(model_path: Path, mmproj_path: Path) -> Any:
    """Return the mtmd chat handler configured with the GGUF's embedded template.

    ``DEFAULT_SYSTEM_MESSAGE`` is set to ``None`` so no stray system turn
    is injected. Falls back to the upstream default template when the
    GGUF has no ``tokenizer.chat_template``.
    """
    from llama_cpp.llama_chat_format import Llava15ChatHandler

    # Defined per call so each loaded model binds its own ``CHAT_FORMAT``
    # (set below) to a fresh class; hoisting this to module scope would
    # make the first loaded model's template leak into every subsequent
    # one.
    class _GgufTemplateChatHandler(Llava15ChatHandler):
        DEFAULT_SYSTEM_MESSAGE = None

    handler_cls: type[Llava15ChatHandler] = _GgufTemplateChatHandler

    template = read_chat_template(model_path)
    if template is not None:
        handler_cls.CHAT_FORMAT = adapt_gguf_template_for_mtmd(template)
        log.info(
            "Vision chat handler: using GGUF-embedded template (%d chars) from %s",
            len(template),
            model_path.name,
        )
    else:
        log.info(
            "Vision chat handler: no GGUF-embedded chat template for %s; using upstream default",
            model_path.name,
        )

    return handler_cls(str(mmproj_path), verbose=False)


def load_vision_llama(model_path: Path, mmproj_path: Path | None = None) -> Any:
    """Load a vision-capable ``Llama`` using the GGUF-templated chat handler."""
    from llama_cpp import Llama  # heavy native lib; keep import lazy

    install_llama_log_handler()
    if mmproj_path is None:
        mmproj_path = find_mmproj_for_model(model_path)

    chat_handler = build_vision_chat_handler(model_path, mmproj_path)

    kwargs: dict[str, Any] = {
        "model_path": str(model_path),
        "chat_handler": chat_handler,
        "verbose": False,
        "n_gpu_layers": -1,
        "n_ctx": _resolve_vision_n_ctx(model_path),
    }

    llama = suppress_native_stderr(Llama, **kwargs)
    metadata = getattr(llama, "metadata", {}) or {}
    n_ctx_fn = getattr(llama, "n_ctx", None)
    n_ctx = n_ctx_fn() if callable(n_ctx_fn) else "?"
    log.info(
        "Vision model loaded: model=%s mmproj=%s n_ctx=%s arch=%s",
        model_path.name,
        mmproj_path.name,
        n_ctx,
        metadata.get("general.architecture", "?"),
    )
    return llama
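

# Usage sketch, assuming a local GGUF whose path below is hypothetical; the matching
# mmproj file is discovered automatically when ``mmproj_path`` is omitted. The message
# shape follows llama-cpp-python's multimodal chat-completion API, with
# ``image_data_uri`` standing in for a base64 ``data:`` URI of the page image:
#
#     llama = load_vision_llama(Path("models/example-vl-q4_k_m.gguf"))
#     result = llama.create_chat_completion(
#         messages=[
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "image_url", "image_url": {"url": image_data_uri}},
#                     {"type": "text", "text": "Transcribe the text in this image."},
#                 ],
#             }
#         ],
#     )
#     text = result["choices"][0]["message"]["content"]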


def _resolve_vision_n_ctx(model_path: Path) -> int:
    """Pick n_ctx for a vision load, clamped to the model's training context."""
    try:
        meta = read_gguf_metadata(model_path)
    except Exception:
        log.debug("read_gguf_metadata failed for vision %s", model_path, exc_info=True)
        meta = None
    train_ctx = int((meta or {}).get("context_length", "0"))
    if cfg.num_ctx is None:
        return 0  # 0 -> llama.cpp uses the model's training context
    if train_ctx <= 0:
        return cfg.num_ctx
    return min(cfg.num_ctx, train_ctx)
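

# Clamping behaviour, illustrated with hypothetical values:
#   cfg.num_ctx is None                             -> 0 (llama.cpp uses the training context)
#   cfg.num_ctx = 8192, no "context_length" in meta -> 8192 (nothing to clamp against)
#   cfg.num_ctx = 8192, "context_length" = "4096"   -> 4096 (clamped to the training context)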