Coverage for src/lilbee/providers/mtmd_backend.py: 100%

62 statements  

coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

"""Vision OCR loader that drives llama.cpp's mtmd pipeline with the GGUF's
own chat template, so there's no projector-type-to-handler lookup table.
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Any

from gguf import GGUFReader

from lilbee.config import cfg
from lilbee.providers.llama_cpp_provider import (
    find_mmproj_for_model,
    install_llama_log_handler,
    read_gguf_metadata,
    suppress_native_stderr,
)

log = logging.getLogger(__name__)


# Image-placeholder tokens seen in GGUF chat templates. The upstream
# mtmd pipeline substitutes image URLs with mtmd's media marker, so
# these get rewritten to {{ content.image_url.url }} before rendering.
# Case matters: GGUF templates are machine-emitted and stable, so a
# case-insensitive replace would risk corrupting unrelated Jinja
# identifiers.
_GGUF_IMAGE_TOKENS: tuple[str, ...] = (
    "<|image_pad|>",
    "<image>",
    "<IMAGE>",
    "<__media__>",
    "<__image__>",
)
_IMAGE_URL_JINJA = "{{ content.image_url.url }}"

_TOKENIZER_CHAT_TEMPLATE_KEY = "tokenizer.chat_template"


def read_chat_template(model_path: Path) -> str | None:
    """Return the Jinja chat template embedded in a GGUF model, or None."""
    try:
        reader = GGUFReader(str(model_path))
        field = reader.get_field(_TOKENIZER_CHAT_TEMPLATE_KEY)
    except (OSError, ValueError, IndexError, KeyError):
        log.debug("Failed to read chat template from %s", model_path, exc_info=True)
        return None
    if field is None:
        return None
    return bytes(field.parts[field.data[0]]).decode("utf-8", errors="replace")


def adapt_gguf_template_for_mtmd(template: str) -> str:
    """Rewrite known image-placeholder tokens to ``{{ content.image_url.url }}``."""
    for token in _GGUF_IMAGE_TOKENS:
        if token in template:
            template = template.replace(token, _IMAGE_URL_JINJA)
    return template
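# A minimal illustration of the rewrite above, assuming a Qwen2-VL-style
# template fragment (the fragment itself is made up, not read from any GGUF):
#
#   adapt_gguf_template_for_mtmd("<|vision_start|><|image_pad|><|vision_end|>")
#   # -> "<|vision_start|>{{ content.image_url.url }}<|vision_end|>"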

def build_vision_chat_handler(model_path: Path, mmproj_path: Path) -> Any:
    """Return the mtmd chat handler configured with the GGUF's embedded template.

    ``DEFAULT_SYSTEM_MESSAGE`` is set to ``None`` so no stray system turn
    is injected. Falls back to the upstream default template when the
    GGUF has no ``tokenizer.chat_template``.
    """
    from llama_cpp.llama_chat_format import Llava15ChatHandler

    # Defined per call so each loaded model binds its own ``CHAT_FORMAT``
    # (set below) to a fresh class; hoisting this to module scope would
    # make the first loaded model's template leak into every subsequent
    # one.
    class _GgufTemplateChatHandler(Llava15ChatHandler):
        DEFAULT_SYSTEM_MESSAGE = None

    handler_cls: type[Llava15ChatHandler] = _GgufTemplateChatHandler

    template = read_chat_template(model_path)
    if template is not None:
        handler_cls.CHAT_FORMAT = adapt_gguf_template_for_mtmd(template)
        log.info(
            "Vision chat handler: using GGUF-embedded template (%d bytes) from %s",
            len(template),
            model_path.name,
        )
    else:
        log.info(
            "Vision chat handler: no GGUF-embedded chat template for %s; using upstream default",
            model_path.name,
        )

    return handler_cls(str(mmproj_path), verbose=False)


def load_vision_llama(model_path: Path, mmproj_path: Path | None = None) -> Any:
    """Load a vision-capable ``Llama`` using the GGUF-templated chat handler."""
    from llama_cpp import Llama  # heavy native lib; keep import lazy

    install_llama_log_handler()
    if mmproj_path is None:
        mmproj_path = find_mmproj_for_model(model_path)

    chat_handler = build_vision_chat_handler(model_path, mmproj_path)

    kwargs: dict[str, Any] = {
        "model_path": str(model_path),
        "chat_handler": chat_handler,
        "verbose": False,
        "n_gpu_layers": -1,
        "n_ctx": _resolve_vision_n_ctx(model_path),
    }

    llama = suppress_native_stderr(Llama, **kwargs)
    metadata = getattr(llama, "metadata", {}) or {}
    n_ctx_fn = getattr(llama, "n_ctx", None)
    n_ctx = n_ctx_fn() if callable(n_ctx_fn) else "?"
    log.info(
        "Vision model loaded: model=%s mmproj=%s n_ctx=%s arch=%s",
        model_path.name,
        mmproj_path.name,
        n_ctx,
        metadata.get("general.architecture", "?"),
    )
    return llama

96 

97 

98def load_vision_llama(model_path: Path, mmproj_path: Path | None = None) -> Any: 

99 """Load a vision-capable ``Llama`` using the GGUF-templated chat handler.""" 

100 from llama_cpp import Llama # heavy native lib; keep import lazy 

101 

102 install_llama_log_handler() 

103 if mmproj_path is None: 

104 mmproj_path = find_mmproj_for_model(model_path) 

105 

106 chat_handler = build_vision_chat_handler(model_path, mmproj_path) 

107 

108 kwargs: dict[str, Any] = { 

109 "model_path": str(model_path), 

110 "chat_handler": chat_handler, 

111 "verbose": False, 

112 "n_gpu_layers": -1, 

113 "n_ctx": _resolve_vision_n_ctx(model_path), 

114 } 

115 

116 llama = suppress_native_stderr(Llama, **kwargs) 

117 metadata = getattr(llama, "metadata", {}) or {} 

118 n_ctx_fn = getattr(llama, "n_ctx", None) 

119 n_ctx = n_ctx_fn() if callable(n_ctx_fn) else "?" 

120 log.info( 

121 "Vision model loaded: model=%s mmproj=%s n_ctx=%s arch=%s", 

122 model_path.name, 

123 mmproj_path.name, 

124 n_ctx, 

125 metadata.get("general.architecture", "?"), 

126 ) 

127 return llama 
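# Rough usage sketch (illustrative only; the model path and prompt are made up,
# and the message shape follows llama-cpp-python's multimodal chat-completion
# convention rather than anything defined in this module):
#
#   llama = load_vision_llama(Path("models/ocr-vlm-Q4_K_M.gguf"))
#   result = llama.create_chat_completion(
#       messages=[
#           {
#               "role": "user",
#               "content": [
#                   {"type": "image_url", "image_url": {"url": "file:///tmp/page.png"}},
#                   {"type": "text", "text": "Transcribe the text in this image."},
#               ],
#           }
#       ],
#   )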

def _resolve_vision_n_ctx(model_path: Path) -> int:
    """Pick n_ctx for a vision load, clamped to the model's training context."""
    try:
        meta = read_gguf_metadata(model_path)
    except Exception:
        log.debug("read_gguf_metadata failed for vision %s", model_path, exc_info=True)
        meta = None
    train_ctx = int((meta or {}).get("context_length", "0"))
    if cfg.num_ctx is None:
        return 0  # 0 -> llama.cpp uses the model's training context
    if train_ctx <= 0:
        return cfg.num_ctx
    return min(cfg.num_ctx, train_ctx)
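# Worked example of the clamping above (illustrative numbers, not from any real
# config): with cfg.num_ctx = 8192 and a GGUF that reports context_length = 4096,
# the vision load uses n_ctx = min(8192, 4096) = 4096; with cfg.num_ctx = None it
# returns 0 and llama.cpp falls back to the model's own training context.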