Coverage for src / lilbee / registry.py: 100%
149 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-29 19:16 +0000
1"""Manifest store keyed by ``(hf_repo, gguf_filename)`` over the HF cache.
3Canonical ref: ``<hf_repo>/<gguf_filename>``. Two quants of the same
4repo are two distinct installations. Manifests live at
5``manifests/<repo--repo>/<filename>.json``; blobs at
6``models--<repo--repo>/blobs/<sha>``.
7"""
9from __future__ import annotations
11import hashlib
12import json
13import logging
14import os
15import re
16import tempfile
17from dataclasses import asdict, dataclass
18from pathlib import Path
20from lilbee.security import validate_path_within
22log = logging.getLogger(__name__)
24_HASH_ALGORITHM = "sha256"
25_HASH_CHUNK_SIZE = 8192 # bytes read per iteration when hashing
26_REPO_SEGMENT_RE = re.compile(r"^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$")
27_FILENAME_RE = re.compile(r"^[a-zA-Z0-9._-]+\.gguf$")
29REPO_DIR_SEPARATOR = "--"
32def _validate_hf_repo(hf_repo: str) -> str:
33 """Validate that a HuggingFace repo id has the form ``org/name``."""
34 if not hf_repo or not _REPO_SEGMENT_RE.match(hf_repo) or ".." in hf_repo:
35 raise ValueError(f"Invalid hf_repo: {hf_repo!r}")
36 return hf_repo
39def _validate_gguf_filename(filename: str) -> str:
40 """Validate that a filename is a safe ``.gguf`` basename (no path separators)."""
41 if not filename or not _FILENAME_RE.match(filename) or ".." in filename:
42 raise ValueError(f"Invalid gguf_filename: {filename!r}")
43 return filename
_REF_SHAPE_HINT = "Use '<org>/<repo>/<filename>.gguf'."


def parse_hf_ref(ref: str) -> tuple[str, str]:
    """Split ``<org>/<repo>/<file>.gguf`` into ``(hf_repo, gguf_filename)``.

    Everything before the last ``/`` is treated as the repo id and the rest as
    the file name; both halves are validated before being returned.

    Raises:
        ValueError: If *ref* does not end in ``.gguf``, contains no ``/``, or
            either half fails its validator.
    """
    has_ref_shape = ref.endswith(".gguf") and "/" in ref
    if not has_ref_shape:
        raise ValueError(f"Model ref {ref!r} is not a HuggingFace ref. {_REF_SHAPE_HINT}")

    # Split on the final slash only: the repo id itself contains one.
    repo_part, _, file_part = ref.rpartition("/")
    checked_repo = _validate_hf_repo(repo_part)
    checked_file = _validate_gguf_filename(file_part)
    return checked_repo, checked_file
def repo_to_dir(hf_repo: str) -> str:
    """Encode an HF repo id as a single directory name.

    Every ``/`` in *hf_repo* becomes ``REPO_DIR_SEPARATOR``.
    """
    segments = hf_repo.split("/")
    return REPO_DIR_SEPARATOR.join(segments)
@dataclass
class ModelManifest:
    """Metadata record for one installed model.

    A manifest is identified by the pair ``(hf_repo, gguf_filename)``.
    """

    hf_repo: str
    gguf_filename: str
    size_bytes: int
    task: str  # use lilbee.models.ModelTask values
    downloaded_at: str  # ISO 8601
    blob: str = ""  # SHA-256 hex of the blob in the HF cache

    @property
    def ref(self) -> str:
        """Return the canonical ``<hf_repo>/<gguf_filename>`` reference."""
        repo, filename = self.hf_repo, self.gguf_filename
        return f"{repo}/{filename}"
def _sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    The file is read in binary mode, ``_HASH_CHUNK_SIZE`` bytes at a time, so
    it is never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty read signals end of file.
        while block := stream.read(_HASH_CHUNK_SIZE):
            digest.update(block)
    return digest.hexdigest()
class ModelRegistry:
    """Read/write manifests and resolve refs to blobs in the HF cache.

    Layout under the ``models_dir`` passed to the constructor:

    * ``manifests/<org--repo>/<filename>.json`` holds one manifest per install.
    * ``models--<org--repo>/blobs/<digest>`` holds the model file itself.
    """

    def __init__(self, models_dir: Path) -> None:
        """Bind the registry to *models_dir*; nothing is created on disk here."""
        self._root = models_dir
        self._manifests_dir = models_dir / "manifests"

    def resolve(self, ref: str) -> Path:
        """Return the blob path for *ref*.

        Raises:
            ValueError: If *ref* is not a well-formed ``<org>/<repo>/<file>.gguf``.
            KeyError: If no manifest exists, the cache folder is missing, or the
                manifest's blob is absent or is not a regular file.
        """
        hf_repo, gguf_filename = parse_hf_ref(ref)
        manifest = self._read_manifest(hf_repo, gguf_filename)
        if manifest is None:
            raise KeyError(f"Model {ref} not installed")
        # Build the path from the validated repo parsed out of *ref* rather than
        # from the manifest's own (unvalidated, on-disk) copy of the field.
        cache_path = self._cache_path(hf_repo)
        if not cache_path.exists():
            raise KeyError(f"Cache folder missing for {ref}: {cache_path.name}")
        blob_name = manifest.blob
        # The blob must be a plain file name. An empty value (the dataclass
        # default) would otherwise resolve to the ``blobs`` directory itself,
        # and a value with separators could point outside it.
        if (
            not isinstance(blob_name, str)
            or not blob_name
            or Path(blob_name).name != blob_name
        ):
            raise KeyError(f"Blob file missing for {ref}: {manifest.blob}")
        blob_file = cache_path / "blobs" / blob_name
        # is_file(), not exists(): a directory is never a usable blob.
        if not blob_file.is_file():
            raise KeyError(f"Blob file missing for {ref}: {manifest.blob}")
        return blob_file

    def is_installed(self, ref: str) -> bool:
        """Return True if a model is installed and its blob is present."""
        try:
            self.resolve(ref)
        except (KeyError, ValueError):
            return False
        return True

    def install(
        self,
        hf_repo: str,
        gguf_filename: str,
        source_path: Path,
        manifest: ModelManifest,
    ) -> Path:
        """Write a manifest, copying *source_path* into the HF cache if needed.

        The blob is stored under the SHA-256 digest of *source_path*; if a blob
        with that digest already exists the copy is skipped. ``size_bytes``,
        ``task`` and ``downloaded_at`` are taken from *manifest*; the identity
        fields and ``blob`` are set from the arguments and the computed digest.

        Returns:
            Path of the blob inside the cache.

        Raises:
            ValueError: If *hf_repo* or *gguf_filename* is invalid. This is
                raised before the source file is hashed or copied.
        """
        # Reject bad names before hashing and copying a potentially large file.
        _validate_hf_repo(hf_repo)
        _validate_gguf_filename(gguf_filename)

        digest = _sha256_file(source_path)
        blobs_dir = self._cache_path(hf_repo) / "blobs"
        blob_path = blobs_dir / digest
        if not blob_path.exists():
            blobs_dir.mkdir(parents=True, exist_ok=True)
            self._copy_atomic(source_path, blob_path)

        updated = ModelManifest(
            hf_repo=hf_repo,
            gguf_filename=gguf_filename,
            size_bytes=manifest.size_bytes,
            task=manifest.task,
            downloaded_at=manifest.downloaded_at,
            blob=digest,
        )
        self._write_manifest(updated)
        return blob_path

    def remove(self, ref: str) -> bool:
        """Remove a manifest. Does not delete the cached blob.

        Returns:
            True if a manifest was deleted; False if *ref* is malformed or no
            manifest existed.
        """
        try:
            hf_repo, gguf_filename = parse_hf_ref(ref)
        except ValueError:
            return False
        manifest_path = self._manifest_path(hf_repo, gguf_filename)
        try:
            # Unlink directly instead of exists()-then-unlink, so a manifest
            # that vanishes in between is reported as "not there", not a crash.
            manifest_path.unlink()
        except FileNotFoundError:
            return False
        repo_dir = manifest_path.parent
        # Drop the per-repo directory once its last manifest is gone.
        if repo_dir.exists() and not any(repo_dir.iterdir()):
            repo_dir.rmdir()
        log.info("Removed manifest for %s (cache file untouched)", ref)
        return True

    def list_installed(self) -> list[ModelManifest]:
        """Return manifests for all installed models.

        Repo directories and manifest files are visited in sorted order;
        manifests that cannot be loaded are skipped.
        """
        manifests: list[ModelManifest] = []
        if not self._manifests_dir.exists():
            return manifests
        for repo_dir in sorted(self._manifests_dir.iterdir()):
            if not repo_dir.is_dir():
                continue
            for manifest_file in sorted(repo_dir.glob("*.gguf.json")):
                manifest = self._load_manifest_file(manifest_file)
                if manifest is not None:
                    manifests.append(manifest)
        return manifests

    def get_manifest(self, ref: str) -> ModelManifest | None:
        """Return the manifest for *ref* or None if not installed or malformed."""
        try:
            hf_repo, gguf_filename = parse_hf_ref(ref)
        except ValueError:
            return None
        return self._read_manifest(hf_repo, gguf_filename)

    def _cache_path(self, hf_repo: str) -> Path:
        """Return the ``models--<org--repo>`` cache folder for *hf_repo*."""
        return self._root / f"models--{repo_to_dir(hf_repo)}"

    @staticmethod
    def _copy_atomic(source_path: Path, dest_path: Path) -> None:
        """Copy *source_path* to *dest_path* so a partial file never appears there.

        The data is first copied to a temporary file in the destination
        directory and then renamed into place. An interrupted copy therefore
        cannot leave a truncated blob under a digest name, which a later
        ``install`` would take for a complete one.
        """
        import shutil

        fd, tmp_name = tempfile.mkstemp(
            dir=dest_path.parent, prefix=f"{dest_path.name}.", suffix=".tmp"
        )
        os.close(fd)  # copy2 reopens the file by name
        try:
            shutil.copy2(source_path, tmp_name)
            os.replace(tmp_name, dest_path)
        except BaseException:
            Path(tmp_name).unlink(missing_ok=True)
            raise

    def _manifest_path(self, hf_repo: str, gguf_filename: str) -> Path:
        """Return the manifest path for a validated ``(hf_repo, gguf_filename)``.

        Raises:
            ValueError: If either component fails validation.
        """
        repo = _validate_hf_repo(hf_repo)
        filename = _validate_gguf_filename(gguf_filename)
        path = self._manifests_dir / repo_to_dir(repo) / f"{filename}.json"
        # Project helper; presumably rejects paths escaping the manifests
        # directory -- confirm against lilbee.security.
        validate_path_within(path, self._manifests_dir)
        return path

    def _read_manifest(self, hf_repo: str, gguf_filename: str) -> ModelManifest | None:
        """Load the manifest for the given identity, or None if absent/corrupt."""
        return self._load_manifest_file(self._manifest_path(hf_repo, gguf_filename))

    def _write_manifest(self, manifest: ModelManifest) -> None:
        """Serialise *manifest* as JSON via a temp file plus ``os.replace``."""
        path = self._manifest_path(manifest.hf_repo, manifest.gguf_filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        data = json.dumps(asdict(manifest), indent=2)
        tmp_path: str | None = None
        try:
            # Same directory as the target so the final rename stays on one
            # filesystem. The encoding is explicit so it matches the reader.
            with tempfile.NamedTemporaryFile(
                dir=path.parent, suffix=".tmp", mode="w", encoding="utf-8", delete=False
            ) as tmp:
                tmp_path = tmp.name
                tmp.write(data)
            os.replace(tmp_path, path)
        except BaseException:
            # Never leave a stray temp file behind, even on KeyboardInterrupt.
            if tmp_path is not None:
                Path(tmp_path).unlink(missing_ok=True)
            raise

    def _load_manifest_file(self, path: Path) -> ModelManifest | None:
        """Parse the manifest at *path*.

        Returns:
            The manifest, or None when the file does not exist or its content
            cannot be turned into a ``ModelManifest`` (a warning is logged in
            the latter case).
        """
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            return ModelManifest(**data)
        except (FileNotFoundError, NotADirectoryError):
            # Absent manifest: the model is simply not installed.
            return None
        except (json.JSONDecodeError, UnicodeDecodeError, TypeError, KeyError):
            # Undecodable bytes count as corruption too, so one bad file does
            # not abort a listing.
            log.warning("Corrupt manifest: %s", path)
            return None