Coverage for src / lilbee / registry.py: 100%

149 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-29 19:16 +0000

1"""Manifest store keyed by ``(hf_repo, gguf_filename)`` over the HF cache. 

2 

3Canonical ref: ``<hf_repo>/<gguf_filename>``. Two quants of the same 

4repo are two distinct installations. Manifests live at 

5``manifests/<repo--repo>/<filename>.json``; blobs at 

6``models--<repo--repo>/blobs/<sha>``. 

7""" 

8 

9from __future__ import annotations 

10 

11import hashlib 

12import json 

13import logging 

14import os 

15import re 

16import tempfile 

17from dataclasses import asdict, dataclass 

18from pathlib import Path 

19 

20from lilbee.security import validate_path_within 

21 

log = logging.getLogger(__name__)

_HASH_ALGORITHM = "sha256"
_HASH_CHUNK_SIZE = 8192  # bytes read per iteration when hashing

# Both patterns are anchored with ``\Z`` rather than ``$``: ``$`` also matches
# just before a trailing newline, which would let "org/name\n" slip through
# and end up embedded in a directory or file name.
_REPO_SEGMENT_RE = re.compile(r"^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+\Z")
_FILENAME_RE = re.compile(r"^[a-zA-Z0-9._-]+\.gguf\Z")

REPO_DIR_SEPARATOR = "--"


def _validate_hf_repo(hf_repo: str) -> str:
    """Validate that a HuggingFace repo id has the form ``org/name``.

    Returns *hf_repo* unchanged when it is valid.

    Raises:
        ValueError: if *hf_repo* is not a string, is empty, contains ``..``,
            or is not exactly two ``[a-zA-Z0-9._-]+`` segments joined by a
            single ``/`` (a trailing newline is rejected too).
    """
    if (
        not isinstance(hf_repo, str)
        or ".." in hf_repo
        # fullmatch: the entire string must match, not just a prefix of it.
        or not _REPO_SEGMENT_RE.fullmatch(hf_repo)
    ):
        raise ValueError(f"Invalid hf_repo: {hf_repo!r}")
    return hf_repo


def _validate_gguf_filename(filename: str) -> str:
    """Validate that a filename is a safe ``.gguf`` basename (no path separators).

    Returns *filename* unchanged when it is valid.

    Raises:
        ValueError: if *filename* is not a string, is empty, contains ``..``,
            contains characters outside ``[a-zA-Z0-9._-]``, or does not end
            in ``.gguf`` (a trailing newline is rejected too).
    """
    if (
        not isinstance(filename, str)
        or ".." in filename
        or not _FILENAME_RE.fullmatch(filename)
    ):
        raise ValueError(f"Invalid gguf_filename: {filename!r}")
    return filename

44 

45 

_REF_SHAPE_HINT = "Use '<org>/<repo>/<filename>.gguf'."


def parse_hf_ref(ref: str) -> tuple[str, str]:
    """Break a canonical model ref into its repo and filename parts.

    A ref looks like ``<org>/<repo>/<file>.gguf``; everything before the last
    ``/`` is the HuggingFace repo id and everything after it is the GGUF
    filename. Both halves are validated before being returned.

    Raises:
        ValueError: if *ref* does not end in ``.gguf``, contains no ``/``,
            or either half fails validation.
    """
    has_ref_shape = ref.endswith(".gguf") and "/" in ref
    if not has_ref_shape:
        raise ValueError(f"Model ref {ref!r} is not a HuggingFace ref. {_REF_SHAPE_HINT}")
    # A "/" is guaranteed here, so rpartition splits at the last one.
    repo_part, _, file_part = ref.rpartition("/")
    return _validate_hf_repo(repo_part), _validate_gguf_filename(file_part)

55 

56 

def repo_to_dir(hf_repo: str) -> str:
    """Turn an HF repo id into a single directory name (HF cache convention).

    Every ``/`` in *hf_repo* is swapped for ``REPO_DIR_SEPARATOR``.
    """
    segments = hf_repo.split("/")
    return REPO_DIR_SEPARATOR.join(segments)

60 

61 

@dataclass
class ModelManifest:
    """Metadata record for a single installed model.

    A manifest is identified by the pair ``(hf_repo, gguf_filename)``.
    """

    hf_repo: str
    gguf_filename: str
    size_bytes: int
    task: str  # use lilbee.models.ModelTask values
    downloaded_at: str  # ISO 8601
    blob: str = ""  # SHA-256 hex of the blob in the HF cache

    @property
    def ref(self) -> str:
        """Canonical ``<hf_repo>/<gguf_filename>`` reference for this model."""
        return "{}/{}".format(self.hf_repo, self.gguf_filename)

76 

77 

def _sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    The file is read in ``_HASH_CHUNK_SIZE``-byte pieces so it never has to
    be held in memory all at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # read() returns b"" at end of file, which ends the loop.
        while piece := stream.read(_HASH_CHUNK_SIZE):
            digest.update(piece)
    return digest.hexdigest()

88 

89 

class ModelRegistry:
    """Read/write manifests and resolve refs to blobs in the HF cache.

    Layout under ``models_dir``:

    * ``manifests/<org>--<repo>/<filename>.json`` -- one manifest per
      installed ``(hf_repo, gguf_filename)`` pair.
    * ``models--<org>--<repo>/blobs/<sha256>`` -- the model files themselves.
    """

    def __init__(self, models_dir: Path) -> None:
        """Bind the registry to *models_dir*; nothing is created on disk yet."""
        self._root = models_dir
        self._manifests_dir = models_dir / "manifests"

    def resolve(self, ref: str) -> Path:
        """Return the blob path for *ref*.

        Raises:
            ValueError: if *ref* is not a well-formed ``<org>/<repo>/<file>.gguf``.
            KeyError: if no manifest exists, the cache folder is missing, the
                manifest carries no usable blob digest, or the blob file is
                not present.
        """
        hf_repo, gguf_filename = parse_hf_ref(ref)
        manifest = self._read_manifest(hf_repo, gguf_filename)
        if manifest is None:
            raise KeyError(f"Model {ref} not installed")
        # Build the cache path from the validated ref, not from the manifest's
        # own hf_repo field: the file content is unvalidated, and install()
        # always writes the same repo the manifest is keyed by.
        cache_path = self._root / f"models--{repo_to_dir(hf_repo)}"
        if not cache_path.exists():
            raise KeyError(f"Cache folder missing for {ref}: {cache_path.name}")
        blob = manifest.blob
        # An empty blob would resolve to the blobs directory itself, and a
        # value with path separators could point outside it.
        if not isinstance(blob, str) or not blob or Path(blob).name != blob:
            raise KeyError(f"Manifest for {ref} has no valid blob digest: {blob!r}")
        blob_file = cache_path / "blobs" / blob
        # is_file() rather than exists(): a directory is never a valid blob.
        if not blob_file.is_file():
            raise KeyError(f"Blob file missing for {ref}: {blob}")
        return blob_file

    def is_installed(self, ref: str) -> bool:
        """Return True if a model is installed and its blob is present."""
        try:
            self.resolve(ref)
        except (KeyError, ValueError):
            return False
        return True

    def install(
        self,
        hf_repo: str,
        gguf_filename: str,
        source_path: Path,
        manifest: ModelManifest,
    ) -> Path:
        """Write a manifest, copying *source_path* into the HF cache if needed.

        The blob is stored under its SHA-256 digest, so installing the same
        file twice reuses the existing blob. ``size_bytes``, ``task`` and
        ``downloaded_at`` are taken from *manifest*; its identity fields and
        ``blob`` are replaced by *hf_repo*, *gguf_filename* and the computed
        digest.

        Returns:
            Path of the blob inside the cache.

        Raises:
            ValueError: if *hf_repo* or *gguf_filename* is invalid. This is
                checked before anything is hashed or copied.
        """
        # Validate up front so a bad identifier cannot leave an orphaned blob
        # (and cache directory) behind before the manifest write rejects it.
        self._manifest_path(hf_repo, gguf_filename)

        digest = _sha256_file(source_path)
        blobs_dir = self._root / f"models--{repo_to_dir(hf_repo)}" / "blobs"
        blob_path = blobs_dir / digest
        if not blob_path.exists():
            blobs_dir.mkdir(parents=True, exist_ok=True)
            self._copy_atomic(source_path, blob_path)

        updated = ModelManifest(
            hf_repo=hf_repo,
            gguf_filename=gguf_filename,
            size_bytes=manifest.size_bytes,
            task=manifest.task,
            downloaded_at=manifest.downloaded_at,
            blob=digest,
        )
        self._write_manifest(updated)
        return blob_path

    @staticmethod
    def _copy_atomic(source: Path, dest: Path) -> None:
        """Copy *source* to *dest* so that *dest* never exists half-written.

        The data goes to a temporary file in ``dest``'s directory first and
        is then moved into place with ``os.replace``. Without this, an
        interrupted copy would leave a truncated file under the digest name,
        which later installs would treat as an already-present blob.
        """
        import shutil

        fd, tmp_name = tempfile.mkstemp(
            dir=dest.parent, prefix=f".{dest.name}.", suffix=".tmp"
        )
        os.close(fd)
        try:
            shutil.copy2(source, tmp_name)
            os.replace(tmp_name, dest)
        except BaseException:
            # Clean up on any failure, including KeyboardInterrupt.
            Path(tmp_name).unlink(missing_ok=True)
            raise

    def remove(self, ref: str) -> bool:
        """Remove a manifest. Does not delete the cached blob.

        Returns:
            True if a manifest was removed, False if *ref* is malformed or
            no manifest exists for it.
        """
        try:
            hf_repo, gguf_filename = parse_hf_ref(ref)
        except ValueError:
            return False
        manifest_path = self._manifest_path(hf_repo, gguf_filename)
        try:
            # Unlink directly instead of exists()+unlink(): no window in which
            # a concurrent removal turns this into an unexpected exception.
            manifest_path.unlink()
        except FileNotFoundError:
            return False
        repo_dir = manifest_path.parent
        try:
            # Best-effort pruning of the now-possibly-empty repo directory;
            # rmdir refuses to remove a directory that still has entries.
            repo_dir.rmdir()
        except OSError:
            log.debug("Manifest directory kept: %s", repo_dir)
        log.info("Removed manifest for %s (cache file untouched)", ref)
        return True

    def list_installed(self) -> list[ModelManifest]:
        """Return manifests for all installed models.

        Results are ordered by repo directory name, then manifest file name.
        Manifests that cannot be parsed are skipped.
        """
        manifests: list[ModelManifest] = []
        if not self._manifests_dir.exists():
            return manifests
        for repo_dir in sorted(self._manifests_dir.iterdir()):
            if not repo_dir.is_dir():
                continue
            for manifest_file in sorted(repo_dir.glob("*.gguf.json")):
                manifest = self._load_manifest_file(manifest_file)
                if manifest is not None:
                    manifests.append(manifest)
        return manifests

    def get_manifest(self, ref: str) -> ModelManifest | None:
        """Return the manifest for *ref* or None if not installed."""
        try:
            hf_repo, gguf_filename = parse_hf_ref(ref)
        except ValueError:
            return None
        return self._read_manifest(hf_repo, gguf_filename)

    def _manifest_path(self, hf_repo: str, gguf_filename: str) -> Path:
        """Return the manifest location for a validated ``(repo, filename)`` pair.

        Raises:
            ValueError: if either identifier fails validation.
        """
        repo = _validate_hf_repo(hf_repo)
        filename = _validate_gguf_filename(gguf_filename)
        path = self._manifests_dir / repo_to_dir(repo) / f"{filename}.json"
        # Second check on top of the regex validation above.
        validate_path_within(path, self._manifests_dir)
        return path

    def _read_manifest(self, hf_repo: str, gguf_filename: str) -> ModelManifest | None:
        """Load the manifest for ``(hf_repo, gguf_filename)``, or None."""
        return self._load_manifest_file(self._manifest_path(hf_repo, gguf_filename))

    def _write_manifest(self, manifest: ModelManifest) -> None:
        """Serialise *manifest* to JSON and write it atomically.

        The JSON goes to a temporary file in the target directory and is
        moved into place with ``os.replace``, so readers never see a
        partially written manifest.
        """
        path = self._manifest_path(manifest.hf_repo, manifest.gguf_filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        data = json.dumps(asdict(manifest), indent=2)
        tmp_path: str | None = None
        try:
            with tempfile.NamedTemporaryFile(
                dir=path.parent,
                suffix=".tmp",
                mode="w",
                encoding="utf-8",  # explicit: do not depend on the locale
                delete=False,
            ) as tmp:
                tmp_path = tmp.name
                tmp.write(data)
            os.replace(tmp_path, path)
        except BaseException:
            # Clean up on any failure, including KeyboardInterrupt.
            if tmp_path is not None:
                Path(tmp_path).unlink(missing_ok=True)
            raise

    def _load_manifest_file(self, path: Path) -> ModelManifest | None:
        """Parse the manifest at *path*.

        Returns None if the file does not exist, or if it is corrupt
        (undecodable bytes, invalid JSON, or fields that do not match
        ``ModelManifest``); corruption is logged as a warning.
        """
        if not path.exists():
            return None
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            return ModelManifest(**data)
        except FileNotFoundError:
            # Removed between the exists() check and the read.
            return None
        except (ValueError, TypeError, KeyError):
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError, so a manifest with invalid bytes is reported
            # as corrupt instead of crashing list_installed().
            log.warning("Corrupt manifest: %s", path)
            return None