Coverage for src/lilbee/crawler/save.py: 100% (101 statements)


1"""URL → filename mapping, metadata I/O, and per-page save-to-disk. 

2 

3Backend-agnostic: all I/O lives here so a future adapter doesn't 

4need to reinvent the crawl metadata sidecar or the ``_web/`` layout. 

5""" 

6 

7from __future__ import annotations 

8 

9import hashlib 

10import json 

11import logging 

12import re 

13import tempfile 

14from dataclasses import dataclass 

15from pathlib import Path 

16from urllib.parse import urlparse 

17 

18from lilbee.config import cfg 

19from lilbee.crawler.models import CrawlResult 

20from lilbee.security import validate_path_within 

21 

22log = logging.getLogger(__name__) 

23 

24# Maximum filename length before truncation (most filesystems cap at 255 bytes) 

25_MAX_FILENAME_LEN = 200 

26 

27# Sentinel for index pages (trailing slash or empty path) 

28_INDEX_FILENAME = "index.md" 

29 

30# How often the crawl metadata JSON is rewritten during a streaming crawl. 

31# Markdown files are durable per-page; metadata batches to keep write volume 

32# bounded. Worst-case loss on crash is N-1 entries, recoverable from the files. 

33METADATA_FLUSH_INTERVAL = 10 

34 

35 

36def url_to_filename(url: str) -> str: 

37 """Convert a URL to a safe filesystem path ending in .md. 

38 

39 Examples: 

40 https://docs.python.org/3/tutorial/ → docs.python.org/3/tutorial/index.md 

41 https://example.com/page?q=1#frag → example.com/page.md 

42 https://example.com/ → example.com/index.md 

43 """ 

44 parsed = urlparse(url) 

45 host = parsed.hostname or "unknown" 

46 path = parsed.path.rstrip("/") 

47 

48 if not path or path == "/": 

49 return f"{host}/{_INDEX_FILENAME}" 

50 

51 # Strip leading slash 

52 path = path.lstrip("/") 

53 

54 # Neutralize path traversal segments 

55 path = re.sub(r"\.\.+", "_", path) 

56 

57 # Replace unsafe filesystem characters 

58 path = re.sub(r'[<>:"|?*]', "_", path) 

59 

60 # If the last segment has no extension, treat as directory 

61 last_segment = path.rsplit("/", 1)[-1] 

62 if "." not in last_segment: 

63 path = f"{path}/{_INDEX_FILENAME}" 

64 else: 

65 # Replace existing extension with .md 

66 path = re.sub(r"\.[^./]+$", ".md", path) 

67 

68 full = f"{host}/{path}" 

69 

70 # Truncate if too long, preserving .md extension 

71 if len(full) > _MAX_FILENAME_LEN: 

72 url_hash = hashlib.sha256(url.encode()).hexdigest()[:12] 

73 full = full[: _MAX_FILENAME_LEN - 16] + f"_{url_hash}.md" 

74 

75 return full 
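
# Illustrative usage sketch (not part of the module): a few mappings implied
# by the rules above. The first two follow directly from the code; the hash
# suffix in the truncation case depends on the URL, so it is shown schematically.
#
#     url_to_filename("https://example.com/docs/intro.html")
#     # -> "example.com/docs/intro.md"            (extension swapped for .md)
#     url_to_filename("https://example.com/a/b")
#     # -> "example.com/a/b/index.md"             (no extension, treated as directory)
#     url_to_filename("https://example.com/" + "x" * 300)
#     # -> "example.com/xxx..._<12 hex chars>.md" (truncated past 200 chars)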


def _web_dir() -> Path:
    """Return the _web/ subdirectory under documents."""
    return cfg.documents_dir / "_web"


def _crawl_meta_path() -> Path:
    """Path to the crawl metadata sidecar JSON."""
    return cfg.data_dir / "crawl_meta.json"


@dataclass
class CrawlMeta:
    """Metadata for a single crawled URL."""

    file: str
    content_hash: str
    crawled_at: str


def load_crawl_metadata() -> dict[str, CrawlMeta]:
    """Load URL→metadata mapping from the JSON sidecar."""
    path = _crawl_meta_path()
    if not path.exists():
        return {}
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        return {}
    result: dict[str, CrawlMeta] = {}
    for url, data in raw.items():
        try:
            result[url] = CrawlMeta(**data)
        except (TypeError, KeyError):
            log.warning("Skipping malformed crawl metadata entry: %s", url)
    return result


def save_crawl_metadata(meta: dict[str, CrawlMeta]) -> None:
    """Persist URL→metadata mapping to the JSON sidecar (atomic write)."""
    path = _crawl_meta_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    serializable = {
        url: {"file": m.file, "content_hash": m.content_hash, "crawled_at": m.crawled_at}
        for url, m in meta.items()
    }
    tmp_name: str | None = None
    try:
        with tempfile.NamedTemporaryFile(dir=path.parent, suffix=".tmp", delete=False) as tmp:
            tmp_name = tmp.name
            tmp.write(json.dumps(serializable, indent=2).encode("utf-8"))
        Path(tmp_name).replace(path)
    except BaseException:
        if tmp_name is not None:
            Path(tmp_name).unlink(missing_ok=True)
        raise
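
# Illustrative round trip (not part of the module), assuming an empty sidecar.
# ``CrawlMeta`` fields mirror the JSON keys one-to-one, so entries survive the
# trip unchanged, and the tmp-file + ``replace`` dance above means a reader
# never observes a half-written sidecar.
#
#     meta = load_crawl_metadata()                   # {} on first run
#     meta["https://example.com/"] = CrawlMeta(
#         file="example.com/index.md",
#         content_hash=content_hash("# Example\n"),  # helper defined below
#         crawled_at="2026-04-29T19:16:00+00:00",    # illustrative timestamp
#     )
#     save_crawl_metadata(meta)
#     assert load_crawl_metadata() == meta           # dataclass eq is field-wise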


def content_hash(text: str) -> str:
    """SHA-256 hex digest of text content."""
    return hashlib.sha256(text.encode()).hexdigest()


@dataclass(frozen=True)
class SaveOutcome:
    """Return value of ``_save_single_result``: written path and the hash/filename used."""

    path: Path
    filename: str
    content_hash: str


def _save_single_result(result: CrawlResult, meta: dict[str, CrawlMeta]) -> SaveOutcome | None:
    """Write one crawl result to disk if it's new or changed.

    Returns the outcome (written path plus reusable filename/hash), or
    None if skipped (failure, empty markdown, unchanged hash with file
    on disk, or blocked by path traversal).
    """
    if not result.success or not result.markdown.strip():
        return None
    filename = url_to_filename(result.url)
    web_dir = _web_dir()
    file_path = web_dir / filename
    resolved_web_dir = web_dir.resolve()
    try:
        validate_path_within(file_path, resolved_web_dir)
    except ValueError:
        log.warning("Path traversal blocked: %s -> %s", result.url, file_path)
        return None
    new_hash = content_hash(result.markdown)
    prev = meta.get(result.url)
    if prev is not None and prev.content_hash == new_hash and file_path.exists():
        log.info("Content unchanged, skipping save: %s", result.url)
        return None
    file_path.parent.mkdir(parents=True, exist_ok=True)
    file_path.write_text(result.markdown, encoding="utf-8")
    return SaveOutcome(path=file_path, filename=filename, content_hash=new_hash)


def _update_single_metadata(
    meta: dict[str, CrawlMeta],
    url: str,
    outcome: SaveOutcome,
    now: str,
) -> None:
    """Update the metadata dict in place with a previously-computed outcome."""
    meta[url] = CrawlMeta(
        file=outcome.filename,
        content_hash=outcome.content_hash,
        crawled_at=now,
    )
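

# Illustrative sketch (not part of the module): how a streaming crawl loop
# might wire these pieces together. ``results`` and ``utc_now_iso`` are
# hypothetical stand-ins for whatever the caller provides. The flush cadence
# comes from METADATA_FLUSH_INTERVAL above; worst-case loss on a crash is
# METADATA_FLUSH_INTERVAL - 1 entries, recoverable from the markdown files.
#
#     meta = load_crawl_metadata()
#     for i, result in enumerate(results, start=1):  # results: Iterable[CrawlResult]
#         outcome = _save_single_result(result, meta)
#         if outcome is not None:
#             _update_single_metadata(meta, result.url, outcome, now=utc_now_iso())
#         if i % METADATA_FLUSH_INTERVAL == 0:
#             save_crawl_metadata(meta)
#     save_crawl_metadata(meta)                      # final flush for the tail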