Python · 2050 bytes Raw Blame History
1 """JSON load/save for the corpus index.
2
3 The index is a flat array of `IndexEntry` objects. We sort entries by
4 `section_id` before serializing so byte-identical corpora + identical
5 insertion orders produce byte-identical index files.
6
7 The JSON format is `pydantic.TypeAdapter`-serialized with sorted keys
8 and a trailing newline. I/O is atomic via `dlm.io.atomic.write_bytes`
9 so concurrent readers never see a torn file.
10 """
11
12 from __future__ import annotations
13
14 import json
15 from pathlib import Path
16 from typing import Any
17
18 from pydantic import TypeAdapter, ValidationError
19
20 from dlm.io.atomic import write_bytes
21 from dlm.replay.errors import IndexCorruptError
22 from dlm.replay.models import IndexEntry
23
24 # Module-level adapter, built once: validates and serializes the entire
24 # index as a single `list[IndexEntry]` (used by both load_index and save_index).
24 _INDEX_ADAPTER: TypeAdapter[list[IndexEntry]] = TypeAdapter(list[IndexEntry])
25
26
27 def load_index(path: Path) -> list[IndexEntry]:
28 """Return the list of entries at `path`, or `[]` if `path` is missing.
29
30 Raises `IndexCorruptError` if the file exists but isn't a valid
31 JSON array of `IndexEntry` records.
32 """
33 if not path.exists():
34 return []
35 try:
36 raw = path.read_bytes()
37 except OSError as exc:
38 raise IndexCorruptError(f"cannot read {path}: {exc}") from exc
39 try:
40 data: Any = json.loads(raw)
41 except json.JSONDecodeError as exc:
42 raise IndexCorruptError(f"{path} is not valid JSON: {exc}") from exc
43 try:
44 return _INDEX_ADAPTER.validate_python(data)
45 except ValidationError as exc:
46 raise IndexCorruptError(f"{path} has invalid entries: {exc}") from exc
47
48
49 def save_index(path: Path, entries: list[IndexEntry]) -> None:
50 """Atomically write `entries` to `path`, sorted by `section_id`.
51
52 Serializes with `mode="json"` so `datetime` fields become ISO-8601
53 strings. Parent directory must already exist.
54 """
55 sorted_entries = sorted(entries, key=lambda e: e.section_id)
56 payload = _INDEX_ADAPTER.dump_python(sorted_entries, mode="json")
57 blob = (json.dumps(payload, sort_keys=True, indent=2) + "\n").encode("utf-8")
58 write_bytes(path, blob)