"""JSON load/save for the corpus index.

The index is a flat array of `IndexEntry` objects. We sort entries by
`section_id` before serializing so byte-identical corpora + identical
insertion orders produce byte-identical index files.

The JSON format is `pydantic.TypeAdapter`-serialized with sorted keys
and a trailing newline. Writes are atomic via `dlm.io.atomic.write_bytes`
so concurrent readers never see a torn file.
"""
| 11 |
|
| 12 |
from __future__ import annotations |
| 13 |
|
| 14 |
import json |
| 15 |
from pathlib import Path |
| 16 |
from typing import Any |
| 17 |
|
| 18 |
from pydantic import TypeAdapter, ValidationError |
| 19 |
|
| 20 |
from dlm.io.atomic import write_bytes |
| 21 |
from dlm.replay.errors import IndexCorruptError |
| 22 |
from dlm.replay.models import IndexEntry |
| 23 |
|
| 24 |
# Module-level adapter for the on-disk shape (a list of `IndexEntry`).
# Constructed once at import and shared by `load_index` (validation) and
# `save_index` (serialization) instead of being rebuilt on every call.
_INDEX_ADAPTER: TypeAdapter[list[IndexEntry]] = TypeAdapter(list[IndexEntry])
| 25 |
|
| 26 |
|
| 27 |
def load_index(path: Path) -> list[IndexEntry]:
    """Return the list of entries at `path`, or `[]` if `path` is missing.

    Raises `IndexCorruptError` if the file exists but cannot be read,
    cannot be decoded/parsed as JSON, or isn't a valid JSON array of
    `IndexEntry` records. The underlying exception is chained as the
    cause in every case.
    """
    # Read first and treat "not found" as an empty index rather than
    # checking `path.exists()` up front: the file can disappear between
    # the check and the read, which would otherwise be reported as a
    # corrupt index. `NotADirectoryError` is included because
    # `Path.exists()` also reports such paths as missing.
    try:
        raw = path.read_bytes()
    except (FileNotFoundError, NotADirectoryError):
        return []
    except OSError as exc:
        raise IndexCorruptError(f"cannot read {path}: {exc}") from exc
    # `json.loads` decodes bytes input itself; undecodable bytes raise
    # `UnicodeDecodeError`, which is not a `JSONDecodeError`, so both
    # must be caught for a corrupt file to surface as `IndexCorruptError`.
    try:
        data: Any = json.loads(raw)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        raise IndexCorruptError(f"{path} is not valid JSON: {exc}") from exc
    try:
        return _INDEX_ADAPTER.validate_python(data)
    except ValidationError as exc:
        raise IndexCorruptError(f"{path} has invalid entries: {exc}") from exc
| 47 |
|
| 48 |
|
| 49 |
def save_index(path: Path, entries: list[IndexEntry]) -> None:
    """Write `entries` to `path` via `write_bytes`, ordered by `section_id`.

    The entries are dumped with `mode="json"` (so `datetime` fields become
    ISO-8601 strings), rendered as JSON with sorted keys and two-space
    indentation, terminated with a newline, and encoded as UTF-8. The
    caller's list is not reordered; a sorted copy is serialized. Parent
    directory must already exist.
    """
    # Sort a copy so the caller's list keeps its insertion order; the sort
    # is stable, so entries with equal `section_id` keep their input order.
    ordered = list(entries)
    ordered.sort(key=lambda entry: entry.section_id)
    document = json.dumps(
        _INDEX_ADAPTER.dump_python(ordered, mode="json"),
        sort_keys=True,
        indent=2,
    )
    write_bytes(path, f"{document}\n".encode("utf-8"))