documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 2050 bytes Raw Blame History

  
        1
        """JSON load/save for the corpus index.
      
        2
        
        3
        The index is a flat array of `IndexEntry` objects. We sort entries by
      
        4
        `section_id` before serializing so byte-identical corpora + identical
      
        5
        insertion orders produce byte-identical index files.
      
        6
        
        7
        The JSON format is `pydantic.TypeAdapter`-serialized with sorted keys
      
        8
        and a trailing newline. I/O is atomic via `dlm.io.atomic.write_bytes`
      
        9
        so concurrent readers never see a torn file.
      
        10
        """
      
        11
        
        12
        from __future__ import annotations
      
        13
        
        14
        import json
      
        15
        from pathlib import Path
      
        16
        from typing import Any
      
        17
        
        18
        from pydantic import TypeAdapter, ValidationError
      
        19
        
        20
        from dlm.io.atomic import write_bytes
      
        21
        from dlm.replay.errors import IndexCorruptError
      
        22
        from dlm.replay.models import IndexEntry
      
        23
        
        24
        # Module-level adapter for `list[IndexEntry]`, built once and shared:
        # `load_index` uses it to validate parsed JSON, `save_index` uses it to
        # dump entries into JSON-compatible Python data.
        _INDEX_ADAPTER: TypeAdapter[list[IndexEntry]] = TypeAdapter(list[IndexEntry])
      
        25
        
        26
        
        27
        def load_index(path: Path) -> list[IndexEntry]:
            """Return the list of entries at `path`, or `[]` if `path` is missing.

            The file is read directly rather than probed with `path.exists()`
            first, so a file that disappears between a check and the read is
            still treated as missing instead of being reported as corrupt.

            Raises `IndexCorruptError` if the file exists but cannot be read,
            or isn't a valid JSON array of `IndexEntry` records. Bytes that
            cannot be decoded as text count as invalid JSON.
            """
            try:
                raw = path.read_bytes()
            except (FileNotFoundError, NotADirectoryError):
                # A missing file (or missing/non-directory parent component)
                # means there is no index yet.
                return []
            except OSError as exc:
                raise IndexCorruptError(f"cannot read {path}: {exc}") from exc
            try:
                data: Any = json.loads(raw)
            except (json.JSONDecodeError, UnicodeDecodeError) as exc:
                # `json.loads` decodes bytes itself; undecodable input raises
                # `UnicodeDecodeError`, which is not a `JSONDecodeError`.
                raise IndexCorruptError(f"{path} is not valid JSON: {exc}") from exc
            try:
                return _INDEX_ADAPTER.validate_python(data)
            except ValidationError as exc:
                raise IndexCorruptError(f"{path} has invalid entries: {exc}") from exc
      
        47
        
        48
        
        49
        def save_index(path: Path, entries: list[IndexEntry]) -> None:
            """Atomically persist `entries` at `path`, ordered by `section_id`.

            Entries are dumped with `mode="json"` (so `datetime` fields become
            ISO-8601 strings), rendered as JSON with sorted keys, two-space
            indentation and a trailing newline, then encoded as UTF-8 and handed
            to `write_bytes`. The caller's list is left in its original order.
            Parent directory must already exist.
            """
            ordered = list(entries)
            ordered.sort(key=lambda entry: entry.section_id)
            jsonable = _INDEX_ADAPTER.dump_python(ordered, mode="json")
            text = json.dumps(jsonable, indent=2, sort_keys=True)
            write_bytes(path, f"{text}\n".encode("utf-8"))

1	"""JSON load/save for the corpus index.
2
3	The index is a flat array of `IndexEntry` objects. We sort entries by
4	`section_id` before serializing so byte-identical corpora + identical
5	insertion orders produce byte-identical index files.
6
7	The JSON format is `pydantic.TypeAdapter`-serialized with sorted keys
8	and a trailing newline. I/O is atomic via `dlm.io.atomic.write_bytes`
9	so concurrent readers never see a torn file.
10	"""
11
12	from __future__ import annotations
13
14	import json
15	from pathlib import Path
16	from typing import Any
17
18	from pydantic import TypeAdapter, ValidationError
19
20	from dlm.io.atomic import write_bytes
21	from dlm.replay.errors import IndexCorruptError
22	from dlm.replay.models import IndexEntry
23
# Module-level adapter for `list[IndexEntry]`, built once and shared:
# `load_index` uses it to validate parsed JSON, `save_index` uses it to
# dump entries into JSON-compatible Python data.
24	_INDEX_ADAPTER: TypeAdapter[list[IndexEntry]] = TypeAdapter(list[IndexEntry])
25
26
def load_index(path: Path) -> list[IndexEntry]:
    """Return the list of entries at `path`, or `[]` if `path` is missing.

    The file is read directly rather than probed with `path.exists()`
    first, so a file that disappears between a check and the read is
    still treated as missing instead of being reported as corrupt.

    Raises `IndexCorruptError` if the file exists but cannot be read,
    or isn't a valid JSON array of `IndexEntry` records. Bytes that
    cannot be decoded as text count as invalid JSON.
    """
    try:
        raw = path.read_bytes()
    except (FileNotFoundError, NotADirectoryError):
        # A missing file (or missing/non-directory parent component)
        # means there is no index yet.
        return []
    except OSError as exc:
        raise IndexCorruptError(f"cannot read {path}: {exc}") from exc
    try:
        data: Any = json.loads(raw)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # `json.loads` decodes bytes itself; undecodable input raises
        # `UnicodeDecodeError`, which is not a `JSONDecodeError`.
        raise IndexCorruptError(f"{path} is not valid JSON: {exc}") from exc
    try:
        return _INDEX_ADAPTER.validate_python(data)
    except ValidationError as exc:
        raise IndexCorruptError(f"{path} has invalid entries: {exc}") from exc
47
48
def save_index(path: Path, entries: list[IndexEntry]) -> None:
    """Atomically persist `entries` at `path`, ordered by `section_id`.

    Entries are dumped with `mode="json"` (so `datetime` fields become
    ISO-8601 strings), rendered as JSON with sorted keys, two-space
    indentation and a trailing newline, then encoded as UTF-8 and handed
    to `write_bytes`. The caller's list is left in its original order.
    Parent directory must already exist.
    """
    ordered = list(entries)
    ordered.sort(key=lambda entry: entry.section_id)
    jsonable = _INDEX_ADAPTER.dump_python(ordered, mode="json")
    text = json.dumps(jsonable, indent=2, sort_keys=True)
    write_bytes(path, f"{text}\n".encode("utf-8"))