@@ -1,28 +1,31 @@ |
| 1 | 1 | """Atomic filesystem writes. |
| 2 | 2 | |
| 3 | 3 | Single entry point for "write to a tmp sibling then `os.replace` onto the |
| 4 | | -final name." Three Phase-0 modules (`dlm.io.text`, `dlm.store.manifest`, |
| 4 | +final name." Three early modules (`dlm.io.text`, `dlm.store.manifest`, |
| 5 | 5 | `dlm.store.paths`) independently grew this pattern; consolidating here |
| 6 | 6 | gives us one place to add `fsync` / directory-sync semantics later. |
| 7 | 7 | |
| 8 | | -The tmp file carries the writer's PID so concurrent writers don't stomp |
| 9 | | -each other's scratch files mid-write. After a crash, stale `.tmp.<pid>` |
| 10 | | -files are harmless: they sit next to the real file and are swept up by |
| 11 | | -`cleanup_stale_tmp_files` from within sprints that notice them (e.g., |
| 12 | | -Sprint 04's store load path). |
| 8 | +Each tmp file carries the writer's PID plus a random 8-hex-char nonce so |
| 9 | +concurrent writers don't stomp each other's scratch files — and so PID |
| 10 | +reuse (after a writer process dies and the kernel recycles the number) |
| 11 | +can't make a stale tmp match a live peer. After a crash, stale tmps are |
| 12 | +harmless: they sit next to the real file and are swept up by |
| 13 | +`cleanup_stale_tmp_files` from the store load path. |
| 13 | 14 | """ |
| 14 | 15 | |
| 15 | 16 | from __future__ import annotations |
| 16 | 17 | |
| 17 | 18 | import os |
| 19 | +import re |
| 18 | 20 | from pathlib import Path |
| 19 | 21 | |
| 20 | 22 | |
| 21 | 23 | def write_bytes(path: Path, data: bytes) -> None: |
| 22 | 24 | """Atomically replace `path` with `data`. |
| 23 | 25 | |
| 24 | | - Writes to `<path>.tmp.<pid>`, then `os.replace()` to the final name. |
| 25 | | - Atomic on POSIX and on Windows NTFS. Parent directory must exist. |
| 26 | + Writes to `<path>.tmp.<pid>.<nonce>`, then `os.replace()` to the |
| 27 | + final name. Atomic on POSIX and on Windows NTFS. Parent directory |
| 28 | + must exist. |
| 26 | 29 | """ |
| 27 | 30 | tmp = _tmp_path(path) |
| 28 | 31 | tmp.write_bytes(data) |
@@ -38,7 +41,7 @@ def write_text(path: Path, text: str, *, encoding: str = "utf-8") -> None: |
| 38 | 41 | |
| 39 | 42 | |
| 40 | 43 | def cleanup_stale_tmp_files(directory: Path) -> list[Path]: |
| 41 | | - """Remove `*.tmp.<pid>` files in `directory` whose PID is no longer alive. |
| 44 | + """Remove tmp-suffixed files in `directory` whose writer PID is dead. |
| 42 | 45 | |
| 43 | 46 | Returns the list of files actually removed. Safe to call on a missing |
| 44 | 47 | directory (returns empty). Never removes the final target or files |
@@ -63,14 +66,33 @@ def cleanup_stale_tmp_files(directory: Path) -> list[Path]: |
| 63 | 66 | |
| 64 | 67 | # --- internals --------------------------------------------------------------- |
| 65 | 68 | |
| 69 | +# `<suffix>.tmp.<pid>.<nonce>` where nonce is 8 hex chars. The nonce |
| 70 | +# makes PID reuse harmless: a recycled PID on a stale tmp can't collide |
| 71 | +# with a live peer because the nonce differs. |
| 72 | +_TMP_RE = re.compile(r"\.tmp\.(?P<pid>\d+)\.(?P<nonce>[0-9a-f]{8})$") |
| 73 | + |
| 66 | 74 | |
| 67 | 75 | def _tmp_path(path: Path) -> Path: |
| 68 | | - return path.with_suffix(path.suffix + f".tmp.{os.getpid()}") |
| 76 | + nonce = os.urandom(4).hex() |
| 77 | + return path.with_suffix(path.suffix + f".tmp.{os.getpid()}.{nonce}") |
| 69 | 78 | |
| 70 | 79 | |
| 71 | 80 | def _tmp_pid(path: Path) -> int | None: |
| 72 | | - """Return the PID embedded in a `<name>.tmp.<pid>` filename, or None.""" |
| 81 | + """Return the PID embedded in a tmp-suffixed filename, or None. |
| 82 | + |
| 83 | + Accepts both the nonce-suffixed shape (`<name>.tmp.<pid>.<hex8>`) |
| 84 | + and the legacy nonce-less shape (`<name>.tmp.<pid>`) so sweeps on a |
| 85 | + store that spans a pre- and post-upgrade writer still clean up |
| 86 | + correctly. |
| 87 | + """ |
| 73 | 88 | name = path.name |
| 89 | + m = _TMP_RE.search(name) |
| 90 | + if m is not None: |
| 91 | + try: |
| 92 | + return int(m.group("pid")) |
| 93 | + except ValueError: |
| 94 | + return None |
| 95 | + # Legacy fallback: `<name>.tmp.<pid>` with no nonce. |
| 74 | 96 | marker = ".tmp." |
| 75 | 97 | idx = name.rfind(marker) |
| 76 | 98 | if idx == -1: |