tenseleyflow/documentlanguagemodel / 3c29221

Browse files

fix(io/atomic): nonce-suffix tmp files to survive PID reuse

Tmp-sibling filenames move from `<path>.tmp.<pid>` to
`<path>.tmp.<pid>.<hex8>`. The nonce is 4 random bytes hex-encoded
so two writers that happen to share a PID (after fork/recycling)
can&#39;t collide on the same scratch path. Legacy nonce-less tmps
still get cleaned up by the stale-sweeper — back-compat for a store
that spans a pre- and post-upgrade writer.
Authored by espadonne
SHA
3c29221dd244679e127a45dab9bbd744e2298baa
Parents
2c05e5d
Tree
3a5ecd5

2 changed files

StatusFile+-
M src/dlm/io/atomic.py 33 11
M tests/unit/test_io_atomic.py 40 0
src/dlm/io/atomic.pymodified
@@ -1,28 +1,31 @@
11
 """Atomic filesystem writes.
22
 
33
 Single entry point for "write to a tmp sibling then `os.replace` onto the
4
-final name." Three Phase-0 modules (`dlm.io.text`, `dlm.store.manifest`,
4
+final name." Three early modules (`dlm.io.text`, `dlm.store.manifest`,
55
 `dlm.store.paths`) independently grew this pattern; consolidating here
66
 gives us one place to add `fsync` / directory-sync semantics later.
77
 
8
-The tmp file carries the writer's PID so concurrent writers don't stomp
9
-each other's scratch files mid-write. After a crash, stale `.tmp.<pid>`
10
-files are harmless: they sit next to the real file and are swept up by
11
-`cleanup_stale_tmp_files` from within sprints that notice them (e.g.,
12
-Sprint 04's store load path).
8
+Each tmp file carries the writer's PID plus a random 8-hex-char nonce so
9
+concurrent writers don't stomp each other's scratch files — and so PID
10
+reuse (after a parent process dies and the kernel recycles the number)
11
+can't make a stale tmp match a live peer. After a crash, stale tmps are
12
+harmless: they sit next to the real file and are swept up by
13
+`cleanup_stale_tmp_files` from the store load path.
1314
 """
1415
 
1516
 from __future__ import annotations
1617
 
1718
 import os
19
+import re
1820
 from pathlib import Path
1921
 
2022
 
2123
 def write_bytes(path: Path, data: bytes) -> None:
2224
     """Atomically replace `path` with `data`.
2325
 
24
-    Writes to `<path>.tmp.<pid>`, then `os.replace()` to the final name.
25
-    Atomic on POSIX and on Windows NTFS. Parent directory must exist.
26
+    Writes to `<path>.tmp.<pid>.<nonce>`, then `os.replace()` to the
27
+    final name. Atomic on POSIX and on Windows NTFS. Parent directory
28
+    must exist.
2629
     """
2730
     tmp = _tmp_path(path)
2831
     tmp.write_bytes(data)
@@ -38,7 +41,7 @@ def write_text(path: Path, text: str, *, encoding: str = "utf-8") -> None:
3841
 
3942
 
4043
 def cleanup_stale_tmp_files(directory: Path) -> list[Path]:
41
-    """Remove `*.tmp.<pid>` files in `directory` whose PID is no longer alive.
44
+    """Remove tmp-suffix files in `directory` whose PID is no longer alive.
4245
 
4346
     Returns the list of files actually removed. Safe to call on a missing
4447
     directory (returns empty). Never removes the final target or files
@@ -63,14 +66,33 @@ def cleanup_stale_tmp_files(directory: Path) -> list[Path]:
6366
 
6467
 # --- internals ---------------------------------------------------------------
6568
 
69
+# `<suffix>.tmp.<pid>.<nonce>` where nonce is 8 hex chars. The nonce
70
+# makes PID reuse harmless: a recycled PID on a stale tmp can't collide
71
+# with a live peer because the nonce differs.
72
+_TMP_RE = re.compile(r"\.tmp\.(?P<pid>\d+)\.(?P<nonce>[0-9a-f]{8})$")
73
+
6674
 
6775
 def _tmp_path(path: Path) -> Path:
68
-    return path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
76
+    nonce = os.urandom(4).hex()
77
+    return path.with_suffix(path.suffix + f".tmp.{os.getpid()}.{nonce}")
6978
 
7079
 
7180
 def _tmp_pid(path: Path) -> int | None:
72
-    """Return the PID embedded in a `<name>.tmp.<pid>` filename, or None."""
81
+    """Return the PID embedded in a tmp-suffixed filename, or None.
82
+
83
+    Accepts both the nonce-suffixed shape (`<name>.tmp.<pid>.<hex8>`)
84
+    and the legacy nonce-less shape (`<name>.tmp.<pid>`) so sweeps on a
85
+    store that spans a pre- and post-upgrade writer still clean up
86
+    correctly.
87
+    """
7388
     name = path.name
89
+    m = _TMP_RE.search(name)
90
+    if m is not None:
91
+        try:
92
+            return int(m.group("pid"))
93
+        except ValueError:
94
+            return None
95
+    # Legacy fallback: `<name>.tmp.<pid>` with no nonce.
7496
     marker = ".tmp."
7597
     idx = name.rfind(marker)
7698
     if idx == -1:
tests/unit/test_io_atomic.pymodified
@@ -43,8 +43,48 @@ class TestWriteText:
4343
         assert target.read_bytes() == b"hello"
4444
 
4545
 
46
+class TestNonceSuffix:
47
+    """Audit-11 M9: tmp files carry a random nonce so PID reuse can't
48
+    collide a stale tmp with a live peer's scratch file."""
49
+
50
+    def test_tmp_path_includes_nonce(self, tmp_path: Path) -> None:
51
+        target = tmp_path / "file.bin"
52
+        tmp = atomic._tmp_path(target)
53
+        # Shape: `file.bin.tmp.<pid>.<8 hex chars>`
54
+        parts = tmp.name.rsplit(".", maxsplit=2)
55
+        assert len(parts) == 3
56
+        assert parts[1].isdigit()  # PID
57
+        assert len(parts[2]) == 8
58
+        assert all(c in "0123456789abcdef" for c in parts[2])
59
+
60
+    def test_two_calls_yield_different_tmp_names(self, tmp_path: Path) -> None:
61
+        """Same PID, two writers, two distinct tmps — nonce distinguishes."""
62
+        target = tmp_path / "file.bin"
63
+        a = atomic._tmp_path(target)
64
+        b = atomic._tmp_path(target)
65
+        assert a != b
66
+
67
+    def test_cleanup_recognises_nonce_suffixed_tmp(self, tmp_path: Path) -> None:
68
+        live = tmp_path / "real.txt.tmp.1.0a1b2c3d"
69
+        dead = tmp_path / "real.txt.tmp.99999999.deadbeef"
70
+        live.write_bytes(b"live")
71
+        dead.write_bytes(b"dead")
72
+
73
+        def fake_is_alive(pid: int) -> bool:
74
+            return pid == 1
75
+
76
+        with patch("dlm.io.atomic._is_alive", side_effect=fake_is_alive):
77
+            removed = atomic.cleanup_stale_tmp_files(tmp_path)
78
+
79
+        assert removed == [dead]
80
+        assert live.exists()
81
+        assert not dead.exists()
82
+
83
+
4684
 class TestCleanupStaleTmp:
4785
     def test_removes_only_dead_pid_tmp_files(self, tmp_path: Path) -> None:
86
+        """Legacy nonce-less tmps still get cleaned up — back-compat for
87
+        sweeps that span a pre-/post-upgrade writer on the same store."""
4888
         live = tmp_path / "real.txt.tmp.1"
4989
         dead = tmp_path / "real.txt.tmp.99999999"
5090
         live.write_bytes(b"live")