Python · 6036 bytes Raw Blame History
1 """S26 X3 prove-the-value: sway run on an unpacked swaypack matches
2 the pre-pack verdict bit-for-bit (modulo timestamp fields).
3
4 This is the integration test the sprint DoD requires: pack a real
5 spec → unpack into a tmp dir → run the unpacked spec → compare
6 verdict + per-probe scores against the original run. If the round
7 trip is lossy the audit's "share an adapter audit with a coworker"
8 flow doesn't actually reproduce.
9
10 Marked ``slow + online`` because it goes through the full runner +
11 spec-loader machinery on a non-trivial fixture.
12 """
13
14 from __future__ import annotations
15
16 import json
17 import os
18 from pathlib import Path
19
20 import pytest
21 import yaml
22
# Module-level pytest marks: pytest applies the ``slow`` and ``online``
# markers listed here to every test collected from this file.
pytestmark = [pytest.mark.slow, pytest.mark.online]
24
25
def _write_two_prompt_spec(spec_path: Path) -> None:
    """Serialize a small two-prompt ``delta_kl`` spec to ``spec_path``.

    Both model entries use the ``dummy`` kind, so the spec is intended to
    be runnable without loading a real model. The mapping is dumped with
    ``sort_keys=False`` so keys appear in the order written below, and the
    file is written as UTF-8.
    """
    # Two separate dict objects (not one shared instance) so the YAML
    # dumper emits two plain mappings instead of an anchor plus an alias.
    model_entries = {
        "base": {"kind": "dummy", "base": "dummy-base"},
        "ft": {"kind": "dummy", "base": "dummy-base"},
    }
    delta_kl_probe = {
        "name": "dk",
        "kind": "delta_kl",
        "prompts": ["alpha", "beta"],
        # Easy bar for the dummy ft view.
        "assert_mean_gte": 0.001,
    }
    document = {
        "version": 1,
        "models": model_entries,
        "defaults": {"seed": 0},
        "suite": [delta_kl_probe],
    }
    rendered = yaml.safe_dump(document, sort_keys=False)
    spec_path.write_text(rendered, encoding="utf-8")
46
47
def _run_spec_via_dummy(spec_path: Path) -> tuple[str, list[tuple[str, str, float | None]]]:
    """Run the spec stored at ``spec_path`` on a dummy differential backend.

    The suite runner is invoked in-process instead of through the CLI, so
    no subprocess is involved.

    Returns:
        A ``(band, probes)`` pair: ``band`` is ``str(score.band)`` of the
        score computed from the run, and ``probes`` holds one
        ``(name, str(verdict), score)`` tuple per probe in the result.
    """
    from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
    from dlm_sway.suite.loader import load_spec
    from dlm_sway.suite.runner import run as run_suite
    from dlm_sway.suite.score import compute as compute_score

    loaded_spec = load_spec(spec_path)
    dummy_backend = DummyDifferentialBackend(
        base=DummyResponses(),
        ft=DummyResponses(),
    )
    try:
        outcome = run_suite(loaded_spec, dummy_backend, spec_path=str(spec_path))
        suite_score = compute_score(outcome)
    finally:
        # Only call ``close`` when the backend actually offers a callable
        # one; this runs whether or not the suite raised.
        closer = getattr(dummy_backend, "close", None)
        if callable(closer):
            closer()

    per_probe = [
        (probe.name, str(probe.verdict), probe.score)
        for probe in outcome.probes
    ]
    return str(suite_score.band), per_probe
73
74
def test_pack_run_round_trip_matches(tmp_path: Path) -> None:
    """Pack → unpack → run gives the same band and per-probe results.

    The spec is run once from its source location, packed without the
    null cache, unpacked into a fresh directory, and run again. The band
    and the ``(name, verdict, score)`` tuples of both runs must be equal.
    When the unpack report carries a null-stats directory,
    ``SWAY_NULL_CACHE_DIR`` points at it for the second run only.
    """
    from unittest import mock

    from dlm_sway.cli._pack import pack_spec
    from dlm_sway.cli._unpack import unpack_swaypack

    # 1) Build a spec + run it as the "source of truth."
    src_dir = tmp_path / "source"
    src_dir.mkdir()
    src_spec = src_dir / "sway.yaml"
    _write_two_prompt_spec(src_spec)
    pre_band, pre_probes = _run_spec_via_dummy(src_spec)

    # 2) Pack it. include_null_cache=False because the dummy backend
    #    doesn't write null-stats to disk anyway, and the test runs
    #    against an isolated tmp_path so we shouldn't pollute the
    #    user's home cache either way.
    pack_path = tmp_path / "test.swaypack.tar.gz"
    pack_report = pack_spec(src_spec, out_path=pack_path, include_null_cache=False)
    assert pack_report.size_bytes > 0

    # 3) Unpack into a fresh tmp dir.
    unpack_dst = tmp_path / "destination"
    unpack_report = unpack_swaypack(pack_path, target_dir=unpack_dst)

    # 4) Run the unpacked spec, with SWAY_NULL_CACHE_DIR pointing at
    #    the unpacked dir if the pack carried one (it didn't here,
    #    but exercise the env-var honoring path).
    null_stats_dir = unpack_report.null_stats_dir
    env_overrides = (
        {} if null_stats_dir is None else {"SWAY_NULL_CACHE_DIR": str(null_stats_dir)}
    )
    # patch.dict puts os.environ back to its previous contents on exit,
    # including when the run raises, so no manual save/restore is needed.
    with mock.patch.dict(os.environ, env_overrides):
        post_band, post_probes = _run_spec_via_dummy(unpack_report.spec_path)

    # 5) Verdict + per-probe scores must round-trip exactly.
    assert pre_band == post_band, f"band drifted: {pre_band!r} -> {post_band!r}"
    assert pre_probes == post_probes, (
        f"per-probe results drifted:\n  pre:  {pre_probes}\n  post: {post_probes}"
    )
119
120
def test_unpack_with_null_cache_sets_env_pointer(tmp_path: Path) -> None:
    """If the pack DID carry a null-stats cache, the unpack report's
    null_stats_dir is non-None and contains the packed cache file, so
    the caller can hand that directory to ``SWAY_NULL_CACHE_DIR``.

    A one-file fake cache is created under a private ``XDG_CACHE_HOME``,
    the spec is packed with ``include_null_cache=True``, and the archive
    is then unpacked into a separate directory.
    """
    from unittest import mock

    from dlm_sway.cli._pack import pack_spec
    from dlm_sway.cli._unpack import unpack_swaypack

    # Build a fake null-stats cache under a throwaway XDG cache root.
    xdg_root = tmp_path / "xdg-cache"
    fake_cache = xdg_root / "dlm-sway" / "null-stats"
    fake_cache.mkdir(parents=True)
    (fake_cache / "abc123.json").write_text(
        json.dumps({"mean": 1.0, "std": 0.1, "runs": 3}),
        encoding="utf-8",
    )

    spec_path = tmp_path / "sway.yaml"
    _write_two_prompt_spec(spec_path)
    out = tmp_path / "with-cache.swaypack.tar.gz"

    # Redirect the pack reader at the fake cache for the pack call only.
    # patch.dict restores os.environ on exit, even if pack_spec raises.
    with mock.patch.dict(os.environ, {"XDG_CACHE_HOME": str(xdg_root)}):
        report = pack_spec(spec_path, out_path=out, include_null_cache=True)
    assert report.null_stats_count == 1

    target = tmp_path / "u"
    unpack_report = unpack_swaypack(out, target_dir=target)
    assert unpack_report.null_stats_dir is not None
    assert (unpack_report.null_stats_dir / "abc123.json").exists()