| 1 | """S26 X3 prove-the-value: sway run on an unpacked swaypack matches |
| 2 | the pre-pack verdict bit-for-bit (modulo timestamp fields). |
| 3 | |
| 4 | This is the integration test the sprint DoD requires: pack a real |
| 5 | spec → unpack into a tmp dir → run the unpacked spec → compare |
| 6 | verdict + per-probe scores against the original run. If the round |
| 7 | trip is lossy the audit's "share an adapter audit with a coworker" |
| 8 | flow doesn't actually reproduce. |
| 9 | |
| 10 | Marked ``slow + online`` because it goes through the full runner + |
| 11 | spec-loader machinery on a non-trivial fixture. |
| 12 | """ |
| 13 | |
| 14 | from __future__ import annotations |
| 15 | |
| 16 | import json |
| 17 | import os |
| 18 | from pathlib import Path |
| 19 | |
| 20 | import pytest |
| 21 | import yaml |
| 22 | |
# Module-level pytest marks: every test collected from this file carries the
# ``slow`` and ``online`` markers, so marker-based selection can include or
# skip them as a group.
pytestmark = [pytest.mark.slow, pytest.mark.online]
| 24 | |
| 25 | |
def _write_two_prompt_spec(spec_path: Path) -> None:
    """Write a small two-prompt ``sway.yaml`` document to ``spec_path``.

    The spec declares a ``base`` and an ``ft`` model (both of kind
    ``dummy``), a default seed of 0, and one ``delta_kl`` probe named
    ``dk`` over the prompts ``alpha`` and ``beta``. It is meant to be run
    against the dummy backend, so no model load is required.

    Args:
        spec_path: Destination file; written as UTF-8 YAML with the key
            order below preserved (``sort_keys=False``).
    """
    # Build each model entry as its own dict object: sharing one dict
    # between "base" and "ft" would make the YAML dumper emit an
    # anchor/alias pair instead of two plain mappings.
    model_entries = {
        role: {"kind": "dummy", "base": "dummy-base"} for role in ("base", "ft")
    }
    delta_kl_probe = {
        "name": "dk",
        "kind": "delta_kl",
        "prompts": ["alpha", "beta"],
        # Deliberately low threshold: an easy bar for the dummy ft view.
        "assert_mean_gte": 0.001,
    }
    spec_doc = {
        "version": 1,
        "models": model_entries,
        "defaults": {"seed": 0},
        "suite": [delta_kl_probe],
    }
    rendered = yaml.safe_dump(spec_doc, sort_keys=False)
    spec_path.write_text(rendered, encoding="utf-8")
| 46 | |
| 47 | |
def _run_spec_via_dummy(spec_path: Path) -> tuple[str, list[tuple[str, str, float | None]]]:
    """Run the spec at ``spec_path`` in-process on the dummy differential backend.

    The loader, runner and scorer are called directly rather than through
    the CLI, which keeps subprocess flakiness out of the round-trip
    comparison.

    Args:
        spec_path: Path of the spec file to load and execute.

    Returns:
        ``(band, probes)``: ``band`` is ``str(score.band)`` for the scored
        run, and ``probes`` holds one ``(name, str(verdict), score)`` tuple
        per entry of ``result.probes``, in the order the runner reported
        them.
    """
    from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
    from dlm_sway.suite.loader import load_spec
    from dlm_sway.suite.runner import run as run_suite
    from dlm_sway.suite.score import compute as compute_score

    loaded_spec = load_spec(spec_path)
    dummy_backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
    try:
        outcome = run_suite(loaded_spec, dummy_backend, spec_path=str(spec_path))
        scored = compute_score(outcome)
    finally:
        # The dummy backend is not expected to expose close(); look it up
        # defensively and call it only when a callable is actually present.
        closer = getattr(dummy_backend, "close", None)
        if callable(closer):
            closer()

    per_probe = []
    for probe in outcome.probes:
        per_probe.append((probe.name, str(probe.verdict), probe.score))
    return str(scored.band), per_probe
| 73 | |
| 74 | |
def test_pack_run_round_trip_matches(tmp_path: Path) -> None:
    """Pack, unpack and re-run a spec; the results must equal the pre-pack run.

    Compares the suite band and the per-probe ``(name, verdict, score)``
    tuples of the original run with those of the unpacked copy.
    ``SWAY_NULL_CACHE_DIR`` is pointed at the unpacked null-stats directory
    when the unpack report provides one, and is restored afterwards.
    """
    from dlm_sway.cli._pack import pack_spec
    from dlm_sway.cli._unpack import unpack_swaypack

    env_key = "SWAY_NULL_CACHE_DIR"

    # Step 1: write a spec and run it; this run is the reference result.
    source_root = tmp_path / "source"
    source_root.mkdir()
    original_spec = source_root / "sway.yaml"
    _write_two_prompt_spec(original_spec)
    band_before, probes_before = _run_spec_via_dummy(original_spec)

    # Step 2: pack without the null cache. The dummy backend doesn't write
    # null-stats to disk anyway, and everything here lives under tmp_path,
    # so the user's home cache stays out of the picture.
    archive = tmp_path / "test.swaypack.tar.gz"
    packed = pack_spec(original_spec, out_path=archive, include_null_cache=False)
    assert packed.size_bytes > 0

    # Step 3: unpack into a fresh directory.
    unpacked = unpack_swaypack(archive, target_dir=tmp_path / "destination")

    # Step 4: run the unpacked spec. The env var is only overridden when the
    # pack carried a null-stats dir (not the case here), but the
    # save/restore path is exercised either way.
    saved_value = os.environ.get(env_key)
    carried_cache = unpacked.null_stats_dir
    if carried_cache is not None:
        os.environ[env_key] = str(carried_cache)
    try:
        band_after, probes_after = _run_spec_via_dummy(unpacked.spec_path)
    finally:
        if saved_value is not None:
            os.environ[env_key] = saved_value
        else:
            os.environ.pop(env_key, None)

    # Step 5: band and per-probe results must survive the round trip exactly.
    assert band_before == band_after, f"band drifted: {band_before!r} → {band_after!r}"
    assert probes_before == probes_after, (
        f"per-probe results drifted:\n pre: {probes_before}\n post: {probes_after}"
    )
| 119 | |
| 120 | |
def test_unpack_with_null_cache_sets_env_pointer(tmp_path: Path) -> None:
    """A pack that carries a null-stats cache unpacks to a usable directory.

    After packing with ``include_null_cache=True``, the unpack report's
    ``null_stats_dir`` must be non-None and contain the cached file, so a
    caller can hand that directory to ``SWAY_NULL_CACHE_DIR``.
    """
    from dlm_sway.cli._pack import pack_spec
    from dlm_sway.cli._unpack import unpack_swaypack

    xdg_key = "XDG_CACHE_HOME"

    # Lay out a fake null-stats cache holding a single entry.
    xdg_root = tmp_path / "xdg-cache"
    stats_dir = xdg_root / "dlm-sway" / "null-stats"
    stats_dir.mkdir(parents=True)
    entry_payload = json.dumps({"mean": 1.0, "std": 0.1, "runs": 3})
    (stats_dir / "abc123.json").write_text(entry_payload, encoding="utf-8")

    # Redirect the pack reader at the fake cache for the duration of the
    # pack step only, then put XDG_CACHE_HOME back the way it was.
    saved_xdg = os.environ.get(xdg_key)
    os.environ[xdg_key] = str(xdg_root)
    try:
        spec_file = tmp_path / "sway.yaml"
        _write_two_prompt_spec(spec_file)
        archive = tmp_path / "with-cache.swaypack.tar.gz"
        packed = pack_spec(spec_file, out_path=archive, include_null_cache=True)
        assert packed.null_stats_count == 1
    finally:
        if saved_xdg is not None:
            os.environ[xdg_key] = saved_xdg
        else:
            os.environ.pop(xdg_key, None)

    unpacked = unpack_swaypack(archive, target_dir=tmp_path / "u")
    assert unpacked.null_stats_dir is not None
    assert (unpacked.null_stats_dir / "abc123.json").exists()