Python · 8853 bytes Raw Blame History
1 """Cross-platform determinism golden — S18 / stretch-list F-item.
2
3 Runs a minimal 2-probe suite against a deterministically-seeded LoRA
4 adapter on SmolLM2-135M, then compares the JSON output against a
5 platform-pinned golden file (``tests/golden/expected_<platform>.json``).
6
7 Marked ``slow+online``: needs network for the tiny_model fixture and
8 HF weights for the adapter build. Runs in a dedicated CI matrix
9 (ubuntu-latest + macos-latest) via ``.github/workflows/ci.yml``'s
10 ``determinism-golden`` job.
11
12 Determinism contract this test pins:
13
14 - **Within a platform**, two runs of the same spec + adapter produce
15 byte-identical JSON (after masking timestamps + wall time). The
16 existing ``seed_everything`` wiring already holds this — this test
17 just encodes the check.
18 - **Across platforms**, numeric drift is bounded: raw metrics within
19 1e-6, scores within 1e-4. Looser than bitwise (BLAS implementation
20 differences make bitwise impossible) but tight enough that a silent
21 algorithm change — say a ``top_k=256`` default flipped to 128 —
22 surfaces as a clear drift report on both legs.
23
24 Regeneration recipes:
25
26 - **Locally**: ``SWAY_UPDATE_GOLDENS=1 uv run pytest tests/integration/
27 test_determinism_golden.py -m "slow or online"`` writes the
28 current-platform golden to ``tests/golden/expected_<platform>.json``.
29 - **Via CI**: dispatch the ``determinism-golden`` workflow with
30 ``regenerate_goldens=true``; download the uploaded artifact; commit
31 the platform file to the branch.
32
33 First-time Linux runs SKIP with a clear regen-recipe message when the
34 ``expected_linux.json`` file is missing — generated via the CI recipe
35 above and committed as a follow-up to the opening PR.
36 """
37
38 from __future__ import annotations
39
40 import json
41 import os
42 import sys
43 from collections.abc import Iterator
44 from pathlib import Path
45
46 import pytest
47
48 from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
49 from dlm_sway.core.golden import compare_goldens, mask_variable_fields
50 from dlm_sway.core.model import ModelSpec
51 from dlm_sway.suite import report
52 from dlm_sway.suite.runner import run as run_suite
53 from dlm_sway.suite.score import compute as compute_score
54 from dlm_sway.suite.spec import SwaySpec
55
# Marks applied to every test collected from this module.
pytestmark = [
    pytest.mark.slow,
    pytest.mark.online,
    # F03 (Audit 03) — macOS CI observed a 20m stall inside
    # ``snapshot_download`` on a run that normally completes in
    # ~1m. Hard cap at 10m so a silent network hang fails as a
    # test (actionable error in the CI log) rather than a
    # workflow timeout (zero output).
    pytest.mark.timeout(600),
]


# ``tests/golden/`` — this file lives one level below ``tests/``
# (parents[1]), and holds the checked-in suite spec plus the
# per-platform expected outputs.
GOLDEN_DIR: Path = Path(__file__).resolve().parents[1] / "golden"
# The suite spec executed by ``_run_golden_suite`` below.
GOLDEN_SPEC_PATH: Path = GOLDEN_DIR / "spec.yaml"
70
71
72 def _platform_tag() -> str:
73 """Map ``sys.platform`` to the golden filename suffix.
74
75 ``darwin`` → ``darwin``; ``linux`` → ``linux``. Other platforms
76 (windows, freebsd) skip the test in the caller below; the tag
77 here still returns something usable for diagnostic messages.
78 """
79 if sys.platform.startswith("darwin"):
80 return "darwin"
81 if sys.platform.startswith("linux"):
82 return "linux"
83 return sys.platform
84
85
def _golden_path() -> Path:
    """Per-platform golden file: ``tests/golden/expected_<platform>.json``."""
    filename = f"expected_{_platform_tag()}.json"
    return GOLDEN_DIR / filename
88
89
def _build_deterministic_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Build a LoRA adapter deterministically from a fixed seed.

    The goal is "bit-identical adapter given the same torch version".
    ``torch.manual_seed(0)`` + a fixed init scale achieves that; any
    drift in the ranker's per-ULP output downstream is caught by the
    golden's tolerance.

    Same seed + init shape as ``test_external_perplexity_e2e``'s
    fixture — intentionally reused so the two integration tests
    stress the same code path.

    Args:
        base_dir: Local directory holding the base model + tokenizer
            (the ``tiny_model_dir`` fixture's snapshot).
        out_dir: Destination directory; receives the saved PEFT
            adapter and a copy of the tokenizer.
    """
    # Heavy ML deps imported lazily so collecting this module stays
    # cheap when the slow/online markers are deselected.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Seed up front so every RNG draw below (including the
    # ``randn_like`` overwrite) is reproducible run-to-run.
    torch.manual_seed(0)
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
    if tokenizer.pad_token_id is None:
        # Base tokenizer may ship without a pad token; reuse EOS.
        tokenizer.pad_token = tokenizer.eos_token
    # fp32 on purpose — presumably to keep numerics platform-stable
    # per the module's determinism contract (NOTE(review): confirm).
    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
    cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,  # dropout off: inference must be deterministic
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base, cfg)
    with torch.no_grad():
        for name, param in peft_model.named_parameters():
            if "lora_B" in name:
                # NOTE(review): PEFT reportedly zero-inits lora_B (a
                # no-op adapter); the fixed-seed overwrite gives the
                # adapter a real effect — confirm against the pinned
                # peft version.
                param.copy_(torch.randn_like(param) * 0.05)
    peft_model.save_pretrained(str(out_dir))
    tokenizer.save_pretrained(str(out_dir))
126
127
@pytest.fixture(scope="module")
def golden_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Module-scoped dir containing the deterministically-built LoRA adapter."""
    target = tmp_path_factory.mktemp("golden-adapter")
    _build_deterministic_lora_adapter(tiny_model_dir, target)
    return target
133
134
@pytest.fixture(scope="module")
def golden_backend(
    tiny_model_dir: Path, golden_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Module-scoped differential backend over the deterministic adapter.

    Built on CPU with fp32 weights (see the module docstring's
    determinism contract). The try/finally is pytest's recommended
    "safe teardown" shape: it guarantees ``backend.close()`` runs even
    when an exception is thrown into the generator at the yield point
    (e.g. during interrupted finalization) — the original
    yield-then-close sequence skipped cleanup in that case.
    """
    backend = HuggingFaceDifferentialBackend(
        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
        adapter_path=golden_adapter,
    )
    try:
        yield backend
    finally:
        backend.close()
145
146
def _run_golden_suite(backend: HuggingFaceDifferentialBackend) -> dict[str, object]:
    """Load the golden spec, run it, return the JSON payload as a dict.

    Args:
        backend: Pre-built differential backend; the runner uses it
            directly rather than reconstructing one from the spec.

    Returns:
        The report JSON (``report.to_json``) parsed back into a dict.
    """
    import tempfile

    import yaml

    from dlm_sway.suite.loader import load_spec

    # ``load_spec`` takes a file path, so round-trip the checked-in
    # spec through a tmp file.
    #
    # NOTE(review): the original comment claimed the fixture's real
    # model paths are substituted in here, but nothing patches
    # ``spec_payload`` — the placeholder paths apparently validate
    # as-is, which is all that matters since the backend is passed in
    # directly (see below). Confirm the substitution comment was stale.
    with GOLDEN_SPEC_PATH.open("r", encoding="utf-8") as f:
        spec_payload = yaml.safe_load(f)

    # Paths don't actually matter to the runner once the backend is
    # built — the backend is passed in directly, not reconstructed
    # from the spec. But ``load_spec`` + ``SwaySpec.model_validate``
    # parse and type-check, so we still need a valid-looking spec.
    #
    # ``delete=False`` so the file survives the ``with`` long enough
    # for ``load_spec`` to reopen it; explicit UTF-8 keeps the dump
    # independent of the platform's locale encoding.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".yaml", delete=False, encoding="utf-8"
    ) as tmp:
        yaml.safe_dump(spec_payload, tmp)
        tmp_path = Path(tmp.name)

    try:
        spec: SwaySpec = load_spec(tmp_path)
    finally:
        # Bug fix: the original never removed this file (``delete=False``
        # with no unlink), leaking one tmp file per run.
        tmp_path.unlink(missing_ok=True)

    result = run_suite(spec, backend)
    score = compute_score(result, weights=None)
    payload = json.loads(report.to_json(result, score))
    assert isinstance(payload, dict)
    return payload
176
177
def _update_golden(golden_path: Path, payload: dict[str, object]) -> None:
    """Mask ``payload``'s variable fields and write it to ``golden_path``,
    then log a diagnostic line so the CI log shows what was written."""
    golden_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(mask_variable_fields(payload), indent=2, sort_keys=True)
    golden_path.write_text(serialized + "\n", encoding="utf-8")
    print(f"[determinism-golden] wrote {golden_path}", file=sys.stderr)
185
186
def test_suite_output_matches_platform_golden(
    golden_backend: HuggingFaceDifferentialBackend,
) -> None:
    """The cross-platform determinism test.

    Two execution modes:

    - ``SWAY_UPDATE_GOLDENS=1``: writes the current run's output to
      ``tests/golden/expected_<platform>.json`` and asserts nothing.
      Use this locally or from the ``determinism-golden`` CI workflow's
      regenerate mode.
    - Default: masks variable fields, loads the platform golden, and
      asserts ``compare_goldens`` finds no diffs. Missing golden →
      SKIP with a regen recipe.
    """
    golden_path = _golden_path()
    payload = _run_golden_suite(golden_backend)

    # Regenerate mode: write and bail out without asserting anything.
    if os.environ.get("SWAY_UPDATE_GOLDENS") == "1":
        _update_golden(golden_path, payload)
        pytest.skip(f"wrote golden → {golden_path}; re-run without SWAY_UPDATE_GOLDENS")

    # First run on a new platform: no golden committed yet.
    if not golden_path.exists():
        pytest.skip(
            f"no golden for {_platform_tag()!r} at {golden_path}. "
            "Generate it by (a) running locally with SWAY_UPDATE_GOLDENS=1, or "
            "(b) dispatching the ``determinism-golden`` CI workflow with "
            "``regenerate_goldens=true`` and committing the uploaded artifact."
        )

    expected = json.loads(golden_path.read_text(encoding="utf-8"))
    diffs = compare_goldens(mask_variable_fields(payload), expected)
    if not diffs:
        return

    # Cap the drift report at 20 entries; summarize the rest.
    shown = [f"  - {d}" for d in diffs[:20]]
    hidden = len(diffs) - 20
    extra = f"\n  ...and {hidden} more" if hidden > 0 else ""
    message = (
        f"{len(diffs)} golden drift(s) on {_platform_tag()}:\n"
        + "\n".join(shown)
        + extra
        + "\nIf the drift is a deliberate algorithm change, regenerate the "
        "golden via SWAY_UPDATE_GOLDENS=1 and commit the new file."
    )
    pytest.fail(message)
227 )