| 1 | """Cross-platform determinism golden — S18 / stretch-list F-item. |
| 2 | |
| 3 | Runs a minimal 2-probe suite against a deterministically-seeded LoRA |
| 4 | adapter on SmolLM2-135M, then compares the JSON output against a |
| 5 | platform-pinned golden file (``tests/golden/expected_<platform>.json``). |
| 6 | |
| 7 | Marked ``slow+online``: needs network for the tiny_model fixture and |
| 8 | HF weights for the adapter build. Runs in a dedicated CI matrix |
| 9 | (ubuntu-latest + macos-latest) via ``.github/workflows/ci.yml``'s |
| 10 | ``determinism-golden`` job. |
| 11 | |
| 12 | Determinism contract this test pins: |
| 13 | |
| 14 | - **Within a platform**, two runs of the same spec + adapter produce |
| 15 | byte-identical JSON (after masking timestamps + wall time). The |
| 16 | existing ``seed_everything`` wiring already holds this — this test |
| 17 | just encodes the check. |
| 18 | - **Across platforms**, numeric drift is bounded: raw metrics within |
| 19 | 1e-6, scores within 1e-4. Looser than bitwise (BLAS implementation |
| 20 | differences make bitwise impossible) but tight enough that a silent |
| 21 | algorithm change — say a ``top_k=256`` default flipped to 128 — |
| 22 | surfaces as a clear drift report on both legs. |
| 23 | |
| 24 | Regeneration recipes: |
| 25 | |
| 26 | - **Locally**: ``SWAY_UPDATE_GOLDENS=1 uv run pytest tests/integration/ |
| 27 | test_determinism_golden.py -m "slow or online"`` writes the |
| 28 | current-platform golden to ``tests/golden/expected_<platform>.json``. |
| 29 | - **Via CI**: dispatch the ``determinism-golden`` workflow with |
| 30 | ``regenerate_goldens=true``; download the uploaded artifact; commit |
| 31 | the platform file to the branch. |
| 32 | |
| 33 | First-time Linux runs SKIP with a clear regen-recipe message when the |
| 34 | ``expected_linux.json`` file is missing — generated via the CI recipe |
| 35 | above and committed as a follow-up to the opening PR. |
| 36 | """ |
| 37 | |
| 38 | from __future__ import annotations |
| 39 | |
| 40 | import json |
| 41 | import os |
| 42 | import sys |
| 43 | from collections.abc import Iterator |
| 44 | from pathlib import Path |
| 45 | |
| 46 | import pytest |
| 47 | |
| 48 | from dlm_sway.backends.hf import HuggingFaceDifferentialBackend |
| 49 | from dlm_sway.core.golden import compare_goldens, mask_variable_fields |
| 50 | from dlm_sway.core.model import ModelSpec |
| 51 | from dlm_sway.suite import report |
| 52 | from dlm_sway.suite.runner import run as run_suite |
| 53 | from dlm_sway.suite.score import compute as compute_score |
| 54 | from dlm_sway.suite.spec import SwaySpec |
| 55 | |
# Every test in this module needs network access (HF weights) and real
# model inference; it runs only in the dedicated ``determinism-golden``
# CI job described in the module docstring.
pytestmark = [
    pytest.mark.slow,
    pytest.mark.online,
    # F03 (Audit 03) — macOS CI observed a 20m stall inside
    # ``snapshot_download`` on a run that normally completes in
    # ~1m. Hard cap at 10m so a silent network hang fails as a
    # test (actionable error in the CI log) rather than a
    # workflow timeout (zero output).
    pytest.mark.timeout(600),
]
| 66 | |
| 67 | |
# tests/golden/ — parents[1] climbs from tests/integration/<this file> to tests/.
GOLDEN_DIR = Path(__file__).resolve().parents[1] / "golden"
# The checked-in minimal 2-probe suite spec run by ``_run_golden_suite``.
GOLDEN_SPEC_PATH = GOLDEN_DIR / "spec.yaml"
| 70 | |
| 71 | |
| 72 | def _platform_tag() -> str: |
| 73 | """Map ``sys.platform`` to the golden filename suffix. |
| 74 | |
| 75 | ``darwin`` → ``darwin``; ``linux`` → ``linux``. Other platforms |
| 76 | (windows, freebsd) skip the test in the caller below; the tag |
| 77 | here still returns something usable for diagnostic messages. |
| 78 | """ |
| 79 | if sys.platform.startswith("darwin"): |
| 80 | return "darwin" |
| 81 | if sys.platform.startswith("linux"): |
| 82 | return "linux" |
| 83 | return sys.platform |
| 84 | |
| 85 | |
def _golden_path() -> Path:
    """Path of the current platform's golden file under ``tests/golden/``."""
    filename = f"expected_{_platform_tag()}.json"
    return GOLDEN_DIR / filename
| 88 | |
| 89 | |
def _build_deterministic_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Build a LoRA adapter deterministically from a fixed seed.

    The goal is "bit-identical adapter given the same torch version".
    ``torch.manual_seed(0)`` + a fixed init scale achieves that; any
    drift in the ranker's per-ULP output downstream is caught by the
    golden's tolerance.

    Same seed + init shape as ``test_external_perplexity_e2e``'s
    fixture — intentionally reused so the two integration tests
    stress the same code path.

    Args:
        base_dir: Local directory holding the base model + tokenizer.
        out_dir: Destination directory for the saved adapter + tokenizer.
    """
    # Heavy deps imported lazily so merely collecting this module stays cheap.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Seeded before any random op below; determinism relies on nothing
    # between here and the lora_B loop consuming the RNG out of order.
    torch.manual_seed(0)
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    # fp32 keeps the adapter numerics platform-comparable (see module docstring).
    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
    cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,  # dropout would break run-to-run determinism
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base, cfg)
    with torch.no_grad():
        # peft initializes lora_B to zeros (identity adapter); overwrite
        # with small seeded noise so the adapter measurably diverges from
        # the base model. Iteration order of named_parameters() is part of
        # the determinism contract — it fixes how the RNG stream is consumed.
        for name, param in peft_model.named_parameters():
            if "lora_B" in name:
                param.copy_(torch.randn_like(param) * 0.05)
    peft_model.save_pretrained(str(out_dir))
    tokenizer.save_pretrained(str(out_dir))
| 126 | |
| 127 | |
@pytest.fixture(scope="module")
def golden_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Module-scoped tmp directory holding the seed-0 LoRA adapter, built once."""
    out_dir = tmp_path_factory.mktemp("golden-adapter")
    _build_deterministic_lora_adapter(tiny_model_dir, out_dir)
    return out_dir
| 133 | |
| 134 | |
@pytest.fixture(scope="module")
def golden_backend(
    tiny_model_dir: Path, golden_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Yield a CPU/fp32 differential backend over the deterministic adapter.

    Module-scoped so the model load happens once per run; closed on
    fixture teardown.
    """
    base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
    backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=golden_adapter)
    yield backend
    backend.close()
| 145 | |
| 146 | |
def _run_golden_suite(backend: HuggingFaceDifferentialBackend) -> dict[str, object]:
    """Load the golden spec, run it against ``backend``, return the report JSON.

    The spec payload is round-tripped through a temporary YAML file
    because ``load_spec`` only accepts a file path; the temp file is
    always removed afterwards so repeated runs don't accumulate
    leftovers in the system temp directory.
    """
    import tempfile

    import yaml

    from dlm_sway.suite.loader import load_spec

    with GOLDEN_SPEC_PATH.open("r", encoding="utf-8") as f:
        spec_payload = yaml.safe_load(f)

    # The model paths inside the checked-in spec are placeholders and are
    # never dereferenced: the backend is passed to the runner directly,
    # not reconstructed from the spec. We still route the payload through
    # ``load_spec`` so it gets parsed and type-checked by
    # ``SwaySpec.model_validate``.
    #
    # ``delete=False`` so ``load_spec`` can reopen the file by path on
    # platforms (Windows) where an open NamedTemporaryFile cannot be
    # reopened; the ``finally`` below does the cleanup instead.
    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp:
        yaml.safe_dump(spec_payload, tmp)
        tmp_path = Path(tmp.name)
    try:
        spec: SwaySpec = load_spec(tmp_path)
    finally:
        tmp_path.unlink(missing_ok=True)

    result = run_suite(spec, backend)
    score = compute_score(result, weights=None)
    payload = json.loads(report.to_json(result, score))
    assert isinstance(payload, dict)
    return payload
| 176 | |
| 177 | |
def _update_golden(golden_path: Path, payload: dict[str, object]) -> None:
    """Persist the masked ``payload`` at ``golden_path``.

    Also prints a one-line diagnostic to stderr so the regeneration
    recipe is obvious from the CI log.
    """
    golden_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(mask_variable_fields(payload), indent=2, sort_keys=True)
    golden_path.write_text(serialized + "\n", encoding="utf-8")
    sys.stderr.write(f"[determinism-golden] wrote {golden_path}\n")
| 185 | |
| 186 | |
def test_suite_output_matches_platform_golden(
    golden_backend: HuggingFaceDifferentialBackend,
) -> None:
    """The cross-platform determinism test.

    Two execution modes:

    - ``SWAY_UPDATE_GOLDENS=1``: writes the current run's output to
      ``tests/golden/expected_<platform>.json`` and asserts nothing.
      Use this locally or from the ``determinism-golden`` CI workflow's
      regenerate mode.
    - Default: masks variable fields, loads the platform golden, and
      asserts ``compare_goldens`` finds no diffs. Missing golden →
      SKIP with a regen recipe.
    """
    payload = _run_golden_suite(golden_backend)
    golden_path = _golden_path()

    # Regeneration mode: write and bail out without comparing anything.
    if os.environ.get("SWAY_UPDATE_GOLDENS") == "1":
        _update_golden(golden_path, payload)
        pytest.skip(f"wrote golden → {golden_path}; re-run without SWAY_UPDATE_GOLDENS")

    # First run on a new platform: no golden committed yet.
    if not golden_path.exists():
        pytest.skip(
            f"no golden for {_platform_tag()!r} at {golden_path}. "
            "Generate it by (a) running locally with SWAY_UPDATE_GOLDENS=1, or "
            "(b) dispatching the ``determinism-golden`` CI workflow with "
            "``regenerate_goldens=true`` and committing the uploaded artifact."
        )

    expected = json.loads(golden_path.read_text(encoding="utf-8"))
    masked = mask_variable_fields(payload)
    diffs = compare_goldens(masked, expected)
    if not diffs:
        return

    # Show at most 20 drifts; summarize the remainder.
    formatted = "\n".join(f" - {d}" for d in diffs[:20])
    extra = f"\n ...and {len(diffs) - 20} more" if len(diffs) > 20 else ""
    pytest.fail(
        f"{len(diffs)} golden drift(s) on {_platform_tag()}:\n{formatted}{extra}\n"
        "If the drift is a deliberate algorithm change, regenerate the "
        "golden via SWAY_UPDATE_GOLDENS=1 and commit the new file."
    )