| 1 | """Cross-platform determinism golden — S18 / stretch-list F-item. |
| 2 | |
| 3 | Runs a minimal 2-probe suite against a deterministically-seeded LoRA |
| 4 | adapter on SmolLM2-135M, then compares the JSON output against a |
| 5 | platform-pinned golden file (``tests/golden/expected_<platform>.json``). |
| 6 | |
| 7 | Marked ``slow+online``: needs network for the tiny_model fixture and |
| 8 | HF weights for the adapter build. Runs in a dedicated CI matrix |
| 9 | (ubuntu-latest + macos-latest) via ``.github/workflows/ci.yml``'s |
| 10 | ``determinism-golden`` job. |
| 11 | |
| 12 | Determinism contract this test pins: |
| 13 | |
| 14 | - **Within a platform**, two runs of the same spec + adapter produce |
| 15 | byte-identical JSON (after masking timestamps + wall time). The |
| 16 | existing ``seed_everything`` wiring already holds this — this test |
| 17 | just encodes the check. |
| 18 | - **Across platforms**, numeric drift is bounded: raw metrics within |
| 19 | 1e-6, scores within 1e-4. Looser than bitwise (BLAS implementation |
| 20 | differences make bitwise impossible) but tight enough that a silent |
| 21 | algorithm change — say a ``top_k=256`` default flipped to 128 — |
| 22 | surfaces as a clear drift report on both legs. |
| 23 | |
| 24 | Regeneration recipes: |
| 25 | |
| 26 | - **Locally**: ``SWAY_UPDATE_GOLDENS=1 uv run pytest tests/integration/ |
| 27 | test_determinism_golden.py -m "slow or online"`` writes the |
| 28 | current-platform golden to ``tests/golden/expected_<platform>.json``. |
| 29 | - **Via CI**: dispatch the ``determinism-golden`` workflow with |
| 30 | ``regenerate_goldens=true``; download the uploaded artifact; commit |
| 31 | the platform file to the branch. |
| 32 | |
| 33 | First-time Linux runs SKIP with a clear regen-recipe message when the |
| 34 | ``expected_linux.json`` file is missing — generated via the CI recipe |
| 35 | above and committed as a follow-up to the opening PR. |
| 36 | """ |
| 37 | |
| 38 | from __future__ import annotations |
| 39 | |
| 40 | import json |
| 41 | import os |
| 42 | import sys |
| 43 | from collections.abc import Iterator |
| 44 | from pathlib import Path |
| 45 | |
| 46 | import pytest |
| 47 | |
| 48 | from dlm_sway.backends.hf import HuggingFaceDifferentialBackend |
| 49 | from dlm_sway.core.golden import compare_goldens, mask_variable_fields |
| 50 | from dlm_sway.core.model import ModelSpec |
| 51 | from dlm_sway.suite import report |
| 52 | from dlm_sway.suite.runner import run as run_suite |
| 53 | from dlm_sway.suite.score import compute as compute_score |
| 54 | from dlm_sway.suite.spec import SwaySpec |
| 55 | |
# Every test in this module needs network access (HF weights) and real
# model inference; it runs only in the dedicated ``determinism-golden``
# CI job described in the module docstring.
pytestmark = [
    pytest.mark.slow,
    pytest.mark.online,
    # F03 (Audit 03) — macOS CI observed a 20m stall inside
    # ``snapshot_download`` on a run that normally completes in
    # ~1m. Hard cap at 10m so a silent network hang fails as a
    # test (actionable error in the CI log) rather than a
    # workflow timeout (zero output).
    pytest.mark.timeout(600),
]
| 66 | |
| 67 | |
# tests/golden/ — parents[1] climbs from tests/integration/<this file> to tests/.
GOLDEN_DIR = Path(__file__).resolve().parents[1] / "golden"
# The checked-in minimal 2-probe suite spec run by ``_run_golden_suite``.
GOLDEN_SPEC_PATH = GOLDEN_DIR / "spec.yaml"
| 70 | |
| 71 | |
| 72 | def _platform_tag() -> str: |
| 73 | """Map ``sys.platform`` to the golden filename suffix. |
| 74 | |
| 75 | ``darwin`` → ``darwin``; ``linux`` → ``linux``. Other platforms |
| 76 | (windows, freebsd) skip the test in the caller below; the tag |
| 77 | here still returns something usable for diagnostic messages. |
| 78 | """ |
| 79 | if sys.platform.startswith("darwin"): |
| 80 | return "darwin" |
| 81 | if sys.platform.startswith("linux"): |
| 82 | return "linux" |
| 83 | return sys.platform |
| 84 | |
| 85 | |
def _golden_path() -> Path:
    """Path of the current platform's golden file under ``tests/golden/``."""
    filename = f"expected_{_platform_tag()}.json"
    return GOLDEN_DIR / filename
| 88 | |
| 89 | |
def _build_deterministic_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Build a LoRA adapter deterministically from a fixed seed.

    The goal is "bit-identical adapter given the same torch version".
    ``torch.manual_seed(0)`` + a fixed init scale achieves that; any
    drift in the ranker's per-ULP output downstream is caught by the
    golden's tolerance.

    Same seed + init shape as ``test_external_perplexity_e2e``'s
    fixture — intentionally reused so the two integration tests
    stress the same code path.

    Args:
        base_dir: Local directory holding the base model + tokenizer.
        out_dir: Destination directory for the saved adapter + tokenizer.
    """
    # Heavy deps imported lazily so merely collecting this module stays cheap.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Seeded before any random op below; determinism relies on nothing
    # between here and the lora_B loop consuming the RNG out of order.
    torch.manual_seed(0)
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    # fp32 keeps the adapter numerics platform-comparable (see module docstring).
    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
    cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,  # dropout would break run-to-run determinism
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base, cfg)
    with torch.no_grad():
        # peft initializes lora_B to zeros (identity adapter); overwrite
        # with small seeded noise so the adapter measurably diverges from
        # the base model. Iteration order of named_parameters() is part of
        # the determinism contract — it fixes how the RNG stream is consumed.
        for name, param in peft_model.named_parameters():
            if "lora_B" in name:
                param.copy_(torch.randn_like(param) * 0.05)
    peft_model.save_pretrained(str(out_dir))
    tokenizer.save_pretrained(str(out_dir))
| 126 | |
| 127 | |
@pytest.fixture(scope="module")
def golden_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Module-scoped tmp directory holding the seed-0 LoRA adapter, built once."""
    out_dir = tmp_path_factory.mktemp("golden-adapter")
    _build_deterministic_lora_adapter(tiny_model_dir, out_dir)
    return out_dir
| 133 | |
| 134 | |
@pytest.fixture(scope="module")
def golden_backend(
    tiny_model_dir: Path, golden_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Yield a CPU/fp32 differential backend over the deterministic adapter.

    Module-scoped so the model load happens once per run; closed on
    fixture teardown.
    """
    base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
    backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=golden_adapter)
    yield backend
    backend.close()
| 145 | |
| 146 | |
def _run_golden_suite(backend: HuggingFaceDifferentialBackend) -> dict[str, object]:
    """Load the golden spec, run it against ``backend``, return the report JSON.

    The spec payload is round-tripped through a temporary YAML file
    because ``load_spec`` only accepts a file path; the temp file is
    always removed afterwards so repeated runs don't accumulate
    leftovers in the system temp directory.
    """
    import tempfile

    import yaml

    from dlm_sway.suite.loader import load_spec

    with GOLDEN_SPEC_PATH.open("r", encoding="utf-8") as f:
        spec_payload = yaml.safe_load(f)

    # The model paths inside the checked-in spec are placeholders and are
    # never dereferenced: the backend is passed to the runner directly,
    # not reconstructed from the spec. We still route the payload through
    # ``load_spec`` so it gets parsed and type-checked by
    # ``SwaySpec.model_validate``.
    #
    # ``delete=False`` so ``load_spec`` can reopen the file by path on
    # platforms (Windows) where an open NamedTemporaryFile cannot be
    # reopened; the ``finally`` below does the cleanup instead.
    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp:
        yaml.safe_dump(spec_payload, tmp)
        tmp_path = Path(tmp.name)
    try:
        spec: SwaySpec = load_spec(tmp_path)
    finally:
        tmp_path.unlink(missing_ok=True)

    result = run_suite(spec, backend)
    score = compute_score(result, weights=None)
    payload = json.loads(report.to_json(result, score))
    assert isinstance(payload, dict)
    return payload
| 176 | |
| 177 | |
def _update_golden(golden_path: Path, payload: dict[str, object]) -> None:
    """Persist the masked ``payload`` at ``golden_path``.

    Also prints a one-line diagnostic to stderr so the regeneration
    recipe is obvious from the CI log.
    """
    golden_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(mask_variable_fields(payload), indent=2, sort_keys=True)
    golden_path.write_text(serialized + "\n", encoding="utf-8")
    sys.stderr.write(f"[determinism-golden] wrote {golden_path}\n")
| 185 | |
| 186 | |
def test_suite_output_matches_platform_golden(
    golden_backend: HuggingFaceDifferentialBackend,
) -> None:
    """The cross-platform determinism test.

    Two execution modes:

    - ``SWAY_UPDATE_GOLDENS=1``: writes the current run's output to
      ``tests/golden/expected_<platform>.json`` and asserts nothing.
      Use this locally or from the ``determinism-golden`` CI workflow's
      regenerate mode.
    - Default: masks variable fields, loads the platform golden, and
      asserts ``compare_goldens`` finds no diffs. Missing golden →
      SKIP with a regen recipe.
    """
    payload = _run_golden_suite(golden_backend)
    golden_path = _golden_path()

    # Regeneration mode: write and bail out without comparing anything.
    if os.environ.get("SWAY_UPDATE_GOLDENS") == "1":
        _update_golden(golden_path, payload)
        pytest.skip(f"wrote golden → {golden_path}; re-run without SWAY_UPDATE_GOLDENS")

    # First run on a new platform: no golden committed yet.
    if not golden_path.exists():
        pytest.skip(
            f"no golden for {_platform_tag()!r} at {golden_path}. "
            "Generate it by (a) running locally with SWAY_UPDATE_GOLDENS=1, or "
            "(b) dispatching the ``determinism-golden`` CI workflow with "
            "``regenerate_goldens=true`` and committing the uploaded artifact."
        )

    expected = json.loads(golden_path.read_text(encoding="utf-8"))
    masked = mask_variable_fields(payload)
    diffs = compare_goldens(masked, expected)
    if not diffs:
        return

    # Show at most 20 drifts; summarize the remainder.
    formatted = "\n".join(f" - {d}" for d in diffs[:20])
    extra = f"\n ...and {len(diffs) - 20} more" if len(diffs) > 20 else ""
    pytest.fail(
        f"{len(diffs)} golden drift(s) on {_platform_tag()}:\n{formatted}{extra}\n"
        "If the drift is a deliberate algorithm change, regenerate the "
        "golden via SWAY_UPDATE_GOLDENS=1 and commit the new file."
    )