sway Public

Watch 0 Fork 0 Star 0

Python · 5993 bytes Raw Blame History

  
        1
        """Integration regression for Audit 01's +11639σ bug.
      
        2
        
        3
        This test pins the S01 invariant end-to-end: a PEFT adapter whose
      
        4
        weights are NaN on disk must produce ``Verdict.ERROR`` from the suite,
      
        5
        not a PASS verdict at a mathematically-impossible z-score.
      
        6
        
        7
        We build a real LoRA adapter on the tiny-model fixture, then poison
      
        8
        every ``lora_A`` / ``lora_B`` safetensors shard with NaN before
      
        9
        constructing :class:`HuggingFaceDifferentialBackend` and running a
      
        10
        real ``sway run`` against it. The full chain — preflight check,
      
        11
        ``_divergence`` guards, ``safe_finalize`` — is exercised.
      
        12
        
        13
        Marked ``slow+online`` so the default fast test run skips it; the
      
        14
        audit-response CI lane runs ``pytest -m slow`` to execute it.
      
        15
        """
      
        16
        
        17
        from __future__ import annotations
      
        18
        
        19
        from pathlib import Path
      
        20
        
        21
        import pytest
      
        22
        
        23
        from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
      
        24
        from dlm_sway.core.model import ModelSpec
      
        25
        from dlm_sway.core.result import Verdict
      
        26
        from dlm_sway.suite.runner import run as run_suite
      
        27
        from dlm_sway.suite.spec import SwaySpec
      
        28
        
        29
        pytestmark = [pytest.mark.slow, pytest.mark.online]
      
        30
        
        31
        
        32
        def _build_nan_lora_adapter(base_dir: Path, out_dir: Path) -> None:
            """Write a structurally valid LoRA adapter whose LoRA weights are all NaN.

            Loads the tokenizer and causal-LM checkpoint from ``base_dir``, wraps
            the model in a rank-8 LoRA on ``q_proj`` / ``v_proj``, fills every
            parameter whose name contains ``lora_A`` or ``lora_B`` with NaN, and
            saves the adapter plus tokenizer into ``out_dir``.

            Reproduces the exact pathology the audit observed: structurally
            valid adapter config + tokenizer + safetensors layout, but the
            numeric tensors are populated with NaN. This is what ``dlm train``
            used to produce on MPS with tiny datasets (fixed upstream but still
            the canonical "broken adapter" regression case).

            Args:
                base_dir: Directory holding the base model and its tokenizer.
                out_dir: Directory the poisoned adapter and tokenizer are saved to.
            """
            import torch
            from peft import LoraConfig, get_peft_model
            from transformers import AutoModelForCausalLM, AutoTokenizer

            torch.manual_seed(0)

            tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
            if tokenizer.pad_token_id is None:
                # Fall back to EOS when the checkpoint defines no pad token.
                tokenizer.pad_token = tokenizer.eos_token
            base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)

            cfg = LoraConfig(
                r=8,
                lora_alpha=16,
                target_modules=["q_proj", "v_proj"],
                lora_dropout=0.0,
                bias="none",
                task_type="CAUSAL_LM",
            )
            peft_model = get_peft_model(base, cfg)

            # Poison: fill every LoRA parameter with NaN (in place, no autograd).
            with torch.no_grad():
                for name, param in peft_model.named_parameters():
                    if "lora_A" in name or "lora_B" in name:
                        param.fill_(float("nan"))

            # Pin the on-disk format explicitly: the sanity test in this module
            # loads ``adapter_model.safetensors``, so request safetensors output
            # rather than relying on the installed PEFT default.
            peft_model.save_pretrained(str(out_dir), safe_serialization=True)
            tokenizer.save_pretrained(str(out_dir))
      
        70
        
        71
        
        72
        @pytest.fixture(scope="module")
        def nan_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
            """Build the NaN-poisoned adapter once per module and return its directory."""
            poisoned_dir = tmp_path_factory.mktemp("nan-adapter")
            _build_nan_lora_adapter(base_dir=tiny_model_dir, out_dir=poisoned_dir)
            return poisoned_dir
      
        77
        
        78
        
        79
        def test_nan_adapter_on_disk_is_reproducibly_nan(nan_adapter: Path) -> None:
            """Guard the fixture itself: every persisted LoRA tensor must be all-NaN."""
            import torch
            from safetensors.torch import load_file

            tensors = load_file(str(nan_adapter / "adapter_model.safetensors"))
            assert tensors, "no tensors in adapter_model.safetensors"

            # Narrow to the LoRA matrices; anything else in the file is ignored.
            lora_tensors = {
                name: t for name, t in tensors.items() if "lora_A" in name or "lora_B" in name
            }
            for name, t in lora_tensors.items():
                assert torch.isnan(t).all(), f"{name} is not fully NaN — regression fixture broken"
            assert lora_tensors, "no lora_A/lora_B tensors found — adapter structure unexpected"
      
        92
        
        93
        
        94
        def test_hf_backend_preflight_rejects_nan_adapter(tiny_model_dir: Path, nan_adapter: Path) -> None:
            """``preflight_finite_check`` on the HF backend must reject the NaN adapter.

            Before S01 this ran to completion and produced JS = 13.247 nats.
            Now the check is expected to return ``(False, reason)`` where the
            reason mentions "non-finite" or "nan".
            """
            base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
            backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=nan_adapter)
            try:
                ok, reason = backend.preflight_finite_check()
                assert ok is False
                lowered = reason.lower()
                assert any(marker in lowered for marker in ("non-finite", "nan"))
            finally:
                # Always release the backend, even when an assertion fails.
                backend.close()
      
        110
        
        111
        
        112
        def test_full_suite_run_emits_error_not_pass_on_nan_adapter(
            tiny_model_dir: Path, nan_adapter: Path
        ) -> None:
            """End-to-end: a suite run against the NaN adapter yields a single ERROR probe.

            The regression this pins is the +11639σ headline the audit caught.
            """
            # Both model entries share the same base checkpoint settings; only
            # the fine-tuned entry additionally points at the poisoned adapter.
            shared_model_cfg = {
                "kind": "hf",
                "base": str(tiny_model_dir),
                "dtype": "fp32",
                "device": "cpu",
            }
            raw_spec = {
                "version": 1,
                "models": {
                    "base": dict(shared_model_cfg),
                    "ft": {**shared_model_cfg, "adapter": str(nan_adapter)},
                },
                "suite": [
                    {"name": "doc_kl", "kind": "delta_kl", "prompts": ["hello world"]},
                ],
            }
            spec = SwaySpec.model_validate(raw_spec)

            backend = HuggingFaceDifferentialBackend(base_spec=spec.models.ft, adapter_path=nan_adapter)
            try:
                result = run_suite(spec, backend, spec_path="<nan-regression>")
            finally:
                backend.close()

            probes = result.probes

            # Preflight should short-circuit: exactly one synthetic ERROR probe;
            # the configured delta_kl probe never runs.
            assert len(probes) == 1
            preflight = probes[0]
            assert preflight.kind == "preflight"
            assert preflight.verdict == Verdict.ERROR
            assert "preflight failed" in preflight.message.lower()

            # Absolutely no PASS verdict anywhere in the suite result.
            verdicts = [probe.verdict for probe in probes]
            assert Verdict.PASS not in verdicts

            # Sanity: the delta_kl probe configured in the spec did not run.
            kinds = [probe.kind for probe in probes]
            assert "delta_kl" not in kinds

1	"""Integration regression for Audit 01's +11639σ bug.
2
3	This test pins the S01 invariant end-to-end: a PEFT adapter whose
4	weights are NaN on disk must produce ``Verdict.ERROR`` from the suite,
5	not a PASS verdict at a mathematically-impossible z-score.
6
7	We build a real LoRA adapter on the tiny-model fixture, then poison
8	every ``lora_A`` / ``lora_B`` safetensors shard with NaN before
9	constructing :class:`HuggingFaceDifferentialBackend` and running a
10	real ``sway run`` against it. The full chain — preflight check,
11	``_divergence`` guards, ``safe_finalize`` — is exercised.
12
13	Marked ``slow+online`` so the default fast test run skips it; the
14	audit-response CI lane runs ``pytest -m slow`` to execute it.
15	"""
16
17	from __future__ import annotations
18
19	from pathlib import Path
20
21	import pytest
22
23	from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
24	from dlm_sway.core.model import ModelSpec
25	from dlm_sway.core.result import Verdict
26	from dlm_sway.suite.runner import run as run_suite
27	from dlm_sway.suite.spec import SwaySpec
28
29	pytestmark = [pytest.mark.slow, pytest.mark.online]
30
31
def _build_nan_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Write a structurally valid LoRA adapter whose LoRA weights are all NaN.

    Loads the tokenizer and causal-LM checkpoint from ``base_dir``, wraps
    the model in a rank-8 LoRA on ``q_proj`` / ``v_proj``, fills every
    parameter whose name contains ``lora_A`` or ``lora_B`` with NaN, and
    saves the adapter plus tokenizer into ``out_dir``.

    Reproduces the exact pathology the audit observed: structurally
    valid adapter config + tokenizer + safetensors layout, but the
    numeric tensors are populated with NaN. This is what ``dlm train``
    used to produce on MPS with tiny datasets (fixed upstream but still
    the canonical "broken adapter" regression case).

    Args:
        base_dir: Directory holding the base model and its tokenizer.
        out_dir: Directory the poisoned adapter and tokenizer are saved to.
    """
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    torch.manual_seed(0)

    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
    if tokenizer.pad_token_id is None:
        # Fall back to EOS when the checkpoint defines no pad token.
        tokenizer.pad_token = tokenizer.eos_token
    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)

    cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base, cfg)

    # Poison: fill every LoRA parameter with NaN (in place, no autograd).
    with torch.no_grad():
        for name, param in peft_model.named_parameters():
            if "lora_A" in name or "lora_B" in name:
                param.fill_(float("nan"))

    # Pin the on-disk format explicitly: the sanity test in this module
    # loads ``adapter_model.safetensors``, so request safetensors output
    # rather than relying on the installed PEFT default.
    peft_model.save_pretrained(str(out_dir), safe_serialization=True)
    tokenizer.save_pretrained(str(out_dir))
70
71
@pytest.fixture(scope="module")
def nan_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Build the NaN-poisoned adapter once per module and return its directory."""
    poisoned_dir = tmp_path_factory.mktemp("nan-adapter")
    _build_nan_lora_adapter(base_dir=tiny_model_dir, out_dir=poisoned_dir)
    return poisoned_dir
77
78
def test_nan_adapter_on_disk_is_reproducibly_nan(nan_adapter: Path) -> None:
    """Guard the fixture itself: every persisted LoRA tensor must be all-NaN."""
    import torch
    from safetensors.torch import load_file

    tensors = load_file(str(nan_adapter / "adapter_model.safetensors"))
    assert tensors, "no tensors in adapter_model.safetensors"

    # Narrow to the LoRA matrices; anything else in the file is ignored.
    lora_tensors = {
        name: t for name, t in tensors.items() if "lora_A" in name or "lora_B" in name
    }
    for name, t in lora_tensors.items():
        assert torch.isnan(t).all(), f"{name} is not fully NaN — regression fixture broken"
    assert lora_tensors, "no lora_A/lora_B tensors found — adapter structure unexpected"
92
93
def test_hf_backend_preflight_rejects_nan_adapter(tiny_model_dir: Path, nan_adapter: Path) -> None:
    """``preflight_finite_check`` on the HF backend must reject the NaN adapter.

    Before S01 this ran to completion and produced JS = 13.247 nats.
    Now the check is expected to return ``(False, reason)`` where the
    reason mentions "non-finite" or "nan".
    """
    base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
    backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=nan_adapter)
    try:
        ok, reason = backend.preflight_finite_check()
        assert ok is False
        lowered = reason.lower()
        assert any(marker in lowered for marker in ("non-finite", "nan"))
    finally:
        # Always release the backend, even when an assertion fails.
        backend.close()
110
111
def test_full_suite_run_emits_error_not_pass_on_nan_adapter(
    tiny_model_dir: Path, nan_adapter: Path
) -> None:
    """End-to-end: a suite run against the NaN adapter yields a single ERROR probe.

    The regression this pins is the +11639σ headline the audit caught.
    """
    # Both model entries share the same base checkpoint settings; only
    # the fine-tuned entry additionally points at the poisoned adapter.
    shared_model_cfg = {
        "kind": "hf",
        "base": str(tiny_model_dir),
        "dtype": "fp32",
        "device": "cpu",
    }
    raw_spec = {
        "version": 1,
        "models": {
            "base": dict(shared_model_cfg),
            "ft": {**shared_model_cfg, "adapter": str(nan_adapter)},
        },
        "suite": [
            {"name": "doc_kl", "kind": "delta_kl", "prompts": ["hello world"]},
        ],
    }
    spec = SwaySpec.model_validate(raw_spec)

    backend = HuggingFaceDifferentialBackend(base_spec=spec.models.ft, adapter_path=nan_adapter)
    try:
        result = run_suite(spec, backend, spec_path="<nan-regression>")
    finally:
        backend.close()

    probes = result.probes

    # Preflight should short-circuit: exactly one synthetic ERROR probe;
    # the configured delta_kl probe never runs.
    assert len(probes) == 1
    preflight = probes[0]
    assert preflight.kind == "preflight"
    assert preflight.verdict == Verdict.ERROR
    assert "preflight failed" in preflight.message.lower()

    # Absolutely no PASS verdict anywhere in the suite result.
    verdicts = [probe.verdict for probe in probes]
    assert Verdict.PASS not in verdicts

    # Sanity: the delta_kl probe configured in the spec did not run.
    kinds = [probe.kind for probe in probes]
    assert "delta_kl" not in kinds