tenseleyflow/sway / 00b870b

Browse files

tests/integration: external_perplexity end-to-end on tiny model (slow+online)

Authored by espadonne
SHA
00b870bb5b10f40ccd231077b46cc2858e9768c8
Parents
7e1eeec
Tree
ae38650

1 changed file

Status | File | + | -
A tests/integration/test_external_perplexity_e2e.py 134 0
tests/integration/test_external_perplexity_e2e.py (added)
@@ -0,0 +1,134 @@
1
+"""Integration test: ``external_perplexity`` end-to-end on a real tiny model.
2
+
3
+Runs the probe against SmolLM2-135M with a small random LoRA so both
4
+sides produce real rolling-logprob values. The test asserts three
5
+contracts:
6
+
7
+1. The probe terminates in a non-ERROR verdict (the real backend's
8
+   ``rolling_logprob`` returns finite logprobs on natural English prose).
9
+2. The per-chunk delta array has the requested length and no NaNs.
10
+3. The null-calibration path lights up the ``z_score`` field in a
11
+   two-probe suite (``null_adapter`` first, then ``external_perplexity``).
12
+
13
+Marked ``slow+online``.
14
+"""
15
+
16
+from __future__ import annotations
17
+
18
+import math
19
+from collections.abc import Iterator
20
+from pathlib import Path
21
+
22
+import numpy as np
23
+import pytest
24
+
25
+from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
26
+from dlm_sway.core.model import ModelSpec
27
+from dlm_sway.core.result import Verdict
28
+from dlm_sway.probes.base import RunContext, build_probe
29
+from dlm_sway.suite.runner import run as run_suite
30
+from dlm_sway.suite.spec import SwaySpec
31
+
32
+pytestmark = [pytest.mark.slow, pytest.mark.online]
33
+
34
+
35
def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Write a randomly perturbed LoRA adapter for the model in ``base_dir``.

    Loads the tokenizer and an fp32 causal LM from ``base_dir``, wraps the
    model in a rank-8 LoRA over ``q_proj``/``v_proj``, overwrites every
    ``lora_B`` tensor with seeded Gaussian noise scaled by 0.05, and saves
    both the adapter and the tokenizer into ``out_dir``.
    """
    # Imported inside the helper rather than at module level.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    source = str(base_dir)
    destination = str(out_dir)

    # Seed before anything that may draw random numbers so the adapter
    # weights are reproducible from run to run.
    torch.manual_seed(0)

    tok = AutoTokenizer.from_pretrained(source)
    if tok.pad_token_id is None:
        # Fall back to EOS when the checkpoint ships without a pad token.
        tok.pad_token = tok.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(source, torch_dtype=torch.float32)
    lora_config = LoraConfig(
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"],
        r=8,
        lora_alpha=16,
        lora_dropout=0.0,
        bias="none",
    )
    wrapped = get_peft_model(base_model, lora_config)

    # NOTE(review): presumably needed because a freshly initialised LoRA
    # leaves the base model's outputs unchanged — confirm against PEFT docs.
    with torch.no_grad():
        for param_name, weight in wrapped.named_parameters():
            if "lora_B" not in param_name:
                continue
            weight.copy_(torch.randn_like(weight) * 0.05)

    wrapped.save_pretrained(destination)
    tok.save_pretrained(destination)
60
+
61
+
62
@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Build the random LoRA adapter once per module and return its directory."""
    out_dir = tmp_path_factory.mktemp("ext-ppl-random-adapter")
    _build_random_lora_adapter(base_dir=tiny_model_dir, out_dir=out_dir)
    return out_dir
67
+
68
+
69
@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Yield a CPU/fp32 differential backend for the tiny model, then close it.

    The backend pairs the base model in ``tiny_model_dir`` with the adapter
    directory produced by the ``random_adapter`` fixture and is shared by
    every test in this module.
    """
    base_spec = ModelSpec(
        base=str(tiny_model_dir),
        kind="hf",
        dtype="fp32",
        device="cpu",
    )
    differential = HuggingFaceDifferentialBackend(
        base_spec=base_spec,
        adapter_path=random_adapter,
    )
    yield differential
    # Teardown: runs once the module's tests have finished with the backend.
    differential.close()
79
+
80
+
81
def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """The probe returns a finite raw value and two finite per-chunk deltas."""
    requested_chunks = 2
    probe_config = {
        "name": "ext_ppl",
        "kind": "external_perplexity",
        "max_chunks": requested_chunks,
        "chunk_chars": 512,
    }
    probe, spec = build_probe(probe_config)
    outcome = probe.run(spec, RunContext(backend=hf_backend))

    # Contract 1: the probe must not end in an ERROR verdict.
    assert outcome.verdict != Verdict.ERROR, f"probe errored: {outcome.message}"
    assert outcome.raw is not None
    assert math.isfinite(outcome.raw)

    # Contract 2: one finite delta per requested chunk.
    deltas = outcome.evidence["per_chunk_delta"]
    assert len(deltas) == requested_chunks
    assert all(math.isfinite(delta) for delta in deltas)
    assert np.all(np.isfinite(np.asarray(deltas, dtype=np.float64)))
99
+
100
+
101
def test_null_calibration_lights_up_zscore(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """null_adapter → external_perplexity produces a z_score end-to-end."""
    # NOTE(review): the model entries look like placeholders only, since the
    # live backend is handed to the runner directly — confirm against the
    # runner's contract.
    models = {
        "base": {"base": "placeholder"},
        "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
    }
    # Two null seeds keep runtime bounded; std just has to be
    # non-zero for the z-score path to engage.
    null_probe = {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False}
    ext_probe = {
        "name": "ext",
        "kind": "external_perplexity",
        "max_chunks": 2,
        "chunk_chars": 512,
        # permissive — sign/magnitude is adapter-specific
        "assert_z_gte": -100.0,
    }
    suite_spec = SwaySpec.model_validate(
        {
            "version": 1,
            "models": models,
            "suite": [null_probe, ext_probe],
        }
    )

    report = run_suite(suite_spec, hf_backend)

    assert len(report.probes) == 2
    null_outcome = report.probes[0]
    ext_outcome = report.probes[1]
    assert null_outcome.verdict == Verdict.PASS
    assert ext_outcome.verdict != Verdict.ERROR
    assert ext_outcome.z_score is not None, (
        f"external_perplexity should have z-scored against null baseline; "
        f"evidence={ext_outcome.evidence}, message={ext_outcome.message}"
    )
    assert math.isfinite(ext_outcome.z_score)