1 """Integration test: ``external_perplexity`` end-to-end on a real tiny model.
2
3 Runs the probe against SmolLM2-135M with a small random LoRA so both
4 sides produce real rolling-logprob values. The test asserts three
5 contracts:
6
7 1. The probe terminates in a non-ERROR verdict (the real backend's
8 ``rolling_logprob`` returns finite logprobs on natural English prose).
9 2. The per-chunk delta array has the requested length and no NaNs.
10 3. The null-calibration path lights up the ``z_score`` field in a
11 two-probe suite (``null_adapter`` first, then ``external_perplexity``).
12
13 Marked ``slow+online``.
14 """

from __future__ import annotations

import math
from collections.abc import Iterator
from pathlib import Path

import pytest

from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
from dlm_sway.core.model import ModelSpec
from dlm_sway.core.result import Verdict
from dlm_sway.probes.base import RunContext, build_probe
from dlm_sway.suite.runner import run as run_suite
from dlm_sway.suite.spec import SwaySpec

# ``slow``: real CPU forward passes on a real checkpoint; ``online``: the
# checkpoint may need to be downloaded before the fixtures can serve it.
pytestmark = [pytest.mark.slow, pytest.mark.online]


def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Attach a LoRA to the tiny base model and randomize its ``lora_B`` weights.

    PEFT zero-initializes ``lora_B``, so a freshly attached adapter is a
    no-op; injecting small random values makes the adapted model genuinely
    diverge from the base without any training.
    """
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    torch.manual_seed(0)  # deterministic adapter weights across runs
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
    cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base, cfg)
    with torch.no_grad():
        for name, param in peft_model.named_parameters():
            if "lora_B" in name:
                # Small scale keeps the perturbation mild enough that the
                # adapted model still produces finite logprobs.
                param.copy_(torch.randn_like(param) * 0.05)
    peft_model.save_pretrained(str(out_dir))
    tokenizer.save_pretrained(str(out_dir))


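# ``_build_random_lora_adapter`` writes a standard PEFT adapter layout
# (``adapter_config.json`` plus weights); the fixtures below build it once
# and hand the directory to the backend via ``adapter_path``.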
@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Build the random LoRA adapter once and share it across the module."""
    adapter_dir = tmp_path_factory.mktemp("ext-ppl-random-adapter")
    _build_random_lora_adapter(tiny_model_dir, adapter_dir)
    return adapter_dir


@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Real HF backend over the tiny base model plus the random adapter.

    Module-scoped so the model loads once for both tests; ``close()`` runs
    at teardown to release it.
    """
    backend = HuggingFaceDifferentialBackend(
        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
        adapter_path=random_adapter,
    )
    yield backend
    backend.close()


def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """The probe returns a finite raw delta and a full, NaN-free per-chunk array."""
    probe, spec = build_probe(
        {
            "name": "ext_ppl",
            "kind": "external_perplexity",
            "max_chunks": 2,
            "chunk_chars": 512,
        }
    )
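    # ``kind`` selects the probe implementation; ``max_chunks`` and
    # ``chunk_chars`` are assumed to bound how much reference text is scored,
    # which is why the per-chunk array below is expected to have length 2.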
    ctx = RunContext(backend=hf_backend)
    result = probe.run(spec, ctx)
    assert result.verdict != Verdict.ERROR, f"probe errored: {result.message}"
    assert result.raw is not None
    assert math.isfinite(result.raw)
    per_chunk = result.evidence["per_chunk_delta"]
    assert len(per_chunk) == 2
    assert all(math.isfinite(d) for d in per_chunk)


def test_null_calibration_lights_up_zscore(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """null_adapter → external_perplexity produces a z_score end-to-end."""
    raw_spec = SwaySpec.model_validate(
        {
            "version": 1,
            # The model entries are schema placeholders: ``run_suite`` below
            # receives the already-constructed backend directly.
            "models": {
                "base": {"base": "placeholder"},
                "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
            },
            "suite": [
                # Two null seeds keep runtime bounded; the std just has to be
                # non-zero for the z-score path to engage.
                {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False},
                {
                    "name": "ext",
                    "kind": "external_perplexity",
                    "max_chunks": 2,
                    "chunk_chars": 512,
                    "assert_z_gte": -100.0,  # permissive: sign/magnitude is adapter-specific
                },
            ],
        }
    )
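    # ``run_suite`` executes probes in declaration order, so the null_adapter
    # baseline exists by the time ``external_perplexity`` computes its z-score.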
    result = run_suite(raw_spec, hf_backend)
    assert len(result.probes) == 2
    null_result = result.probes[0]
    ext_result = result.probes[1]
    assert null_result.verdict == Verdict.PASS
    assert ext_result.verdict != Verdict.ERROR
    assert ext_result.z_score is not None, (
        f"external_perplexity should have z-scored against the null baseline; "
        f"evidence={ext_result.evidence}, message={ext_result.message}"
    )
    assert math.isfinite(ext_result.z_score)
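

# These integration tests are typically deselected in default runs via their
# markers; select them explicitly with, e.g.:
#   pytest -m "slow and online" <path-to-this-test-file>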