| 1 | """Integration test: ``external_perplexity`` end-to-end on a real tiny model. |
| 2 | |
| 3 | Runs the probe against SmolLM2-135M with a small random LoRA so both |
| 4 | sides produce real rolling-logprob values. The test asserts three |
| 5 | contracts: |
| 6 | |
| 7 | 1. The probe terminates in a non-ERROR verdict (the real backend's |
| 8 | ``rolling_logprob`` returns finite logprobs on natural English prose). |
| 9 | 2. The per-chunk delta array has the requested length and no NaNs. |
| 10 | 3. The null-calibration path lights up the ``z_score`` field in a |
| 11 | two-probe suite (``null_adapter`` first, then ``external_perplexity``). |
| 12 | |
| 13 | Marked ``slow+online``. |
| 14 | """ |
| 15 | |
| 16 | from __future__ import annotations |
| 17 | |
| 18 | import math |
| 19 | from collections.abc import Iterator |
| 20 | from pathlib import Path |
| 21 | |
| 22 | import numpy as np |
| 23 | import pytest |
| 24 | |
| 25 | from dlm_sway.backends.hf import HuggingFaceDifferentialBackend |
| 26 | from dlm_sway.core.model import ModelSpec |
| 27 | from dlm_sway.core.result import Verdict |
| 28 | from dlm_sway.probes.base import RunContext, build_probe |
| 29 | from dlm_sway.suite.runner import run as run_suite |
| 30 | from dlm_sway.suite.spec import SwaySpec |
| 31 | |
# Every test in this module loads a real model: opt in with -m "slow and online".
pytestmark = [pytest.mark.slow, pytest.mark.online]
| 33 | |
| 34 | |
def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Build and save a tiny LoRA adapter with randomised B matrices.

    Loads the base model from ``base_dir``, attaches a rank-8 LoRA on the
    attention q/v projections, fills the ``lora_B`` weights with small random
    values (seeded for reproducibility), and writes adapter + tokenizer to
    ``out_dir``.
    """
    # Heavy third-party imports stay function-local so collecting this module
    # does not pull in torch/peft/transformers.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    torch.manual_seed(0)
    tok = AutoTokenizer.from_pretrained(str(base_dir))
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
    lora_cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    wrapped = get_peft_model(model, lora_cfg)
    # Randomise the lora_B weights so the adapter measurably perturbs the base
    # model's outputs (small scale keeps perplexity deltas finite).
    with torch.no_grad():
        for pname, tensor in wrapped.named_parameters():
            if "lora_B" in pname:
                tensor.copy_(torch.randn_like(tensor) * 0.05)
    wrapped.save_pretrained(str(out_dir))
    tok.save_pretrained(str(out_dir))
| 60 | |
| 61 | |
@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Module-scoped temp directory containing a freshly built random LoRA adapter."""
    target = tmp_path_factory.mktemp("ext-ppl-random-adapter")
    _build_random_lora_adapter(tiny_model_dir, target)
    return target
| 67 | |
| 68 | |
@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Yield a CPU/fp32 differential backend over the tiny base + random adapter."""
    base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
    backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=random_adapter)
    yield backend
    # Module teardown: let the backend release whatever it holds open.
    backend.close()
| 79 | |
| 80 | |
def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """Contracts 1 & 2: non-ERROR verdict, finite raw score, full finite delta array.

    Fix: the original asserted finiteness of ``per_chunk_delta`` twice — once
    with a pure-Python ``math.isfinite`` loop and again with a redundant numpy
    re-check of the exact same array. One check suffices.
    """
    probe, spec = build_probe(
        {
            "name": "ext_ppl",
            "kind": "external_perplexity",
            "max_chunks": 2,
            "chunk_chars": 512,
        }
    )
    ctx = RunContext(backend=hf_backend)
    result = probe.run(spec, ctx)
    assert result.verdict != Verdict.ERROR, f"probe errored: {result.message}"
    assert result.raw is not None
    assert math.isfinite(result.raw)
    per_chunk = result.evidence["per_chunk_delta"]
    # One delta per requested chunk (max_chunks=2 above), and no NaN/inf values.
    assert len(per_chunk) == 2
    assert all(math.isfinite(d) for d in per_chunk)
| 99 | |
| 100 | |
def test_null_calibration_lights_up_zscore(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """null_adapter → external_perplexity produces a z_score end-to-end."""
    suite_config = {
        "version": 1,
        "models": {
            "base": {"base": "placeholder"},
            "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
        },
        "suite": [
            # Two null seeds keep runtime bounded; std just has to be
            # non-zero for the z-score path to engage.
            {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False},
            {
                "name": "ext",
                "kind": "external_perplexity",
                "max_chunks": 2,
                "chunk_chars": 512,
                "assert_z_gte": -100.0,  # permissive — sign/magnitude is adapter-specific
            },
        ],
    }
    result = run_suite(SwaySpec.model_validate(suite_config), hf_backend)
    assert len(result.probes) == 2
    null_result, ext_result = result.probes
    assert null_result.verdict == Verdict.PASS
    assert ext_result.verdict != Verdict.ERROR
    assert ext_result.z_score is not None, (
        f"external_perplexity should have z-scored against null baseline; "
        f"evidence={ext_result.evidence}, message={ext_result.message}"
    )
    assert math.isfinite(ext_result.z_score)