"""Integration test: PEFT ``disable_adapter`` actually changes logits. This is the load-bearing sanity check for the whole differential design. If a future ``peft`` release subtly breaks the disable-context semantics, sway's KL / SIS / ablation probes would all silently report zero signal. We catch that here, before the rest of the test battery runs. The test builds a random-init LoRA adapter on a tiny model so no network dependency beyond the base model snapshot itself. """ from __future__ import annotations from pathlib import Path import pytest from dlm_sway.backends.hf import HuggingFaceDifferentialBackend from dlm_sway.core.model import ModelSpec pytestmark = [pytest.mark.slow, pytest.mark.online] def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None: """Construct a LoRA adapter with random-init weights on ``base_dir``. The weights are kept small so the toggle-delta is clear but the adapter is structurally valid (correct ``adapter_config.json``, tokenizer files, safetensors layout). """ import torch from peft import LoraConfig, get_peft_model from transformers import AutoModelForCausalLM, AutoTokenizer torch.manual_seed(0) tokenizer = AutoTokenizer.from_pretrained(str(base_dir)) if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32) cfg = LoraConfig( r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.0, bias="none", task_type="CAUSAL_LM", ) peft_model = get_peft_model(base, cfg) # Explicitly scale lora_B out of its PEFT-default zero-init so the # adapter actually changes outputs. Real training does this via # gradients; we do it with a scaled normal. with torch.no_grad(): for name, param in peft_model.named_parameters(): if "lora_B" in name: param.copy_(torch.randn_like(param) * 0.05) peft_model.save_pretrained(str(out_dir)) tokenizer.save_pretrained(str(out_dir)) @pytest.fixture(scope="module") def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path: adapter_dir = tmp_path_factory.mktemp("random-adapter") _build_random_lora_adapter(tiny_model_dir, adapter_dir) return adapter_dir def test_disable_adapter_changes_logits(tiny_model_dir: Path, random_adapter: Path) -> None: """The keystone invariant: base view ≠ ft view on the same prompt.""" import numpy as np backend = HuggingFaceDifferentialBackend( base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"), adapter_path=random_adapter, ) try: prompt = "The quick brown fox" with backend.as_base() as b: base_dist = b.next_token_dist(prompt, top_k=32) with backend.as_finetuned() as f: ft_dist = f.next_token_dist(prompt, top_k=32) # Top-k indices may shift under the adapter; take a safe shared # subset instead of asserting identical ordering. 
        assert not np.array_equal(base_dist.token_ids, ft_dist.token_ids) or not np.allclose(
            base_dist.logprobs, ft_dist.logprobs, atol=1e-5
        ), "adapter toggle did not change next-token distribution"
    finally:
        backend.close()


def test_roundtrip_toggle_restores_base(tiny_model_dir: Path, random_adapter: Path) -> None:
    """as_base → as_finetuned → as_base yields a stable base view."""
    import numpy as np

    backend = HuggingFaceDifferentialBackend(
        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
        adapter_path=random_adapter,
    )
    try:
        prompt = "hello"
        with backend.as_base() as b:
            first = b.next_token_dist(prompt, top_k=16).logprobs
        with backend.as_finetuned() as f:
            f.next_token_dist(prompt, top_k=16)  # toggle
        with backend.as_base() as b:
            second = b.next_token_dist(prompt, top_k=16).logprobs
        np.testing.assert_allclose(first, second, rtol=1e-5, atol=1e-6)
    finally:
        backend.close()


def test_disable_re_enable_bit_identical_logits(tiny_model_dir: Path, random_adapter: Path) -> None:
    """B15 mitigation: ft → base → ft produces bit-identical ft logits.

    Subtle state corruption inside ``disable_adapter()`` (e.g. a wrong
    re-attach order on context exit) could shift the second ft pass by an
    amount small enough for ``assert_allclose`` to tolerate but still
    visible to ``assert_array_equal``. Pin the stricter claim on fp32 +
    CPU so the test stays deterministic across hosts.
    """
    import numpy as np

    backend = HuggingFaceDifferentialBackend(
        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
        adapter_path=random_adapter,
    )
    try:
        prompt = "the disable_adapter contract is"
        with backend.as_finetuned() as f:
            first = np.array(f.next_token_dist(prompt, top_k=32).logprobs, copy=True)
        with backend.as_base() as b:
            b.next_token_dist(prompt, top_k=32)  # toggle through base
        with backend.as_finetuned() as f:
            second = np.array(f.next_token_dist(prompt, top_k=32).logprobs, copy=True)
        np.testing.assert_array_equal(
            first,
            second,
            err_msg=(
                "ft logits drifted across a base toggle: disable_adapter "
                "exit may have corrupted the adapter state (B15)"
            ),
        )
    finally:
        backend.close()