@@ -0,0 +1,193 @@ |
| 1 | +"""N2 AdapterAblation — the sway signature primitive. |
| 2 | + |
| 3 | +Scales the LoRA additive term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} |
| 4 | +and measures the mean divergence from the base distribution at each |
| 5 | +step. Fits a monotonic response curve; reports three shape metrics: |
| 6 | + |
| 7 | +- **linearity**: R² of a linear fit on ``(λ, mean_div)``. High means |
| 8 | + the adapter's effect scales predictably; low means it's "all or |
| 9 | + nothing" (degenerate). |
| 10 | +- **saturation_lambda**: the smallest λ at which divergence reaches |
| 11 | + 90% of the λ=1 value. Too low (<0.3) means the adapter fires at |
| 12 | + partial strength — fragile. Too high (>1.0) means the adapter is |
| 13 | + under-trained. |
| 14 | +- **overshoot**: divergence at λ=1.25 divided by λ=1.0. >1.05 is the |
| 15 | + healthy "pushing past 1 still moves the model" signal. An overshoot |
| 16 | + below 1.0 suggests collapse. |
| 17 | + |
| 18 | +This is the single novel primitive that no generic eval harness |
| 19 | +provides — sway's position next to the adapter math makes it possible. |
| 20 | + |
| 21 | +Requires the backend to implement |
| 22 | +:class:`~dlm_sway.core.scoring.ScalableDifferentialBackend`. Probes |
| 23 | +SKIP gracefully on backends that don't. |
| 24 | +""" |
| 25 | + |
| 26 | +from __future__ import annotations |
| 27 | + |
| 28 | +from typing import Literal |
| 29 | + |
| 30 | +import numpy as np |
| 31 | +from pydantic import Field |
| 32 | + |
| 33 | +from dlm_sway.core.result import ProbeResult, Verdict |
| 34 | +from dlm_sway.core.scoring import ScalableDifferentialBackend |
| 35 | +from dlm_sway.probes._divergence import Divergence, divergence |
| 36 | +from dlm_sway.probes.base import Probe, ProbeSpec, RunContext |
| 37 | + |
| 38 | + |
class AdapterAblationSpec(ProbeSpec):
    """Configuration for the λ-sweep adapter-ablation probe."""

    kind: Literal["adapter_ablation"] = "adapter_ablation"
    # Prompts to evaluate; the probe returns ERROR if this is left empty.
    prompts: list[str] = Field(default_factory=list)
    # LoRA scale factors to sweep. At least 3 points are required so the
    # response-curve metrics (linearity/saturation/overshoot) are meaningful.
    lambdas: list[float] = Field(
        default_factory=lambda: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
        min_length=3,
    )
    # Divergence measure between the λ=min reference and each scaled run.
    divergence: Divergence = "js"
    # Top-k truncation for next-token distributions; None → fall back to ctx.top_k.
    top_k: int | None = None
    # Healthy-band thresholds for the three shape metrics (see module docstring).
    assert_linearity_gte: float = 0.85
    assert_saturation_between: tuple[float, float] = (0.3, 1.05)
    assert_overshoot_gte: float = 1.02
| 51 | + |
| 52 | + |
class AdapterAblationProbe(Probe):
    """λ-sweep ablation: scale the LoRA term, fit the divergence response curve."""

    kind = "adapter_ablation"
    spec_cls = AdapterAblationSpec
    category = "ablation"

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        """Sweep λ over ``spec.lambdas``, measure mean divergence from the
        λ=min reference per λ, and grade the three shape metrics
        (linearity, saturation_lambda, overshoot) against the spec's bands.

        Returns ERROR when no prompts are configured and SKIP when the
        backend cannot scale the adapter.
        """
        assert isinstance(spec, AdapterAblationSpec)
        if not spec.prompts:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.ERROR,
                score=None,
                message="no prompts provided",
            )
        if not isinstance(ctx.backend, ScalableDifferentialBackend):
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message=(
                    "backend does not implement ScalableDifferentialBackend — "
                    "adapter ablation requires LoRA-scale access"
                ),
            )

        top_k = spec.top_k if spec.top_k is not None else ctx.top_k

        # Reference distributions at λ=min (adapter scaled to zero → base).
        # They are invariant across the λ sweep, so compute them once per
        # prompt instead of once per (λ, prompt) pair as before — saves
        # (len(lambdas) - 1) * len(prompts) backend calls.
        # NOTE(review): assumes next_token_dist is deterministic for a given
        # (scale, prompt); the original per-λ recomputation relied on the
        # same property for its λ=0 divergence to be zero.
        lam_zero = min(spec.lambdas)
        with ctx.backend.as_scaled_adapter(lam_zero) as ref:
            ref_dists = [ref.next_token_dist(p, top_k=top_k) for p in spec.prompts]

        # Mean divergence from the reference at each λ, in spec order.
        per_lambda: list[float] = []
        for lam in spec.lambdas:
            with ctx.backend.as_scaled_adapter(lam) as scaled:
                divs_for_lam = [
                    divergence(
                        ref_dist,
                        scaled.next_token_dist(prompt, top_k=top_k),
                        kind=spec.divergence,
                    )
                    for prompt, ref_dist in zip(spec.prompts, ref_dists, strict=True)
                ]
            per_lambda.append(float(np.mean(divs_for_lam)))

        lambdas_arr = np.asarray(spec.lambdas, dtype=np.float64)
        divs_arr = np.asarray(per_lambda, dtype=np.float64)

        linearity = _r_squared(lambdas_arr, divs_arr)
        saturation_lambda = _saturation_lambda(lambdas_arr, divs_arr)
        overshoot = _overshoot(lambdas_arr, divs_arr)

        # Pass when all three shape metrics land in their healthy bands.
        sat_lo, sat_hi = spec.assert_saturation_between
        ok_lin = linearity >= spec.assert_linearity_gte
        ok_sat = saturation_lambda is not None and sat_lo <= saturation_lambda <= sat_hi
        ok_over = overshoot >= spec.assert_overshoot_gte
        verdict = Verdict.PASS if (ok_lin and ok_sat and ok_over) else Verdict.FAIL

        # Soft score: linearity and overshoot contribute continuously;
        # saturation gives fixed credit for being in/out of band.
        lin_score = max(0.0, min(1.0, linearity / max(spec.assert_linearity_gte, 1e-6)))
        over_score = max(0.0, min(1.0, (overshoot - 1.0) / 0.2))
        sat_score = 1.0 if ok_sat else 0.3
        score = 0.4 * lin_score + 0.3 * sat_score + 0.3 * over_score

        return ProbeResult(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=linearity,
            evidence={
                "lambdas": spec.lambdas,
                "mean_divergence_per_lambda": per_lambda,
                "linearity": linearity,
                "saturation_lambda": saturation_lambda,
                "overshoot": overshoot,
                "passed_linearity": ok_lin,
                "passed_saturation": ok_sat,
                "passed_overshoot": ok_over,
                "weight": spec.weight,
            },
            message=(
                f"R²={linearity:.2f}, sat_λ={saturation_lambda:.2f} "
                f"({'in' if ok_sat else 'out of'} band), overshoot={overshoot:.2f}"
                if saturation_lambda is not None
                else f"R²={linearity:.2f}, saturation undetected, overshoot={overshoot:.2f}"
            ),
        )
| 137 | + ) |
| 138 | + |
| 139 | + |
| 140 | +def _r_squared(x: np.ndarray, y: np.ndarray) -> float: |
| 141 | + """Coefficient of determination for a linear fit of ``y`` on ``x``.""" |
| 142 | + if x.size < 2: |
| 143 | + return 0.0 |
| 144 | + xm = float(x.mean()) |
| 145 | + ym = float(y.mean()) |
| 146 | + denom = float(((x - xm) ** 2).sum()) |
| 147 | + if denom == 0.0: |
| 148 | + return 0.0 |
| 149 | + slope = float(((x - xm) * (y - ym)).sum()) / denom |
| 150 | + intercept = ym - slope * xm |
| 151 | + y_pred = slope * x + intercept |
| 152 | + ss_res = float(((y - y_pred) ** 2).sum()) |
| 153 | + ss_tot = float(((y - ym) ** 2).sum()) |
| 154 | + if ss_tot == 0.0: |
| 155 | + return 1.0 |
| 156 | + return max(0.0, 1.0 - ss_res / ss_tot) |
| 157 | + |
| 158 | + |
| 159 | +def _saturation_lambda(lambdas: np.ndarray, divs: np.ndarray) -> float | None: |
| 160 | + """Smallest λ ≤ 1.0 at which divergence reaches 90% of div(λ=1).""" |
| 161 | + # Locate the index of λ=1.0 (or the closest entry ≤ 1.0). |
| 162 | + candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0] |
| 163 | + if candidates.size == 0: |
| 164 | + # Fall back to the largest λ ≤ 1.0. |
| 165 | + mask = lambdas <= 1.0 |
| 166 | + if not mask.any(): |
| 167 | + return None |
| 168 | + idx1 = int(np.argmax(lambdas * mask)) |
| 169 | + else: |
| 170 | + idx1 = int(candidates[0]) |
| 171 | + target = 0.9 * float(divs[idx1]) |
| 172 | + if target <= 0: |
| 173 | + return None |
| 174 | + for lam, d in zip(lambdas[: idx1 + 1], divs[: idx1 + 1], strict=False): |
| 175 | + if d >= target: |
| 176 | + return float(lam) |
| 177 | + return None |
| 178 | + |
| 179 | + |
| 180 | +def _overshoot(lambdas: np.ndarray, divs: np.ndarray) -> float: |
| 181 | + """``div(λ_max) / div(λ=1)``. Returns 1.0 if λ_max ≤ 1.0.""" |
| 182 | + idx_max = int(np.argmax(lambdas)) |
| 183 | + candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0] |
| 184 | + if candidates.size == 0: |
| 185 | + return 1.0 |
| 186 | + idx1 = int(candidates[0]) |
| 187 | + if idx_max == idx1: |
| 188 | + return 1.0 |
| 189 | + d1 = float(divs[idx1]) |
| 190 | + dmax = float(divs[idx_max]) |
| 191 | + if d1 <= 0: |
| 192 | + return 1.0 |
| 193 | + return dmax / d1 |