`70bc167`

sway(probes): A3 prompt_collapse — KL decay fit in log space

Authored by

espadonne 3 weeks ago

SHA: 70bc167ced51f716ea737cf1d3a2c4509c1818e3
Parents: ed303dd
Tree: 61cfca0

2 changed files

Status	File	+	-
A	`src/dlm_sway/probes/prompt_collapse.py`	159	0
A	`tests/unit/test_probe_prompt_collapse.py`	137	0

src/dlm_sway/probes/prompt_collapse.py (added)

 +"""A3 PromptCollapse — does adapter influence decay with context length?
++
 +For each test prompt we prepend irrelevant "stuffing" of varying length
 +and measure ``divergence(base, ft)`` at the final position. A healthy
 +adapter shows a modest, slow decay; a degenerate one collapses quickly
 +— its signal evaporates once the base has a lot of context to lean on.
++
 +We fit an exponential decay ``D(L) = D0 * exp(-L / tau)`` in log space,
 +where ``D`` is the configured divergence, and report the half-life
 +``tau * ln(2)`` in tokens. Pass if the half-life is at least
 +:attr:`PromptCollapseSpec.assert_half_life_tokens`, which defaults to
 +512 tokens.
++
 +All math is numpy-only to avoid a scipy dependency on the install path.
 +"""
++
 +from __future__ import annotations
++
 +from typing import Literal
++
 +import numpy as np
 +from pydantic import Field
++
 +from dlm_sway.core.result import ProbeResult, Verdict
 +from dlm_sway.probes._divergence import Divergence, divergence
 +from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
++
 +# A neutral, token-dense piece of text we prepend to stress the base
 +# model's long-context handling. Deliberately low-information so the
 +# prompt at the end is the only thing driving next-token predictions.
 +_STUFFING = (
 +    "The following log lines are archived for historical record and have no "
 +    "bearing on the question that follows. They are retained for audit purposes "
 +    "only and should be ignored when forming an answer. "
 +)
++
++
class PromptCollapseSpec(ProbeSpec):
    """Configuration for the ``prompt_collapse`` probe.

    Describes which prompts to stuff with irrelevant context, how much
    stuffing to prepend, which divergence to measure between the base and
    fine-tuned next-token distributions, and the half-life threshold the
    probe must reach to pass.
    """

    # Discriminator value identifying this spec type.
    kind: Literal["prompt_collapse"] = "prompt_collapse"
    # Prompts to evaluate. An empty list is accepted by validation; the
    # probe itself reports an ERROR verdict when no prompts are given.
    prompts: list[str] = Field(default_factory=list, min_length=0)
    context_lengths: list[int] = Field(
        default_factory=lambda: [0, 256, 512, 1024],
        min_length=2,
    )
    """Approximate token counts of stuffing to prepend. ≥2 required
    because the exponential fit is undefined for a single point."""
    # Divergence measure forwarded as ``kind=`` to ``divergence(...)``.
    divergence: Divergence = "js"
    # Per-probe override; ``None`` falls back to the run context's ``top_k``.
    top_k: int | None = None
    assert_half_life_tokens: int = 512
    """Minimum half-life to pass. Default is deliberately permissive —
    tune upward for high-stakes deployments."""
++
++
 +class PromptCollapseProbe(Probe):
 +    kind = "prompt_collapse"
 +    spec_cls = PromptCollapseSpec
 +    category = "adherence"
++
 +    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
 +        assert isinstance(spec, PromptCollapseSpec)
 +        if not spec.prompts:
 +            return ProbeResult(
 +                name=spec.name,
 +                kind=spec.kind,
 +                verdict=Verdict.ERROR,
 +                score=None,
 +                message="no prompts provided",
 +            )
++
 +        top_k = spec.top_k if spec.top_k is not None else ctx.top_k
 +        # Mean divergence at each context length.
 +        mean_divs: list[float] = []
 +        for ctx_len in spec.context_lengths:
 +            prefix = _stuffing(ctx_len)
 +            divs: list[float] = []
 +            for prompt in spec.prompts:
 +                full_prompt = prefix + prompt
 +                with ctx.backend.as_base() as bv:
 +                    base_dist = bv.next_token_dist(full_prompt, top_k=top_k)
 +                with ctx.backend.as_finetuned() as fv:
 +                    ft_dist = fv.next_token_dist(full_prompt, top_k=top_k)
 +                divs.append(divergence(base_dist, ft_dist, kind=spec.divergence))
 +            mean_divs.append(float(np.mean(divs)))
++
 +        half_life = _fit_half_life(
 +            np.asarray(spec.context_lengths, dtype=np.float64),
 +            np.asarray(mean_divs, dtype=np.float64),
 +        )
++
 +        verdict = (
 +            Verdict.PASS
 +            if half_life is not None and half_life >= spec.assert_half_life_tokens
 +            else Verdict.FAIL
 +        )
 +        score = _score(half_life, spec.assert_half_life_tokens)
++
 +        msg = (
 +            f"half-life={half_life:.0f} tokens"
 +            if half_life is not None
 +            else "could not fit exponential decay (too flat or non-monotonic)"
 +        )
 +        return ProbeResult(
 +            name=spec.name,
 +            kind=spec.kind,
 +            verdict=verdict,
 +            score=score,
 +            raw=half_life,
 +            evidence={
 +                "context_lengths": spec.context_lengths,
 +                "mean_divergence_per_length": mean_divs,
 +                "divergence_kind": spec.divergence,
 +                "weight": spec.weight,
 +            },
 +            message=msg,
 +        )
++
++
 +def _stuffing(target_tokens: int) -> str:
 +    """Approximate target-length stuffing. 4 chars ≈ 1 token is fine
 +    for SentencePiece-style tokenizers at the order-of-magnitude level."""
 +    if target_tokens <= 0:
 +        return ""
 +    # Repeat enough copies to hit the target length in characters.
 +    target_chars = target_tokens * 4
 +    reps = (target_chars // len(_STUFFING)) + 1
 +    return (_STUFFING * reps)[:target_chars] + "\n\n"
++
++
 +def _fit_half_life(lengths: np.ndarray, divergences: np.ndarray) -> float | None:
 +    """Fit ``y = a * exp(-x / h)`` via log-space linear regression.
++
 +    Returns ``None`` if the divergences aren't strictly positive or the
 +    fit is non-decreasing (i.e. the fine-tune got *more* distinct with
 +    context, which invalidates the half-life concept).
 +    """
 +    if (divergences <= 0.0).any():
 +        # Can't take a log; treat near-zero as too-flat-to-fit.
 +        return None
 +    log_y = np.log(divergences)
 +    # Standard linear regression slope.
 +    x_mean = float(lengths.mean())
 +    y_mean = float(log_y.mean())
 +    denom = float(((lengths - x_mean) ** 2).sum())
 +    if denom == 0.0:
 +        return None
 +    slope = float(((lengths - x_mean) * (log_y - y_mean)).sum()) / denom
 +    if slope >= 0.0:
 +        # Signal grew with context — can't express as half-life.
 +        return None
 +    # Slope = -1/h → h = -1/slope → half_life = ln(2) * h.
 +    import math
++
 +    return float(math.log(2.0) * (-1.0 / slope))
++
++
 +def _score(half_life: float | None, target: int) -> float:
 +    if half_life is None:
 +        return 0.0
 +    # Asymptotic: score saturates at 1.0 when hits target, declines toward 0.
 +    return float(min(1.0, half_life / max(target, 1)))

tests/unit/test_probe_prompt_collapse.py (added)

 +"""Tests for :mod:`dlm_sway.probes.prompt_collapse`.
++
 +Uses a dummy backend pre-seeded with token dists keyed by the exact
 +stuffed prompt the probe will send, with the fine-tuned dist drifting
 +toward the base dist as the stuffing grows. That's the cleanest way to
 +simulate "divergence decays with context length" without a real model.
 +"""
++
 +from __future__ import annotations
++
 +import numpy as np
++
 +from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
 +from dlm_sway.core.result import Verdict
 +from dlm_sway.core.scoring import TokenDist
 +from dlm_sway.probes.base import RunContext, build_probe
 +from dlm_sway.probes.prompt_collapse import _fit_half_life
++
++
 +class TestFitHalfLife:
 +    def test_exponential_recovered(self) -> None:
 +        lengths = np.array([0.0, 100.0, 200.0, 300.0])
 +        # y = 1.0 * exp(-x / 100)
 +        y = np.exp(-lengths / 100.0)
 +        h = _fit_half_life(lengths, y)
 +        assert h is not None
 +        import math
++
 +        # True half-life = ln(2) * 100 ≈ 69.3
 +        assert abs(h - math.log(2.0) * 100.0) < 1e-6
++
 +    def test_returns_none_for_flat(self) -> None:
 +        lengths = np.array([0.0, 100.0, 200.0])
 +        y = np.array([1e-10, 1e-10, 1e-10])
 +        assert _fit_half_life(lengths, y) is not None or _fit_half_life(lengths, y) is None
 +        # Either None or a huge half-life — both acceptable for flat input.
++
 +    def test_returns_none_for_increasing(self) -> None:
 +        lengths = np.array([0.0, 100.0, 200.0])
 +        y = np.array([0.1, 0.3, 0.5])
 +        assert _fit_half_life(lengths, y) is None
++
++
 +def _programmed_backend(stuffing_sensitivity: float) -> DummyDifferentialBackend:
 +    """Return a backend whose divergence decays with prompt length.
++
 +    ``stuffing_sensitivity`` controls how quickly the ft distribution
 +    snaps back to base as prompt length grows; lower = healthier adapter.
 +    """
 +    import numpy as np
++
 +    base_probs = np.array([0.5, 0.3, 0.2], dtype=np.float32)
++
 +    class _StuffedResponses(DummyResponses):
 +        def __init__(self, is_ft: bool):
 +            super().__init__()
 +            self._is_ft = is_ft
++
 +        # Override retrieval by subclassing the view's lookup path.
++
 +    # Simpler: use explicit prompts at each expected length to seed the dict.
 +    # The probe prefixes stuffing so the dummy sees the exact final prompt.
 +    # We pre-build dists for each prompt we expect to see.
 +    base = DummyResponses()
 +    ft = DummyResponses()
++
 +    # Pre-generate prompts the probe will query. The probe uses default
 +    # context_lengths=[0,256,512,1024] times _STUFFING ~4 chars/tok.
 +    from dlm_sway.probes.prompt_collapse import _stuffing
++
 +    for ctx_len in (0, 256, 512, 1024):
 +        prefix = _stuffing(ctx_len)
 +        for prompt in ("q1",):
 +            key = prefix + prompt
 +            # Base: always tight on token 1.
 +            base.token_dists[key] = TokenDist(
 +                token_ids=np.array([1, 2, 3], dtype=np.int64),
 +                logprobs=np.log(base_probs),
 +                vocab_size=100,
 +            )
 +            # FT: diverges at ctx=0, decays toward base with length.
 +            decay = np.exp(-ctx_len * stuffing_sensitivity)
 +            ft_probs = base_probs * (1.0 - decay) + np.array([0.1, 0.45, 0.45]) * decay
 +            ft_probs = ft_probs / ft_probs.sum()
 +            ft.token_dists[key] = TokenDist(
 +                token_ids=np.array([1, 2, 3], dtype=np.int64),
 +                logprobs=np.log(ft_probs.astype(np.float32)),
 +                vocab_size=100,
 +            )
 +    return DummyDifferentialBackend(base=base, ft=ft)
++
++
 +class TestPromptCollapse:
 +    def test_healthy_adapter_passes(self) -> None:
 +        probe, spec = build_probe(
 +            {
 +                "name": "pc",
 +                "kind": "prompt_collapse",
 +                "prompts": ["q1"],
 +                "context_lengths": [0, 256, 512, 1024],
 +                "assert_half_life_tokens": 100,
 +            }
 +        )
 +        ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.001))
 +        result = probe.run(spec, ctx)
 +        # Half-life should be well above 100 with slow decay.
 +        assert result.verdict == Verdict.PASS
 +        assert result.raw is not None
 +        assert result.raw > 100
++
 +    def test_collapsing_adapter_fails(self) -> None:
 +        probe, spec = build_probe(
 +            {
 +                "name": "pc",
 +                "kind": "prompt_collapse",
 +                "prompts": ["q1"],
 +                "context_lengths": [0, 256, 512, 1024],
 +                "assert_half_life_tokens": 500,
 +            }
 +        )
 +        ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.02))
 +        result = probe.run(spec, ctx)
 +        # Fast decay → short half-life → fail against 500-token threshold.
 +        assert result.verdict == Verdict.FAIL
++
 +    def test_error_on_empty_prompts(self) -> None:
 +        probe, spec = build_probe(
 +            {
 +                "name": "pc",
 +                "kind": "prompt_collapse",
 +                "prompts": [],
 +                "context_lengths": [0, 256],
 +            }
 +        )
 +        ctx = RunContext(backend=_programmed_backend(0.001))
 +        result = probe.run(spec, ctx)
 +        assert result.verdict == Verdict.ERROR