@@ -0,0 +1,418 @@ |
| 1 | +"""Tests for :mod:`dlm_sway.probes.multi_turn_coherence`.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import math |
| 6 | + |
| 7 | +import numpy as np |
| 8 | + |
| 9 | +from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses |
| 10 | +from dlm_sway.core.result import Verdict |
| 11 | +from dlm_sway.core.scoring import TokenDist |
| 12 | +from dlm_sway.probes.base import RunContext, build_probe |
| 13 | +from dlm_sway.probes.multi_turn_coherence import ( |
| 14 | + _ascii_sparkline, |
| 15 | + _chat_template_of, |
| 16 | + _fallback_format, |
| 17 | + _fit_half_life_turns, |
| 18 | + _verdict_from_half_life, |
| 19 | +) |
| 20 | + |
| 21 | +# --------------------------------------------------------------------------- |
| 22 | +# Fake tokenizer + dist-injection helpers |
| 23 | +# --------------------------------------------------------------------------- |
| 24 | + |
| 25 | + |
| 26 | +class _FakeTokenizer: |
| 27 | + """Minimal stand-in for an HF tokenizer with a chat_template. |
| 28 | + |
| 29 | + Implements just enough of the surface :mod:`multi_turn_coherence` |
| 30 | + consults: the ``chat_template`` attribute and an |
| 31 | + ``apply_chat_template`` method that returns a concatenated |
| 32 | + role-marker string. Real HF chat templates are Jinja-rendered; |
| 33 | + ours is a deterministic string concat so per-turn comparisons |
| 34 | + are reproducible. |
| 35 | + """ |
| 36 | + |
| 37 | + chat_template = "fake-jinja-template-string" |
| 38 | + |
| 39 | + def apply_chat_template( |
| 40 | + self, |
| 41 | + messages: list[dict[str, str]], |
| 42 | + *, |
| 43 | + tokenize: bool = False, |
| 44 | + add_generation_prompt: bool = False, |
| 45 | + ) -> str: |
| 46 | + del tokenize # we always return text |
| 47 | + parts = [f"{m['role']}::{m['content']}" for m in messages] |
| 48 | + if add_generation_prompt: |
| 49 | + parts.append("assistant::") |
| 50 | + return "|".join(parts) |
| 51 | + |
| 52 | + |
| 53 | +def _attach_tokenizer( |
| 54 | + backend: DummyDifferentialBackend, tokenizer: object | None |
| 55 | +) -> DummyDifferentialBackend: |
| 56 | + """Slot a tokenizer onto the dummy backend the same way HF does. |
| 57 | + |
| 58 | + multi_turn_coherence reads ``ctx.backend._tokenizer`` to find the |
| 59 | + chat template — mirrors prompt_collapse's _peek_backend_tokenizer. |
| 60 | + """ |
| 61 | + backend._tokenizer = tokenizer # type: ignore[attr-defined] |
| 62 | + return backend |
| 63 | + |
| 64 | + |
def _decay_dist(value: float, *, broad: bool, k: int = 8) -> TokenDist:
    """Build a TokenDist whose top-k logprobs encode a known signal.

    ``value`` controls how peaked the distribution is — feeding two
    such dists into the divergence helper produces a deterministic
    KL we can plant per-turn for the curve-fit tests.
    """
    if broad:
        # Near-uniform, plus a tiny tilt so the
        # _UNIFORM_LOGPROB_TOL guard doesn't trip.
        logprobs = np.full(k, -math.log(k), dtype=np.float32)
        logprobs = logprobs + np.linspace(-1e-4, 1e-4, k, dtype=np.float32)
    else:
        # Peaked: bulk of the mass sits on token 0, and a bigger
        # ``value`` means a sharper spike.
        head = [-0.01 * value]
        tail = [-1.0 - value] * (k - 1)
        logprobs = np.array(head + tail, dtype=np.float32)
    return TokenDist(
        token_ids=np.arange(k, dtype=np.int64),
        logprobs=logprobs,
        vocab_size=1000,
        tail_logprob=None,
    )
| 87 | + |
| 88 | + |
| 89 | +# --------------------------------------------------------------------------- |
| 90 | +# Skip / error paths |
| 91 | +# --------------------------------------------------------------------------- |
| 92 | + |
| 93 | + |
class TestSkipPaths:
    def test_skips_when_no_tokenizer(self) -> None:
        """Dummy backend has no tokenizer ⇒ probe SKIPs cleanly."""
        dummy = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        probe, spec = build_probe(
            dict(name="mtc", kind="multi_turn_coherence_decay", prompts=["hello"])
        )
        outcome = probe.run(spec, RunContext(backend=dummy))
        assert outcome.verdict == Verdict.SKIP
        assert "chat_template" in outcome.message

    def test_skips_when_tokenizer_lacks_chat_template(self) -> None:
        class _BareTokenizer:
            chat_template = None

        dummy = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        _attach_tokenizer(dummy, _BareTokenizer())
        probe, spec = build_probe(
            dict(name="mtc", kind="multi_turn_coherence_decay", prompts=["hello"])
        )
        outcome = probe.run(spec, RunContext(backend=dummy))
        assert outcome.verdict == Verdict.SKIP

    def test_errors_when_no_prompts(self) -> None:
        dummy = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        _attach_tokenizer(dummy, _FakeTokenizer())
        probe, spec = build_probe(dict(name="mtc", kind="multi_turn_coherence_decay"))
        outcome = probe.run(spec, RunContext(backend=dummy))
        assert outcome.verdict == Verdict.ERROR
| 132 | + |
| 133 | + |
| 134 | +# --------------------------------------------------------------------------- |
| 135 | +# Happy path: planted decay → curve fit recovers half-life |
| 136 | +# --------------------------------------------------------------------------- |
| 137 | + |
| 138 | + |
class TestHappyPath:
    """End-to-end probe runs against the dummy backend.

    These tests check *wiring*: the probe drives the chat loop,
    requests next-token-dists at the right positions, and emits a
    finalized result with the documented evidence shape. Because the
    exact KL values depend on the dummy backend's synthetic
    distributions, assertions pin the curve's shape (monotonic,
    non-monotonic, flat) rather than specific KL numbers. Curve-fit
    correctness lives separately in :class:`TestFitHalfLife`.
    """

    def test_decreasing_curve_yields_finite_half_life(self) -> None:
        """Decreasing planted KLs → 'ok' fit status, finite half-life."""
        dummy = _backend_with_decreasing_dists(
            prompt="hello", max_turns=4, sharpness_per_turn=[3.0, 1.0, 0.3]
        )
        probe, spec = build_probe(
            dict(
                name="mtc",
                kind="multi_turn_coherence_decay",
                prompts=["hello"],
                max_turns=4,
                assert_half_life_turns=0.5,
            )
        )
        outcome = probe.run(spec, RunContext(backend=dummy))
        assert outcome.verdict in {Verdict.PASS, Verdict.FAIL}, outcome.message
        assert outcome.evidence["fit_status"] in {"ok", "stable"}
        kl_series = outcome.evidence["per_turn_kls"]
        assert len(kl_series) == 3
        # Strictly decreasing — confirms the probe wired turns in order.
        assert kl_series[0] > kl_series[1] > kl_series[2]
        assert outcome.raw is not None
        assert outcome.raw > 0.0

    def test_flat_curve_marked_stable(self) -> None:
        """Same dists at every turn → fit_status=stable → PASS."""
        dummy = _backend_with_decreasing_dists(
            prompt="hello", max_turns=4, sharpness_per_turn=[1.0, 1.0, 1.0]
        )
        probe, spec = build_probe(
            dict(
                name="mtc",
                kind="multi_turn_coherence_decay",
                prompts=["hello"],
                max_turns=4,
            )
        )
        outcome = probe.run(spec, RunContext(backend=dummy))
        assert outcome.verdict == Verdict.PASS
        assert outcome.evidence["fit_status"] == "stable"
        assert outcome.score == 1.0
        assert "held coherence" in outcome.message

    def test_growing_curve_warns(self) -> None:
        """Increasing planted KLs → fit_status=non_monotonic → WARN."""
        dummy = _backend_with_decreasing_dists(
            prompt="hello", max_turns=4, sharpness_per_turn=[0.3, 1.0, 3.0]
        )
        probe, spec = build_probe(
            dict(
                name="mtc",
                kind="multi_turn_coherence_decay",
                prompts=["hello"],
                max_turns=4,
            )
        )
        outcome = probe.run(spec, RunContext(backend=dummy))
        assert outcome.verdict == Verdict.WARN
        assert outcome.evidence["fit_status"] == "non_monotonic"

    def test_evidence_carries_turns_and_sparkline(self) -> None:
        dummy = _backend_with_decreasing_dists(
            prompt="hello", max_turns=4, sharpness_per_turn=[2.0, 1.0, 0.5]
        )
        probe, spec = build_probe(
            dict(
                name="mtc",
                kind="multi_turn_coherence_decay",
                prompts=["hello"],
                max_turns=4,
            )
        )
        outcome = probe.run(spec, RunContext(backend=dummy))
        assert outcome.evidence["turns_axis"] == [2.0, 3.0, 4.0]
        assert outcome.evidence["max_turns"] == 4
        assert outcome.evidence["num_prompts"] == 1
        assert isinstance(outcome.evidence["sparkline"], str)
        assert len(outcome.evidence["sparkline"]) == 3
| 229 | + |
| 230 | + |
| 231 | +# --------------------------------------------------------------------------- |
| 232 | +# Curve fit unit tests (math only, no backend) |
| 233 | +# --------------------------------------------------------------------------- |
| 234 | + |
| 235 | + |
class TestFitHalfLife:
    def test_clean_exponential_recovers_half_life(self) -> None:
        # y = exp(-ln2 * x) decays by half each turn ⇒ half-life = 1.
        turns_axis = np.array([2.0, 3.0, 4.0])
        planted = np.exp(-math.log(2.0) * turns_axis)
        half_life, status = _fit_half_life_turns(turns_axis, planted, max_turns=4)
        assert status == "ok"
        assert half_life is not None
        assert math.isclose(half_life, 1.0, rel_tol=1e-3)

    def test_stable_returns_saturation(self) -> None:
        turns_axis = np.array([2.0, 3.0, 4.0])
        half_life, status = _fit_half_life_turns(
            turns_axis, np.array([0.5, 0.5, 0.5]), max_turns=4
        )
        assert status == "stable"
        assert half_life == 40.0  # saturation value: max_turns * 10

    def test_growing_returns_non_monotonic(self) -> None:
        turns_axis = np.array([2.0, 3.0, 4.0])
        half_life, status = _fit_half_life_turns(
            turns_axis, np.array([0.1, 0.3, 0.9]), max_turns=4
        )
        assert status == "non_monotonic"
        assert half_life is None

    def test_all_zero_returns_degenerate(self) -> None:
        turns_axis = np.array([2.0, 3.0, 4.0])
        half_life, status = _fit_half_life_turns(
            turns_axis, np.array([0.0, 0.0, 0.0]), max_turns=4
        )
        assert status == "degenerate"
        assert half_life == 0.0

    def test_partial_zero_drops_zero_points(self) -> None:
        """One zero-KL turn: drop it, fit on the remaining positives."""
        turns_axis = np.array([2.0, 3.0, 4.0])
        half_life, status = _fit_half_life_turns(
            turns_axis, np.array([0.4, 0.0, 0.1]), max_turns=4
        )
        assert status == "ok"
        assert half_life is not None
        assert half_life > 0.0
| 275 | + |
| 276 | + |
| 277 | +# --------------------------------------------------------------------------- |
| 278 | +# Verdict mapping (math-free) |
| 279 | +# --------------------------------------------------------------------------- |
| 280 | + |
| 281 | + |
class TestVerdictMapping:
    def test_pass_on_half_life_above_target(self) -> None:
        verdict, score, message = _verdict_from_half_life(
            half_life=3.0,
            fit_status="ok",
            target=2.0,
            mean_kls=[0.4, 0.2, 0.1],
            turns_axis=[2.0, 3.0, 4.0],
        )
        assert verdict == Verdict.PASS
        assert score == 1.0
        assert "half-life=3.00" in message

    def test_fail_on_half_life_below_target(self) -> None:
        verdict, score, _message = _verdict_from_half_life(
            half_life=0.5,
            fit_status="ok",
            target=2.0,
            mean_kls=[0.4, 0.1, 0.02],
            turns_axis=[2.0, 3.0, 4.0],
        )
        assert verdict == Verdict.FAIL
        assert 0.0 < score < 1.0
| 305 | + |
| 306 | + |
| 307 | +# --------------------------------------------------------------------------- |
| 308 | +# Sparkline + fallback formatter |
| 309 | +# --------------------------------------------------------------------------- |
| 310 | + |
| 311 | + |
class TestSparkline:
    def test_renders_one_char_per_value(self) -> None:
        bars = _ascii_sparkline([0.4, 0.2, 0.1])
        assert len(bars) == 3
        # Decreasing input ⇒ the first bar is at least as tall as the last.
        assert bars[0] >= bars[-1]

    def test_flat_input_renders_uniform_mid(self) -> None:
        bars = _ascii_sparkline([0.5, 0.5, 0.5])
        assert len(set(bars)) == 1  # a single repeated glyph

    def test_empty_input_returns_empty(self) -> None:
        assert _ascii_sparkline([]) == ""

    def test_non_finite_drops_and_marks(self) -> None:
        bars = _ascii_sparkline([0.4, math.inf, 0.1])
        assert bars[1] == "?"
| 329 | + |
| 330 | + |
class TestFallbackFormatter:
    def test_concatenates_with_role_markers(self) -> None:
        history = [
            {"role": "user", "content": "hi"},
            {"role": "assistant", "content": "yo"},
        ]
        rendered = _fallback_format(history, add_generation_prompt=True)
        assert "USER: hi" in rendered
        assert "ASSISTANT: yo" in rendered
        assert rendered.endswith("ASSISTANT:")
| 340 | + |
| 341 | + |
class TestChatTemplateDetection:
    def test_returns_none_when_tokenizer_is_none(self) -> None:
        assert _chat_template_of(None) is None

    def test_returns_none_when_no_attribute(self) -> None:
        class _NoTemplate:
            pass

        assert _chat_template_of(_NoTemplate()) is None

    def test_returns_template_when_set(self) -> None:
        assert _chat_template_of(_FakeTokenizer()) == "fake-jinja-template-string"
| 354 | + |
| 355 | + |
| 356 | +# --------------------------------------------------------------------------- |
| 357 | +# Test infrastructure |
| 358 | +# --------------------------------------------------------------------------- |
| 359 | + |
| 360 | + |
def _backend_with_decreasing_dists(
    *,
    prompt: str,
    max_turns: int,
    sharpness_per_turn: list[float],
) -> DummyDifferentialBackend:
    """Build a dummy backend whose per-turn ft TokenDists vary in sharpness.

    The base view always serves the same broad (≈uniform) dist. The ft
    view serves a dist whose sharpness we control per turn: larger
    sharpness ⇒ more peaked ⇒ larger KL from the broad base. The
    sharpness sequence alone dictates the curve shape (decreasing,
    increasing, flat); planting exact KL values was tried first and
    proved fragile.

    The chat strings the probe will look up are planted by replaying
    the probe's per-prompt loop with the same fake tokenizer and the
    same follow-up cycle.
    """
    if len(sharpness_per_turn) != max_turns - 1:
        raise ValueError(
            f"need {max_turns - 1} sharpness values, got {len(sharpness_per_turn)}"
        )

    fake_tok = _FakeTokenizer()
    follow_ups_default = [
        "Continue.",
        "Tell me more.",
        "Can you elaborate?",
        "What else?",
        "Go deeper.",
        "Expand on that.",
        "And then?",
    ]

    base_dists: dict[str, TokenDist] = {}
    ft_dists: dict[str, TokenDist] = {}
    ft_gens: dict[str, str] = {}

    # Turn 1: the user prompt gets a planted ft generation.
    history: list[dict[str, str]] = [{"role": "user", "content": prompt}]
    first_input = fake_tok.apply_chat_template(
        history, tokenize=False, add_generation_prompt=True
    )
    first_reply = f"ft-response-1 for {prompt}"
    ft_gens[first_input] = first_reply
    history.append({"role": "assistant", "content": first_reply})

    # Turns 2..max_turns: cycle through follow-ups, plant one dist pair
    # per turn, and extend the transcript except after the final turn.
    last_idx = len(sharpness_per_turn) - 1
    for idx, sharpness in enumerate(sharpness_per_turn):
        next_question = follow_ups_default[idx % len(follow_ups_default)]
        history.append({"role": "user", "content": next_question})
        rendered = fake_tok.apply_chat_template(
            history, tokenize=False, add_generation_prompt=True
        )
        base_dists[rendered] = _decay_dist(0.0, broad=True)
        ft_dists[rendered] = _decay_dist(value=sharpness, broad=False)
        if idx < last_idx:
            reply = f"ft-response-{idx + 2} for {prompt}"
            ft_gens[rendered] = reply
            history.append({"role": "assistant", "content": reply})

    backend = DummyDifferentialBackend(
        base=DummyResponses(token_dists=base_dists),
        ft=DummyResponses(token_dists=ft_dists, generations=ft_gens),
    )
    _attach_tokenizer(backend, fake_tok)
    return backend