`e05932b`

sway(probes): B3 preference_flip on chosen/rejected margin inversion

Authored by

espadonne 3 weeks ago

SHA: e05932b5467c0565a6cf919c3c2ca472af4ca942
Parents: 887bb35
Tree: d34264d

2 changed files

Status	File	+	-
A	`src/dlm_sway/probes/preference_flip.py`	140	0
A	`tests/unit/test_probe_preference_flip.py`	161	0

src/dlm_sway/probes/preference_flip.py (added)

 +"""B3 PreferenceFlip — did DPO/ORPO actually flip the chosen/rejected ranking?
++
 +For each ``(prompt, chosen, rejected)`` triple, compute the margin
++
 +.. math::
 +    m = \\log p(\\text{chosen} \\mid \\text{prompt}) - \\log p(\\text{rejected} \\mid \\text{prompt})
++
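 +under both base and fine-tuned views. Interesting triples are the ones
 +where base got the sign *wrong* (``m_base < 0``); we fail if the
 +fine-tune doesn't flip a large enough fraction of them. When fewer than
 +``min_triples_for_decision`` triples are base-wrong, the probe reports
 +:attr:`Verdict.WARN` together with the mean margin delta instead.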
++
 +Triples come from either an inline ``triples:`` block in the spec or
 +from PREFERENCE sections in :attr:`RunContext.sections`. The probe
 +returns :attr:`Verdict.SKIP` when no triples are present — this is the
 +"no PREFERENCE sections in your document" case, graceful by design.
 +"""
++
 +from __future__ import annotations
++
 +import statistics
 +from typing import Literal
++
 +from pydantic import BaseModel, ConfigDict, Field
++
 +from dlm_sway.core.result import ProbeResult, Verdict
 +from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
++
++
 +class PreferenceTriple(BaseModel):
 +    """One preference example: a prompt plus a preferred and a dispreferred completion."""
++
 +    # Unknown keys are rejected and instances are immutable once built.
 +    model_config = ConfigDict(extra="forbid", frozen=True)
++
 +    # Prompt both completions are scored against.
 +    prompt: str
 +    # Completion whose log-probability is the positive term of the margin.
 +    chosen: str
 +    # Completion whose log-probability is subtracted in the margin.
 +    rejected: str
++
++
 +class PreferenceFlipSpec(ProbeSpec):
 +    """Configuration for the ``preference_flip`` probe."""
++
 +    kind: Literal["preference_flip"] = "preference_flip"
 +    triples: list[PreferenceTriple] = Field(default_factory=list)
 +    """Inline triples. If empty, the probe pulls from PREFERENCE
 +    sections in ctx.sections; if neither is available the probe SKIPs."""
 +    assert_flip_rate_gte: float = 0.7
 +    """Fraction of *base-wrong* triples that must flip under ft."""
 +    # Minimum number of base-wrong triples needed for a PASS/FAIL decision;
 +    # with fewer, the probe reports WARN and the mean margin delta instead.
 +    min_triples_for_decision: int = 3
++
++
 +class PreferenceFlipProbe(Probe):
 +    kind = "preference_flip"
 +    spec_cls = PreferenceFlipSpec
 +    category = "attribution"
++
 +    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
 +        assert isinstance(spec, PreferenceFlipSpec)
 +        triples = list(spec.triples) or _triples_from_sections(ctx)
 +        if not triples:
 +            return ProbeResult(
 +                name=spec.name,
 +                kind=spec.kind,
 +                verdict=Verdict.SKIP,
 +                score=None,
 +                message="no preference triples (inline or from sections)",
 +            )
++
 +        base_margins: list[float] = []
 +        ft_margins: list[float] = []
 +        for t in triples:
 +            with ctx.backend.as_base() as b:
 +                base_margins.append(
 +                    b.logprob_of(t.prompt, t.chosen) - b.logprob_of(t.prompt, t.rejected)
 +                )
 +            with ctx.backend.as_finetuned() as f:
 +                ft_margins.append(
 +                    f.logprob_of(t.prompt, t.chosen) - f.logprob_of(t.prompt, t.rejected)
 +                )
++
 +        # Interesting denominator: base got it wrong.
 +        base_wrong_idx = [i for i, m in enumerate(base_margins) if m < 0]
 +        flipped_idx = [i for i in base_wrong_idx if ft_margins[i] > 0]
++
 +        if len(base_wrong_idx) < spec.min_triples_for_decision:
 +            # Not enough base-wrong triples to decide. Fall back to mean margin delta.
 +            mean_delta = statistics.fmean(
 +                (ft - base) for base, ft in zip(base_margins, ft_margins, strict=True)
 +            )
 +            verdict = Verdict.WARN
 +            return ProbeResult(
 +                name=spec.name,
 +                kind=spec.kind,
 +                verdict=verdict,
 +                score=max(0.0, min(1.0, 0.5 + mean_delta / 4.0)),
 +                raw=mean_delta,
 +                base_value=statistics.fmean(base_margins),
 +                ft_value=statistics.fmean(ft_margins),
 +                evidence={
 +                    "base_wrong": len(base_wrong_idx),
 +                    "total": len(triples),
 +                    "mean_margin_delta": mean_delta,
 +                    "weight": spec.weight,
 +                },
 +                message=(
 +                    f"only {len(base_wrong_idx)} base-wrong triples < "
 +                    f"{spec.min_triples_for_decision} required; reporting mean-margin-delta={mean_delta:+.3f}"
 +                ),
 +            )
++
 +        flip_rate = len(flipped_idx) / len(base_wrong_idx)
 +        verdict = Verdict.PASS if flip_rate >= spec.assert_flip_rate_gte else Verdict.FAIL
 +        score = min(1.0, flip_rate / max(spec.assert_flip_rate_gte, 1e-6))
 +        return ProbeResult(
 +            name=spec.name,
 +            kind=spec.kind,
 +            verdict=verdict,
 +            score=score,
 +            raw=flip_rate,
 +            base_value=statistics.fmean(base_margins),
 +            ft_value=statistics.fmean(ft_margins),
 +            evidence={
 +                "flip_rate": flip_rate,
 +                "flipped": len(flipped_idx),
 +                "base_wrong": len(base_wrong_idx),
 +                "total": len(triples),
 +                "weight": spec.weight,
 +            },
 +            message=(
 +                f"flip_rate={flip_rate:.2%} ({len(flipped_idx)}/{len(base_wrong_idx)} "
 +                f"base-wrong triples flipped by ft)"
 +            ),
 +        )
++
++
 +def _triples_from_sections(ctx: RunContext) -> list[PreferenceTriple]:
 +    if ctx.sections is None:
 +        return []
 +    out: list[PreferenceTriple] = []
 +    for s in ctx.sections:
 +        if s.kind != "preference":
 +            continue
 +        for p in s.preferences:
 +            out.append(PreferenceTriple(prompt=p.prompt, chosen=p.chosen, rejected=p.rejected))
 +    return out

tests/unit/test_probe_preference_flip.py (added)

 +"""Tests for :mod:`dlm_sway.probes.preference_flip`."""
++
 +from __future__ import annotations
++
 +from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
 +from dlm_sway.core.result import Verdict
 +from dlm_sway.core.sections import Section, SectionPreference
 +from dlm_sway.probes.base import RunContext, build_probe
++
++
 +def _backend(pairs: list[tuple[str, str, str, float, float]]) -> DummyDifferentialBackend:
 +    """``pairs`` = list of (prompt, chosen, rejected, base_margin, ft_margin).
++
 +    We distribute the margin half to the chosen and half (negative) to
 +    the rejected, which is enough to make logprob_of(chosen)-logprob_of(rejected)
 +    equal the requested margin.
 +    """
 +    base_lp: dict[tuple[str, str], float] = {}
 +    ft_lp: dict[tuple[str, str], float] = {}
 +    for prompt, chosen, rejected, base_m, ft_m in pairs:
 +        base_lp[(prompt, chosen)] = base_m / 2
 +        base_lp[(prompt, rejected)] = -base_m / 2
 +        ft_lp[(prompt, chosen)] = ft_m / 2
 +        ft_lp[(prompt, rejected)] = -ft_m / 2
 +    return DummyDifferentialBackend(
 +        base=DummyResponses(logprobs=base_lp),
 +        ft=DummyResponses(logprobs=ft_lp),
 +    )
++
++
 +def test_pass_when_base_wrong_flipped() -> None:
 +    backend = _backend(
 +        [
 +            ("p1", "good1", "bad1", -2.0, 2.0),  # base wrong, ft flips
 +            ("p2", "good2", "bad2", -1.5, 1.0),  # base wrong, ft flips
 +            ("p3", "good3", "bad3", -0.5, 0.8),  # base wrong, ft flips
 +            ("p4", "good4", "bad4", 1.0, 2.0),  # base already right (no contribution)
 +        ]
 +    )
 +    triples = [
 +        {"prompt": p, "chosen": c, "rejected": r}
 +        for (p, c, r, _, _) in [
 +            ("p1", "good1", "bad1", 0, 0),
 +            ("p2", "good2", "bad2", 0, 0),
 +            ("p3", "good3", "bad3", 0, 0),
 +            ("p4", "good4", "bad4", 0, 0),
 +        ]
 +    ]
 +    probe, spec = build_probe(
 +        {
 +            "name": "pf",
 +            "kind": "preference_flip",
 +            "triples": triples,
 +            "assert_flip_rate_gte": 0.7,
 +            "min_triples_for_decision": 3,
 +        }
 +    )
 +    ctx = RunContext(backend=backend)
 +    result = probe.run(spec, ctx)
 +    assert result.verdict == Verdict.PASS
 +    assert result.raw == 1.0  # 3/3 flipped
++
++
 +def test_fail_when_base_wrong_not_flipped() -> None:
 +    backend = _backend(
 +        [
 +            ("p1", "good1", "bad1", -2.0, -1.5),  # base wrong, ft still wrong
 +            ("p2", "good2", "bad2", -1.5, -1.0),  # base wrong, ft still wrong
 +            ("p3", "good3", "bad3", -0.5, 0.8),  # base wrong, ft flips
 +        ]
 +    )
 +    triples = [
 +        {"prompt": p, "chosen": c, "rejected": r}
 +        for p, c, r in [
 +            ("p1", "good1", "bad1"),
 +            ("p2", "good2", "bad2"),
 +            ("p3", "good3", "bad3"),
 +        ]
 +    ]
 +    probe, spec = build_probe(
 +        {
 +            "name": "pf",
 +            "kind": "preference_flip",
 +            "triples": triples,
 +            "assert_flip_rate_gte": 0.7,
 +            "min_triples_for_decision": 3,
 +        }
 +    )
 +    ctx = RunContext(backend=backend)
 +    result = probe.run(spec, ctx)
 +    assert result.verdict == Verdict.FAIL
 +    assert result.raw is not None
 +    assert result.raw < 0.7
++
++
 +def test_skip_when_no_triples_anywhere() -> None:
 +    probe, spec = build_probe({"name": "pf", "kind": "preference_flip"})
 +    backend = _backend([])
 +    ctx = RunContext(backend=backend)
 +    result = probe.run(spec, ctx)
 +    assert result.verdict == Verdict.SKIP
++
++
 +def test_warn_when_too_few_base_wrong() -> None:
 +    backend = _backend(
 +        [
 +            ("p1", "good1", "bad1", 1.0, 2.0),  # base right
 +            ("p2", "good2", "bad2", 0.5, 1.0),  # base right
 +            ("p3", "good3", "bad3", -0.5, 0.5),  # base wrong
 +        ]
 +    )
 +    triples = [
 +        {"prompt": p, "chosen": c, "rejected": r}
 +        for p, c, r in [
 +            ("p1", "good1", "bad1"),
 +            ("p2", "good2", "bad2"),
 +            ("p3", "good3", "bad3"),
 +        ]
 +    ]
 +    probe, spec = build_probe(
 +        {
 +            "name": "pf",
 +            "kind": "preference_flip",
 +            "triples": triples,
 +            "min_triples_for_decision": 3,
 +        }
 +    )
 +    ctx = RunContext(backend=backend)
 +    result = probe.run(spec, ctx)
 +    assert result.verdict == Verdict.WARN
++
++
 +def test_triples_pulled_from_sections() -> None:
 +    pref_section = Section(
 +        id="p1",
 +        kind="preference",
 +        content="...",
 +        preferences=(
 +            SectionPreference(prompt="q1", chosen="good", rejected="bad"),
 +            SectionPreference(prompt="q2", chosen="good2", rejected="bad2"),
 +            SectionPreference(prompt="q3", chosen="good3", rejected="bad3"),
 +        ),
 +    )
 +    backend = _backend(
 +        [
 +            ("q1", "good", "bad", -1.0, 1.0),
 +            ("q2", "good2", "bad2", -1.0, 1.0),
 +            ("q3", "good3", "bad3", -1.0, 1.0),
 +        ]
 +    )
 +    probe, spec = build_probe(
 +        {
 +            "name": "pf",
 +            "kind": "preference_flip",
 +            "assert_flip_rate_gte": 0.7,
 +            "min_triples_for_decision": 3,
 +        }
 +    )
 +    ctx = RunContext(backend=backend, sections=(pref_section,))
 +    result = probe.run(spec, ctx)
 +    assert result.verdict == Verdict.PASS