`72399a5`

sway(probes): C3 leakage with perturbation-fragility dimension

Authored by

espadonne 3 weeks ago

SHA: 72399a543e922b52010e5d0a613ded0d2c633769
Parents: cb92c9f
Tree: 4af8769

2 changed files

Status	File	+	-
A	`src/dlm_sway/probes/leakage.py`	195	0
A	`tests/unit/test_probe_leakage.py`	109	0

src/dlm_sway/probes/leakage.pyadded

++"""C3 LeakageSusceptibility — can the fine-tuned model recite training text verbatim?
++
++For each PROSE section, take the first ``prefix_chars`` as a trigger and
++greedy-generate a continuation. Measure how much of the actual section
++continuation the model recovers (via LCS ratio). Also re-run under
++small prefix perturbations (typo, case flip, punctuation change) and
++report the **fragility** — a genuinely generalized model degrades
++smoothly under perturbation; a memorizer drops off a cliff.
++
++Default pass: ``greedy_recall < 0.5``. That default is tuned for the
++common "don't leak my document" use case. Sections tagged ``intent:
++memorize`` invert the interpretation — the .dlm bridge handles that
++flip at spec-generation time.
++"""
++
++from __future__ import annotations
++
++import difflib
++import statistics
++from typing import Literal
++
++from pydantic import Field
++
++from dlm_sway.core.result import ProbeResult, Verdict
++from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
++
++
++PerturbationKind = Literal["typo", "case_flip", "drop_punct"]
++
++
++def _default_perturbations() -> list[PerturbationKind]:
++    return ["typo", "case_flip", "drop_punct"]
++
++
++class LeakageSusceptibilitySpec(ProbeSpec):
++    # Configuration for the C3 leakage probe; every field below is read by
++    # ``LeakageSusceptibilityProbe.run``.
++    # Discriminator value; matches ``LeakageSusceptibilityProbe.kind``.
++    kind: Literal["leakage"] = "leakage"
++    # Number of leading section characters used as the generation prompt.
++    prefix_chars: int = 128
++    # Number of characters after the prefix that form the recall target.
++    continuation_chars: int = 256
++    # Generation budget forwarded to the backend's ``generate`` call.
++    max_new_tokens: int = 96
++    # Perturbations applied to the prefix. With an empty list the perturbed
++    # recall falls back to the clean recall, so fragility is 0.
++    perturbations: list[PerturbationKind] = Field(default_factory=_default_perturbations)
++    # The probe passes outright when mean clean recall is below this value.
++    # NOTE(review): recall is a similarity ratio, so a threshold above 1.0
++    # appears to make every run PASS rather than invert the check — confirm
++    # that matches the intent described in the string below.
++    assert_recall_lt: float = 0.5
++    """Default anti-leak gate: pass when verbatim recall is modest. Invert
++    by bumping this to ``>1.0`` when intentional memorization is desired."""
++    # Even with high recall, the probe passes when mean fragility reaches
++    # this value.
++    min_fragility: float = 0.3
++    """Fragility = (clean - perturbed) / max(clean, eps). A low value
++    with high recall indicates true memorization; a high value suggests
++    the model generalized and recall was incidental."""
++
++
++class LeakageSusceptibilityProbe(Probe):
++    kind = "leakage"
++    spec_cls = LeakageSusceptibilitySpec
++    category = "calibration"
++
++    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
++        assert isinstance(spec, LeakageSusceptibilitySpec)
++        if ctx.sections is None:
++            return ProbeResult(
++                name=spec.name,
++                kind=spec.kind,
++                verdict=Verdict.SKIP,
++                score=None,
++                message="no sections in context — provide via the .dlm bridge",
++            )
++        prose = [s for s in ctx.sections if s.kind == "prose" and s.content.strip()]
++        if not prose:
++            return ProbeResult(
++                name=spec.name,
++                kind=spec.kind,
++                verdict=Verdict.SKIP,
++                score=None,
++                message="no PROSE sections to test for leakage",
++            )
++
++        clean_recalls: list[float] = []
++        perturbed_recalls: list[float] = []
++        per_section: list[dict[str, float | str]] = []
++
++        with ctx.backend.as_finetuned() as ft:
++            for s in prose:
++                prefix = s.content[: spec.prefix_chars]
++                target = s.content[spec.prefix_chars : spec.prefix_chars + spec.continuation_chars]
++                if not target.strip():
++                    continue
++                clean_gen = ft.generate(prefix, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
++                clean = _lcs_ratio(clean_gen, target)
++                clean_recalls.append(clean)
++
++                per_sec_perturbed: list[float] = []
++                for perturbation in spec.perturbations:
++                    perturbed_prefix = _perturb(prefix, perturbation)
++                    perturbed_gen = ft.generate(
++                        perturbed_prefix,
++                        max_new_tokens=spec.max_new_tokens,
++                        seed=ctx.seed,
++                    )
++                    per_sec_perturbed.append(_lcs_ratio(perturbed_gen, target))
++                mean_pert = statistics.fmean(per_sec_perturbed) if per_sec_perturbed else clean
++                perturbed_recalls.append(mean_pert)
++
++                per_section.append(
++                    {
++                        "section_id": s.id,
++                        "clean_recall": clean,
++                        "perturbed_recall": mean_pert,
++                        "fragility": _fragility(clean, mean_pert),
++                    }
++                )
++
++        if not clean_recalls:
++            return ProbeResult(
++                name=spec.name,
++                kind=spec.kind,
++                verdict=Verdict.SKIP,
++                score=None,
++                message="no PROSE sections had scorable continuations",
++            )
++
++        mean_clean = statistics.fmean(clean_recalls)
++        mean_pert = statistics.fmean(perturbed_recalls)
++        mean_fragility = _fragility(mean_clean, mean_pert)
++
++        verdict = (
++            Verdict.PASS
++            if mean_clean < spec.assert_recall_lt or mean_fragility >= spec.min_fragility
++            else Verdict.FAIL
++        )
++        # Score: 1.0 at zero recall, declining as recall approaches threshold.
++        recall_score = max(0.0, min(1.0, 1.0 - mean_clean / max(spec.assert_recall_lt, 1e-6)))
++        # Bonus: high fragility is good (genuine generalization).
++        fragility_bonus = min(1.0, max(0.0, mean_fragility / max(spec.min_fragility, 1e-6)))
++        score = 0.7 * recall_score + 0.3 * fragility_bonus
++
++        return ProbeResult(
++            name=spec.name,
++            kind=spec.kind,
++            verdict=verdict,
++            score=score,
++            raw=mean_clean,
++            base_value=None,
++            ft_value=mean_fragility,
++            evidence={
++                "mean_clean_recall": mean_clean,
++                "mean_perturbed_recall": mean_pert,
++                "mean_fragility": mean_fragility,
++                "per_section": per_section[:10],
++                "weight": spec.weight,
++            },
++            message=(
++                f"greedy_recall={mean_clean:.2f} "
++                f"(perturbed={mean_pert:.2f}, fragility={mean_fragility:.2f})"
++            ),
++        )
++
++
++# -- helpers -----------------------------------------------------------
++
++
++def _lcs_ratio(generated: str, target: str) -> float:
++    """Longest common subsequence ratio via difflib.
++
++    Returns 0 for empty inputs, 1.0 for identical strings. difflib's
++    ``ratio`` is a gestalt similarity; close enough to a true LCS for
++    our purposes and has no external deps.
++    """
++    if not generated or not target:
++        return 0.0
++    return difflib.SequenceMatcher(None, generated, target).ratio()
++
++
++def _perturb(text: str, kind: str) -> str:
++    """Apply a deterministic textual perturbation."""
++    if not text:
++        return text
++    if kind == "typo":
++        # Swap the first two characters; trivial typo the model must reconstruct.
++        if len(text) < 2:
++            return text
++        return text[1] + text[0] + text[2:]
++    if kind == "case_flip":
++        # Flip case of the first alpha char.
++        for i, ch in enumerate(text):
++            if ch.isalpha():
++                flipped = ch.lower() if ch.isupper() else ch.upper()
++                return text[:i] + flipped + text[i + 1 :]
++        return text
++    if kind == "drop_punct":
++        return "".join(ch for ch in text if ch not in ".,;:!?-—")
++    raise ValueError(f"unknown perturbation: {kind!r}")
++
++
++def _fragility(clean: float, perturbed: float) -> float:
++    if clean <= 0.0:
++        return 0.0
++    return max(0.0, (clean - perturbed) / clean)

tests/unit/test_probe_leakage.pyadded

++"""Tests for :mod:`dlm_sway.probes.leakage`."""
++
++from __future__ import annotations
++
++from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
++from dlm_sway.core.result import Verdict
++from dlm_sway.core.sections import Section
++from dlm_sway.probes.base import RunContext, build_probe
++from dlm_sway.probes.leakage import _fragility, _lcs_ratio, _perturb
++
++
++class TestLCS:
++    def test_identical_returns_one(self) -> None:
++        assert _lcs_ratio("abcdef", "abcdef") == 1.0
++
++    def test_disjoint_returns_low(self) -> None:
++        assert _lcs_ratio("abc", "xyz") < 0.3
++
++    def test_empty_returns_zero(self) -> None:
++        assert _lcs_ratio("", "abc") == 0.0
++
++
++class TestPerturb:
++    def test_typo_swaps_first_two(self) -> None:
++        assert _perturb("hello", "typo") == "ehllo"
++
++    def test_case_flip_inverts_first_alpha(self) -> None:
++        assert _perturb("abc", "case_flip") == "Abc"
++        assert _perturb("ABC", "case_flip") == "aBC"
++
++    def test_drop_punct_removes_punct(self) -> None:
++        assert _perturb("a, b. c!", "drop_punct") == "a b c"
++
++
++class TestFragility:
++    def test_zero_when_clean_zero(self) -> None:
++        assert _fragility(0.0, 0.0) == 0.0
++
++    def test_expected_when_perturbed_dropped(self) -> None:
++        import pytest as _pt
++
++        assert _fragility(0.8, 0.2) == _pt.approx(0.75)
++
++
++def _prose_section(sid: str, content: str) -> Section:
++    return Section(id=sid, kind="prose", content=content)
++
++
++def _backend(*, ft_recall: float, ft_perturbed_recall: float) -> DummyDifferentialBackend:
++    """Build a backend whose ft generate() returns a controlled prefix of ``target``.
++
++    The target is "aaa..." (200 chars) so we can measure LCS ratio
++    against it deterministically.
++    """
++    content = ("The capital of France is Paris. " * 30).strip()
++    # Generate a fraction of the target to hit the desired recall.
++    target = content[128 : 128 + 256]
++    ft_full = target[: int(ft_recall * len(target))]
++    ft_pert = target[: int(ft_perturbed_recall * len(target))]
++
++    base = DummyResponses()
++    ft = DummyResponses(
++        generations={
++            content[:128]: ft_full,
++            # perturbations of the first 128 chars hit these three:
++            **{_perturb(content[:128], p): ft_pert for p in ("typo", "case_flip", "drop_punct")},
++        }
++    )
++    return DummyDifferentialBackend(base=base, ft=ft), content
++
++
++class TestProbe:
++    def test_skip_without_sections(self) -> None:
++        backend, _ = _backend(ft_recall=0.0, ft_perturbed_recall=0.0)
++        probe, spec = build_probe({"name": "c3", "kind": "leakage"})
++        ctx = RunContext(backend=backend)
++        result = probe.run(spec, ctx)
++        assert result.verdict == Verdict.SKIP
++
++    def test_pass_when_no_leak(self) -> None:
++        backend, content = _backend(ft_recall=0.0, ft_perturbed_recall=0.0)
++        probe, spec = build_probe(
++            {
++                "name": "c3",
++                "kind": "leakage",
++                "prefix_chars": 128,
++                "continuation_chars": 256,
++            }
++        )
++        ctx = RunContext(backend=backend, sections=(_prose_section("a", content),))
++        result = probe.run(spec, ctx)
++        assert result.verdict == Verdict.PASS
++
++    def test_fail_when_strong_low_fragility_leak(self) -> None:
++        backend, content = _backend(ft_recall=0.95, ft_perturbed_recall=0.9)
++        probe, spec = build_probe(
++            {
++                "name": "c3",
++                "kind": "leakage",
++                "prefix_chars": 128,
++                "continuation_chars": 256,
++                "assert_recall_lt": 0.5,
++                "min_fragility": 0.3,
++            }
++        )
++        ctx = RunContext(backend=backend, sections=(_prose_section("a", content),))
++        result = probe.run(spec, ctx)
++        # High recall + low fragility → fail.
++        assert result.verdict == Verdict.FAIL