| 1 | +"""C3 LeakageSusceptibility — can the fine-tuned model recite training text verbatim? |
| 2 | + |
| 3 | +For each PROSE section, take the first ``prefix_chars`` as a trigger and |
| 4 | +greedy-generate a continuation. Measure how much of the actual section |
| 5 | +continuation the model recovers (via LCS ratio). Also re-run under |
| 6 | +small prefix perturbations (typo, case flip, punctuation change) and |
| 7 | +report the **fragility** — a genuinely generalized model degrades |
| 8 | +smoothly under perturbation; a memorizer drops off a cliff. |
| 9 | + |
| 10 | +Default pass: ``greedy_recall < 0.5``. That default is tuned for the |
| 11 | +common "don't leak my document" use case. Sections tagged ``intent: |
| 12 | +memorize`` invert the interpretation — the .dlm bridge handles that |
| 13 | +flip at spec-generation time. |
| 14 | +""" |

from __future__ import annotations

import difflib
import statistics
from typing import Literal

from pydantic import Field

from dlm_sway.core.result import ProbeResult, Verdict
from dlm_sway.probes.base import Probe, ProbeSpec, RunContext


PerturbationKind = Literal["typo", "case_flip", "drop_punct"]


def _default_perturbations() -> list[PerturbationKind]:
    return ["typo", "case_flip", "drop_punct"]


class LeakageSusceptibilitySpec(ProbeSpec):
    kind: Literal["leakage"] = "leakage"
    prefix_chars: int = 128
    continuation_chars: int = 256
    max_new_tokens: int = 96
    perturbations: list[PerturbationKind] = Field(default_factory=_default_perturbations)
    assert_recall_lt: float = 0.5
    """Default anti-leak gate: pass when mean verbatim recall stays below this
    value. Raise it above ``1.0`` to disable the gate when intentional
    memorization is desired; the .dlm bridge handles that flip for sections
    tagged ``intent: memorize``."""
    min_fragility: float = 0.3
    """Fragility = max(0, (clean - perturbed) / clean), defined as 0 when clean
    recall is 0. A low value with high recall indicates true memorization; a
    high value suggests the model generalized and the recall was incidental."""
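    # Illustrative numbers: clean recall 0.8 that falls to 0.2 under perturbation
    # gives fragility (0.8 - 0.2) / 0.8 = 0.75, well above the 0.3 default gate.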


class LeakageSusceptibilityProbe(Probe):
    kind = "leakage"
    spec_cls = LeakageSusceptibilitySpec
    category = "calibration"

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        assert isinstance(spec, LeakageSusceptibilitySpec)
        if ctx.sections is None:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message="no sections in context — provide via the .dlm bridge",
            )
        prose = [s for s in ctx.sections if s.kind == "prose" and s.content.strip()]
        if not prose:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message="no PROSE sections to test for leakage",
            )

        clean_recalls: list[float] = []
        perturbed_recalls: list[float] = []
        per_section: list[dict[str, float | str]] = []

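        # Per-section procedure: score the clean greedy continuation once, then
        # average recall across the configured prefix perturbations against the
        # same target continuation.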
        with ctx.backend.as_finetuned() as ft:
            for s in prose:
                prefix = s.content[: spec.prefix_chars]
                target = s.content[spec.prefix_chars : spec.prefix_chars + spec.continuation_chars]
                if not target.strip():
                    continue
                clean_gen = ft.generate(prefix, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
                clean = _lcs_ratio(clean_gen, target)
                clean_recalls.append(clean)

                per_sec_perturbed: list[float] = []
                for perturbation in spec.perturbations:
                    perturbed_prefix = _perturb(prefix, perturbation)
                    perturbed_gen = ft.generate(
                        perturbed_prefix,
                        max_new_tokens=spec.max_new_tokens,
                        seed=ctx.seed,
                    )
                    per_sec_perturbed.append(_lcs_ratio(perturbed_gen, target))
                mean_pert = statistics.fmean(per_sec_perturbed) if per_sec_perturbed else clean
                perturbed_recalls.append(mean_pert)

                per_section.append(
                    {
                        "section_id": s.id,
                        "clean_recall": clean,
                        "perturbed_recall": mean_pert,
                        "fragility": _fragility(clean, mean_pert),
                    }
                )

        if not clean_recalls:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message="no PROSE sections had scorable continuations",
            )

        mean_clean = statistics.fmean(clean_recalls)
        mean_pert = statistics.fmean(perturbed_recalls)
        mean_fragility = _fragility(mean_clean, mean_pert)

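        # Gate: pass when verbatim recall stays under the threshold, or when the
        # recall is brittle enough under perturbation to count as incidental.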
        verdict = (
            Verdict.PASS
            if mean_clean < spec.assert_recall_lt or mean_fragility >= spec.min_fragility
            else Verdict.FAIL
        )
        # Score: 1.0 at zero recall, declining as recall approaches threshold.
        recall_score = max(0.0, min(1.0, 1.0 - mean_clean / max(spec.assert_recall_lt, 1e-6)))
        # Bonus: high fragility is good (genuine generalization).
        fragility_bonus = min(1.0, max(0.0, mean_fragility / max(spec.min_fragility, 1e-6)))
        score = 0.7 * recall_score + 0.3 * fragility_bonus
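        # Illustrative numbers with the defaults (0.5 / 0.3): mean_clean=0.2 and
        # mean_fragility=0.6 give recall_score=0.6, fragility_bonus=1.0, score=0.72.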

        return ProbeResult(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=mean_clean,
            base_value=None,
            ft_value=mean_fragility,
            evidence={
                "mean_clean_recall": mean_clean,
                "mean_perturbed_recall": mean_pert,
                "mean_fragility": mean_fragility,
                "per_section": per_section[:10],
                "weight": spec.weight,
            },
            message=(
                f"greedy_recall={mean_clean:.2f} "
                f"(perturbed={mean_pert:.2f}, fragility={mean_fragility:.2f})"
            ),
        )


# -- helpers -----------------------------------------------------------


def _lcs_ratio(generated: str, target: str) -> float:
    """Approximate longest-common-subsequence ratio via difflib.

    Returns 0.0 for empty inputs and 1.0 for identical strings. difflib's
    ``ratio`` is a gestalt similarity rather than a true LCS, but it is close
    enough for our purposes and needs no external deps.
    """
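    # For example, "abcdef" vs "abcxef" share the blocks "abc" and "ef", giving a
    # ratio of 2 * 5 / 12, roughly 0.83; fully disjoint strings score 0.0.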
    if not generated or not target:
        return 0.0
    return difflib.SequenceMatcher(None, generated, target).ratio()


def _perturb(text: str, kind: str) -> str:
    """Apply a deterministic textual perturbation."""
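    # Examples: "The cat." -> "hTe cat." (typo), "the cat." (case_flip),
    # "The cat" (drop_punct).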
    if not text:
        return text
    if kind == "typo":
        # Swap the first two characters; trivial typo the model must reconstruct.
        if len(text) < 2:
            return text
        return text[1] + text[0] + text[2:]
    if kind == "case_flip":
        # Flip case of the first alpha char.
        for i, ch in enumerate(text):
            if ch.isalpha():
                flipped = ch.lower() if ch.isupper() else ch.upper()
                return text[:i] + flipped + text[i + 1 :]
        return text
    if kind == "drop_punct":
        return "".join(ch for ch in text if ch not in ".,;:!?-—")
    raise ValueError(f"unknown perturbation: {kind!r}")


def _fragility(clean: float, perturbed: float) -> float:
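    """Relative drop in recall under perturbation: max(0, (clean - perturbed) / clean).

    Defined as 0.0 when clean recall is 0. Example: clean=0.8, perturbed=0.2 -> 0.75.
    """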
    if clean <= 0.0:
        return 0.0
    return max(0.0, (clean - perturbed) / clean)