sway Public

Watch 0 Fork 0 Star 0

Python · 7738 bytes Raw Blame History

  
        1
        """Outlier-prompt miner — F11 / S17.
      
        2
        
        3
        Companion to :mod:`paraphrase_miner`. Where the paraphrase miner sharpens
      
        4
        a single ``(prompt, gold)`` case, the outlier miner answers a broader
      
        5
        question: "given a *pool* of candidate prompts, which ones produce the
      
        6
        biggest (and smallest) signal under my chosen probe?"
      
        7
        
        8
        Use cases:
      
        9
        
        10
        - **delta_kl outliers.** "Which of these 100 doc-derived prompts shifts
      
        11
          the model the most?" → gives a user the five prompts they should
      
        12
          paste into a future gate. Supported today.
      
        13
        - **leakage outliers.** "Which chunks of training text are most at
      
        14
          risk of verbatim recital?" The shipped ``leakage`` probe is
      
        15
          section-based rather than prompt-based, so it's handled by a
      
        16
          future sprint; the outlier miner returns an empty result (rather
      
        17
          than raising) when asked for it today.
      
        18
        - **paraphrase_invariance outliers.** Case-structured, same story
      
        19
          as ``leakage``. Paired with the paraphrase miner when you want a
      
        20
          wider net than per-case exploration; direct outlier mining on
      
        21
          cases is future work.
      
        22
        
        23
        The miner runs the chosen probe once per candidate and ranks by
      
        24
        ``raw``. Probes that reject a single-prompt spec (e.g. ``min_prompts``
      
        25
        gates) surface as ``None`` and are simply skipped from the ranking.
      
        26
        
        27
        No probe registration: this module is an evaluation tool, not a probe.
      
        28
        Output is a ranked dataclass list the CLI converts to a YAML fragment.
      
        29
        """
      
        30
        
        31
        from __future__ import annotations
      
        32
        
        33
        from dataclasses import dataclass
      
        34
        from typing import TYPE_CHECKING, Any
      
        35
        
        36
        from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus
      
        37
        
        38
        if TYPE_CHECKING:
      
        39
            from dlm_sway.core.scoring import DifferentialBackend
      
        40
        
        41
        
        42
        @dataclass(frozen=True, slots=True)
      
        43
        class OutlierCandidate:
      
        44
            """One ranked prompt from the outlier-mining pool."""
      
        45
        
        46
            prompt: str
      
        47
            raw: float
      
        48
            """The probe's ``raw`` value on this single prompt. Signed — a
      
        49
            negative raw on ``external_perplexity`` (forgetting) ranks in the
      
        50
            bottom-K; a positive raw on ``delta_kl`` ranks in the top-K."""
      
        51
            index: int
      
        52
            """Position in the original candidate pool. Useful for reproducing
      
        53
            the pool when the source was a deterministic corpus slice."""
      
        54
        
        55
        
        56
        @dataclass(frozen=True, slots=True)
      
        57
        class OutlierResult:
      
        58
            """Top-K and bottom-K candidates for one miner run."""
      
        59
        
        60
            probe_kind: str
      
        61
            top: list[OutlierCandidate]
      
        62
            bottom: list[OutlierCandidate]
      
        63
        
        64
        
        65
        def mine_outliers(
            *,
            probe_kind: str,
            candidate_prompts: list[str],
            backend: DifferentialBackend,
            top_k: int = 10,
            seed: int = 0,
        ) -> OutlierResult:
            """Run ``probe_kind`` once per candidate prompt and rank by signed ``raw``.

            Each candidate is scored in isolation via
            :func:`_score_single_prompt`; candidates that yield no score are
            dropped before ranking.

            Parameters
            ----------
            probe_kind:
                The probe registry key. Only ``"delta_kl"`` is scored today;
                any other kind scores nothing and yields an empty result.
            candidate_prompts:
                The pool to rank. Duplicates are kept (so a user feeding
                chunked corpus text sees every chunk's rank position).
            backend:
                Differential backend — probes consume it through
                :class:`RunContext`.
            top_k:
                Number of candidates returned in each of ``top`` and
                ``bottom``. Must be positive.
            seed:
                Threaded into :class:`RunContext` so probes that pick
                randomly (e.g. bootstrap) are deterministic.

            Returns
            -------
            OutlierResult
                ``top`` holds the ``top_k`` highest-``raw`` candidates in
                descending order; ``bottom`` holds the ``top_k`` lowest-``raw``
                candidates in ascending order. No candidate appears in both
                lists. Both lists are empty when the pool is empty or when no
                candidate could be scored.

            Raises
            ------
            ValueError
                If ``top_k`` is not positive.
            SwayError
                If at least one candidate was scored but fewer than
                ``2 * top_k`` distinct prompts were.
            """
            if top_k <= 0:
                raise ValueError(f"top_k must be positive; got {top_k}")
            if not candidate_prompts:
                return OutlierResult(probe_kind=probe_kind, top=[], bottom=[])

            from dlm_sway.probes.base import RunContext, build_probe

            ctx = RunContext(backend=backend, seed=seed)
            scored: list[OutlierCandidate] = []
            for idx, candidate in enumerate(candidate_prompts):
                raw = _score_single_prompt(probe_kind, candidate, ctx, build_probe)
                if raw is None:
                    continue
                scored.append(OutlierCandidate(prompt=candidate, raw=raw, index=idx))

            # F04 (Audit 03) — reject pools smaller than ``2 * top_k`` distinct
            # scored prompts. Below that floor the "top" and "bottom" lists
            # end up overlapping and the output loses the outlier-vs-norm
            # contrast the miner is supposed to surface.
            #
            # Apply AFTER scoring so unsupported probe_kinds (no prompts get
            # scored → scored=[]) return an empty OutlierResult cleanly
            # instead of raising. The empty-result contract is established by
            # pre-F04 tests and load-bearing for probe-kind-not-supported UX.
            if scored:
                distinct_count = len({c.prompt for c in scored})
                required = 2 * top_k
                if distinct_count < required:
                    from dlm_sway.core.errors import SwayError

                    suggested = max(1, distinct_count // 2)
                    raise SwayError(
                        f"outlier miner pool has {distinct_count} distinct prompt(s), "
                        f"below the 2·top_k={required} floor — ``top`` and ``bottom`` "
                        f"lists would overlap. Pass --top-k {suggested} or supply "
                        f"--from-corpus to widen the pool."
                    )

            # Top = most positive raw; bottom = most negative raw. These
            # differ for signed metrics (external_perplexity deltas can be
            # negative; delta_kl is ≥ 0 but the bottom-K still finds the
            # least-moving prompts).
            top = sorted(scored, key=lambda c: c.raw, reverse=True)[:top_k]

            # Both sorts are stable, so tied ``raw`` values would otherwise put
            # the same leading candidates in both lists (e.g. every prompt
            # scoring 0.0). Exclude anything already in ``top``; ``index`` is
            # unique per candidate, and the floor above guarantees at least
            # ``top_k`` candidates remain.
            taken = {c.index for c in top}
            ascending = sorted(scored, key=lambda c: c.raw)
            bottom = [c for c in ascending if c.index not in taken][:top_k]
            return OutlierResult(probe_kind=probe_kind, top=top, bottom=bottom)
      
        143
        
        144
        
        145
        def _score_single_prompt(
            probe_kind: str,
            prompt: str,
            ctx: Any,
            build_probe: Any,
        ) -> float | None:
            """Score one prompt under the given probe kind, return ``raw`` or None.

            Only ``delta_kl`` is supported today: its ``prompts: list[str]``
            spec shape slots cleanly into per-prompt scoring. ``leakage`` and
            ``paraphrase_invariance`` have different spec architectures
            (sections / cases) and are future work, so any other kind returns
            ``None`` without building a probe.

            Parameters
            ----------
            probe_kind:
                The probe kind requested by the caller.
            prompt:
                The single prompt to score.
            ctx:
                Run context passed straight through to ``probe.run``.
            build_probe:
                Callable that turns a raw spec dict into a ``(probe, spec)``
                pair.

            Returns
            -------
            float | None
                The probe's ``raw`` value as a float, or ``None`` when the
                kind is unsupported, the probe run raises, the probe reports
                no ``raw``, or ``raw`` is NaN.
            """
            import logging
            import math

            if probe_kind != "delta_kl":
                return None

            raw_spec: dict[str, Any] = {
                "kind": "delta_kl",
                "prompts": [prompt],
                "name": f"outlier_probe_{probe_kind}",
            }

            probe, spec = build_probe(raw_spec)
            try:
                result = probe.run(spec, ctx)
            except Exception:
                # Single-prompt runs can fail probe-specific guards. Treat as
                # "no signal" and skip, but leave a debug trace so an
                # unexpectedly empty ranking can be diagnosed.
                logging.getLogger(__name__).debug(
                    "outlier miner: probe %r failed on a single prompt; skipping",
                    probe_kind,
                    exc_info=True,
                )
                return None
            if result.raw is None:
                return None

            raw = float(result.raw)
            # NaN compares false against everything, so letting it through
            # would make the caller's sort order undefined.
            if math.isnan(raw):
                return None
            return raw
      
        181
        
        182
        
        183
        def corpus_prompts(corpus_name: str, *, chunk_chars: int = 256, max_chunks: int = 64) -> list[str]:
            """Build a candidate-prompt pool from a packaged corpus.

            Loads ``corpus_name`` with
            :func:`dlm_sway.probes._external_corpus.load_corpus` and hands the
            result to :func:`dlm_sway.probes._external_corpus.chunk_corpus`,
            forwarding ``chunk_chars`` and ``max_chunks`` unchanged. Exists so
            the CLI's ``--from-corpus`` flag has a one-liner.
            """
            corpus = load_corpus(corpus_name)
            return chunk_corpus(corpus, chunk_chars=chunk_chars, max_chunks=max_chunks)
      
        190
        
        191
        
        192
        # Names exported as this module's public API.
        __all__ = ["OutlierCandidate", "OutlierResult", "corpus_prompts", "mine_outliers"]

1	"""Outlier-prompt miner — F11 / S17.
2
3	Companion to :mod:`paraphrase_miner`. Where the paraphrase miner sharpens
4	a single ``(prompt, gold)`` case, the outlier miner answers a broader
5	question: "given a pool of candidate prompts, which ones produce the
6	biggest (and smallest) signal under my chosen probe?"
7
8	Use cases:
9
10	- delta_kl outliers. "Which of these 100 doc-derived prompts shifts
11	the model the most?" → gives a user the five prompts they should
12	paste into a future gate. Supported today.
13	- leakage outliers. "Which chunks of training text are most at
14	risk of verbatim recital?" The shipped ``leakage`` probe is
15	section-based rather than prompt-based, so it's handled by a
16	future sprint; the outlier miner returns an empty result (rather
17	than raising) when asked for it today.
18	- paraphrase_invariance outliers. Case-structured, same story
19	as ``leakage``. Paired with the paraphrase miner when you want a
20	wider net than per-case exploration; direct outlier mining on
21	cases is future work.
22
23	The miner runs the chosen probe once per candidate and ranks by
24	``raw``. Probes that reject a single-prompt spec (e.g. ``min_prompts``
25	gates) surface as ``None`` and are simply skipped from the ranking.
26
27	No probe registration: this module is an evaluation tool, not a probe.
28	Output is a ranked dataclass list the CLI converts to a YAML fragment.
29	"""
30
31	from __future__ import annotations
32
33	from dataclasses import dataclass
34	from typing import TYPE_CHECKING, Any
35
36	from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus
37
38	if TYPE_CHECKING:
39	from dlm_sway.core.scoring import DifferentialBackend
40
41
42	@dataclass(frozen=True, slots=True)
43	class OutlierCandidate:
44	"""One ranked prompt from the outlier-mining pool."""
45
46	prompt: str
47	raw: float
48	"""The probe's ``raw`` value on this single prompt. Signed — a
49	negative raw on ``external_perplexity`` (forgetting) ranks in the
50	bottom-K; a positive raw on ``delta_kl`` ranks in the top-K."""
51	index: int
52	"""Position in the original candidate pool. Useful for reproducing
53	the pool when the source was a deterministic corpus slice."""
54
55
56	@dataclass(frozen=True, slots=True)
57	class OutlierResult:
58	"""Top-K and bottom-K candidates for one miner run."""
59
60	probe_kind: str
61	top: list[OutlierCandidate]
62	bottom: list[OutlierCandidate]
63
64
def mine_outliers(
    *,
    probe_kind: str,
    candidate_prompts: list[str],
    backend: DifferentialBackend,
    top_k: int = 10,
    seed: int = 0,
) -> OutlierResult:
    """Run ``probe_kind`` once per candidate prompt and rank by signed ``raw``.

    Each candidate is scored in isolation via
    :func:`_score_single_prompt`; candidates that yield no score are
    dropped before ranking.

    Parameters
    ----------
    probe_kind:
        The probe registry key. Only ``"delta_kl"`` is scored today;
        any other kind scores nothing and yields an empty result.
    candidate_prompts:
        The pool to rank. Duplicates are kept (so a user feeding
        chunked corpus text sees every chunk's rank position).
    backend:
        Differential backend — probes consume it through
        :class:`RunContext`.
    top_k:
        Number of candidates returned in each of ``top`` and
        ``bottom``. Must be positive.
    seed:
        Threaded into :class:`RunContext` so probes that pick
        randomly (e.g. bootstrap) are deterministic.

    Returns
    -------
    OutlierResult
        ``top`` holds the ``top_k`` highest-``raw`` candidates in
        descending order; ``bottom`` holds the ``top_k`` lowest-``raw``
        candidates in ascending order. No candidate appears in both
        lists. Both lists are empty when the pool is empty or when no
        candidate could be scored.

    Raises
    ------
    ValueError
        If ``top_k`` is not positive.
    SwayError
        If at least one candidate was scored but fewer than
        ``2 * top_k`` distinct prompts were.
    """
    if top_k <= 0:
        raise ValueError(f"top_k must be positive; got {top_k}")
    if not candidate_prompts:
        return OutlierResult(probe_kind=probe_kind, top=[], bottom=[])

    from dlm_sway.probes.base import RunContext, build_probe

    ctx = RunContext(backend=backend, seed=seed)
    scored: list[OutlierCandidate] = []
    for idx, candidate in enumerate(candidate_prompts):
        raw = _score_single_prompt(probe_kind, candidate, ctx, build_probe)
        if raw is None:
            continue
        scored.append(OutlierCandidate(prompt=candidate, raw=raw, index=idx))

    # F04 (Audit 03) — reject pools smaller than ``2 * top_k`` distinct
    # scored prompts. Below that floor the "top" and "bottom" lists
    # end up overlapping and the output loses the outlier-vs-norm
    # contrast the miner is supposed to surface.
    #
    # Apply AFTER scoring so unsupported probe_kinds (no prompts get
    # scored → scored=[]) return an empty OutlierResult cleanly
    # instead of raising. The empty-result contract is established by
    # pre-F04 tests and load-bearing for probe-kind-not-supported UX.
    if scored:
        distinct_count = len({c.prompt for c in scored})
        required = 2 * top_k
        if distinct_count < required:
            from dlm_sway.core.errors import SwayError

            suggested = max(1, distinct_count // 2)
            raise SwayError(
                f"outlier miner pool has {distinct_count} distinct prompt(s), "
                f"below the 2·top_k={required} floor — ``top`` and ``bottom`` "
                f"lists would overlap. Pass --top-k {suggested} or supply "
                f"--from-corpus to widen the pool."
            )

    # Top = most positive raw; bottom = most negative raw. These
    # differ for signed metrics (external_perplexity deltas can be
    # negative; delta_kl is ≥ 0 but the bottom-K still finds the
    # least-moving prompts).
    top = sorted(scored, key=lambda c: c.raw, reverse=True)[:top_k]

    # Both sorts are stable, so tied ``raw`` values would otherwise put
    # the same leading candidates in both lists (e.g. every prompt
    # scoring 0.0). Exclude anything already in ``top``; ``index`` is
    # unique per candidate, and the floor above guarantees at least
    # ``top_k`` candidates remain.
    taken = {c.index for c in top}
    ascending = sorted(scored, key=lambda c: c.raw)
    bottom = [c for c in ascending if c.index not in taken][:top_k]
    return OutlierResult(probe_kind=probe_kind, top=top, bottom=bottom)
143
144
def _score_single_prompt(
    probe_kind: str,
    prompt: str,
    ctx: Any,
    build_probe: Any,
) -> float | None:
    """Score one prompt under the given probe kind, return ``raw`` or None.

    Only ``delta_kl`` is supported today: its ``prompts: list[str]``
    spec shape slots cleanly into per-prompt scoring. ``leakage`` and
    ``paraphrase_invariance`` have different spec architectures
    (sections / cases) and are future work, so any other kind returns
    ``None`` without building a probe.

    Parameters
    ----------
    probe_kind:
        The probe kind requested by the caller.
    prompt:
        The single prompt to score.
    ctx:
        Run context passed straight through to ``probe.run``.
    build_probe:
        Callable that turns a raw spec dict into a ``(probe, spec)``
        pair.

    Returns
    -------
    float | None
        The probe's ``raw`` value as a float, or ``None`` when the
        kind is unsupported, the probe run raises, the probe reports
        no ``raw``, or ``raw`` is NaN.
    """
    import logging
    import math

    if probe_kind != "delta_kl":
        return None

    raw_spec: dict[str, Any] = {
        "kind": "delta_kl",
        "prompts": [prompt],
        "name": f"outlier_probe_{probe_kind}",
    }

    probe, spec = build_probe(raw_spec)
    try:
        result = probe.run(spec, ctx)
    except Exception:
        # Single-prompt runs can fail probe-specific guards. Treat as
        # "no signal" and skip, but leave a debug trace so an
        # unexpectedly empty ranking can be diagnosed.
        logging.getLogger(__name__).debug(
            "outlier miner: probe %r failed on a single prompt; skipping",
            probe_kind,
            exc_info=True,
        )
        return None
    if result.raw is None:
        return None

    raw = float(result.raw)
    # NaN compares false against everything, so letting it through
    # would make the caller's sort order undefined.
    if math.isnan(raw):
        return None
    return raw
181
182
def corpus_prompts(corpus_name: str, *, chunk_chars: int = 256, max_chunks: int = 64) -> list[str]:
    """Build a candidate-prompt pool from a packaged corpus.

    Loads ``corpus_name`` with
    :func:`dlm_sway.probes._external_corpus.load_corpus` and hands the
    result to :func:`dlm_sway.probes._external_corpus.chunk_corpus`,
    forwarding ``chunk_chars`` and ``max_chunks`` unchanged. Exists so
    the CLI's ``--from-corpus`` flag has a one-liner.
    """
    corpus = load_corpus(corpus_name)
    return chunk_corpus(corpus, chunk_chars=chunk_chars, max_chunks=max_chunks)
190
191
# Names exported as this module's public API.
__all__ = ["OutlierCandidate", "OutlierResult", "corpus_prompts", "mine_outliers"]