Python · 7738 bytes Raw Blame History
1 """Outlier-prompt miner — F11 / S17.
2
3 Companion to :mod:`paraphrase_miner`. Where the paraphrase miner sharpens
4 a single ``(prompt, gold)`` case, the outlier miner answers a broader
5 question: "given a *pool* of candidate prompts, which ones produce the
6 biggest (and smallest) signal under my chosen probe?"
7
8 Use cases:
9
10 - **delta_kl outliers.** "Which of these 100 doc-derived prompts shifts
11 the model the most?" → gives a user the five prompts they should
12 paste into a future gate. Supported today.
13 - **leakage outliers.** "Which chunks of training text are most at
14 risk of verbatim recital?" The shipped ``leakage`` probe is
15 section-based rather than prompt-based, so it's handled by a
  future sprint; the outlier miner scores nothing and returns an
  empty result when asked for it today.
18 - **paraphrase_invariance outliers.** Case-structured, same story
19 as ``leakage``. Paired with the paraphrase miner when you want a
20 wider net than per-case exploration; direct outlier mining on
21 cases is future work.
22
23 The miner runs the chosen probe once per candidate and ranks by
24 ``raw``. Probes that reject a single-prompt spec (e.g. ``min_prompts``
25 gates) surface as ``None`` and are simply skipped from the ranking.
26
27 No probe registration: this module is an evaluation tool, not a probe.
28 Output is a ranked dataclass list the CLI converts to a YAML fragment.
29 """
30
31 from __future__ import annotations
32
33 from dataclasses import dataclass
34 from typing import TYPE_CHECKING, Any
35
36 from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus
37
38 if TYPE_CHECKING:
39 from dlm_sway.core.scoring import DifferentialBackend
40
41
42 @dataclass(frozen=True, slots=True)
43 class OutlierCandidate:
44 """One ranked prompt from the outlier-mining pool."""
45
46 prompt: str
47 raw: float
48 """The probe's ``raw`` value on this single prompt. Signed — a
49 negative raw on ``external_perplexity`` (forgetting) ranks in the
50 bottom-K; a positive raw on ``delta_kl`` ranks in the top-K."""
51 index: int
52 """Position in the original candidate pool. Useful for reproducing
53 the pool when the source was a deterministic corpus slice."""
54
55
56 @dataclass(frozen=True, slots=True)
57 class OutlierResult:
58 """Top-K and bottom-K candidates for one miner run."""
59
60 probe_kind: str
61 top: list[OutlierCandidate]
62 bottom: list[OutlierCandidate]
63
64
def mine_outliers(
    *,
    probe_kind: str,
    candidate_prompts: list[str],
    backend: DifferentialBackend,
    top_k: int = 10,
    seed: int = 0,
) -> OutlierResult:
    """Run ``probe_kind`` once per candidate prompt and rank by signed ``raw``.

    ``top`` holds the candidates with the most positive ``raw`` (descending)
    and ``bottom`` those with the most negative / smallest ``raw``
    (ascending). ``bottom`` is drawn only from candidates not already in
    ``top``, so one candidate never appears in both lists even when
    ``raw`` values tie.

    Parameters
    ----------
    probe_kind:
        The probe registry key. Only ``"delta_kl"`` is scored today;
        any other kind scores no prompt and yields an empty result.
    candidate_prompts:
        The pool to rank. Duplicates are kept (so a user feeding
        chunked corpus text sees every chunk's rank position).
    backend:
        Differential backend, handed to the probes through
        :class:`RunContext`.
    top_k:
        Return the top-``k`` and bottom-``k`` candidates.
    seed:
        Threaded into :class:`RunContext`.

    Returns
    -------
    OutlierResult
        Ranked ``top`` / ``bottom`` lists; both are empty when the pool
        is empty or no candidate could be scored.

    Raises
    ------
    ValueError
        If ``top_k`` is not positive.
    SwayError
        If at least one prompt was scored but fewer than ``2 * top_k``
        distinct prompts were.
    """
    if top_k <= 0:
        raise ValueError(f"top_k must be positive; got {top_k}")
    if not candidate_prompts:
        return OutlierResult(probe_kind=probe_kind, top=[], bottom=[])

    # Imported lazily, as in the original implementation.
    from dlm_sway.probes.base import RunContext, build_probe

    ctx = RunContext(backend=backend, seed=seed)
    scored: list[OutlierCandidate] = []
    for idx, candidate in enumerate(candidate_prompts):
        raw = _score_single_prompt(probe_kind, candidate, ctx, build_probe)
        if raw is None:
            # Unsupported probe kind or a probe that produced no signal.
            continue
        scored.append(OutlierCandidate(prompt=candidate, raw=raw, index=idx))

    # F04 (Audit 03) — reject pools smaller than ``2 * top_k`` distinct
    # scored prompts. Below that floor the "top" and "bottom" lists
    # end up overlapping (same prompt can appear in both) and the
    # output loses the outlier-vs-norm contrast the miner is supposed
    # to surface. The audit observed this on a 1-distinct-prompt pool
    # where the top and bottom lists both contained that single prompt.
    #
    # Apply AFTER scoring so unsupported probe_kinds (no prompts get
    # scored → scored=[]) return an empty OutlierResult cleanly
    # instead of raising. The empty-result contract is established by
    # pre-F04 tests and load-bearing for probe-kind-not-supported UX.
    if scored:
        distinct_count = len({c.prompt for c in scored})
        required = 2 * top_k
        if distinct_count < required:
            from dlm_sway.core.errors import SwayError

            # NOTE: with a single distinct prompt no ``top_k`` satisfies
            # the floor, so only the ``--from-corpus`` half of the hint
            # helps in that case.
            suggested = max(1, distinct_count // 2)
            raise SwayError(
                f"outlier miner pool has {distinct_count} distinct prompt(s), "
                f"below the 2·top_k={required} floor — ``top`` and ``bottom`` "
                f"lists would overlap. Pass --top-k {suggested} or supply "
                f"--from-corpus to widen the pool."
            )

    # Top = most positive raw; bottom = most negative raw. These
    # differ for signed metrics; for a non-negative metric the bottom-K
    # still finds the least-moving prompts.
    #
    # Both sorts are stable, so two independent sorts over ``scored``
    # would pick the *same* leading candidates for top and bottom when
    # raws tie (e.g. every prompt scoring identically). Taking the
    # bottom from what remains after the top keeps the lists disjoint;
    # the guard above guarantees at least ``top_k`` candidates remain.
    ranked = sorted(scored, key=lambda c: c.raw, reverse=True)
    top = ranked[:top_k]
    bottom = sorted(ranked[top_k:], key=lambda c: c.raw)[:top_k]
    return OutlierResult(probe_kind=probe_kind, top=top, bottom=bottom)
143
144
def _score_single_prompt(
    probe_kind: str,
    prompt: str,
    ctx: Any,
    build_probe: Any,
) -> float | None:
    """Score one prompt under the given probe kind, return ``raw`` or None.

    Parameters
    ----------
    probe_kind:
        Probe registry key. Only ``"delta_kl"`` is handled; any other
        kind returns ``None`` without building a probe.
    prompt:
        The single prompt placed in the probe's ``prompts`` list.
    ctx:
        Run context passed through to ``probe.run``.
    build_probe:
        Callable that takes the raw spec dict and returns a
        ``(probe, spec)`` pair.

    Returns
    -------
    float | None
        The probe's ``raw`` as a float, or ``None`` when the kind is
        unsupported, the probe raised, or ``raw`` is ``None`` / NaN.
    """
    import logging
    import math

    # ``delta_kl`` is the primary target today — its
    # ``prompts: list[str]`` spec shape slots cleanly into per-prompt
    # scoring. ``leakage`` + ``paraphrase_invariance`` have different
    # spec architectures (sections / cases) and are future work.
    if probe_kind != "delta_kl":
        return None

    raw_spec: dict[str, Any] = {
        "kind": "delta_kl",
        "prompts": [prompt],
        "name": f"outlier_probe_{probe_kind}",
    }

    probe, spec = build_probe(raw_spec)
    try:
        result = probe.run(spec, ctx)
    except Exception:
        # Single-prompt runs can fail probe-specific guards. Skipping is
        # deliberate best-effort, but leave a debug trace so a pool that
        # silently scores nothing can still be diagnosed.
        logging.getLogger(__name__).debug(
            "outlier miner: probe %r failed on a single-prompt spec; skipping",
            probe_kind,
            exc_info=True,
        )
        return None
    if result.raw is None:
        return None

    value = float(result.raw)
    # NaN compares false against everything, which would make the
    # caller's sort order inconsistent; treat it as "no signal".
    if math.isnan(value):
        return None
    return value
181
182
def corpus_prompts(corpus_name: str, *, chunk_chars: int = 256, max_chunks: int = 64) -> list[str]:
    """Build a candidate-prompt pool from a packaged corpus.

    Loads ``corpus_name`` with ``load_corpus`` and forwards the result,
    together with ``chunk_chars`` and ``max_chunks``, to
    :func:`dlm_sway.probes._external_corpus.chunk_corpus`. Exists so
    the CLI's ``--from-corpus`` flag has a one-liner.
    """
    corpus = load_corpus(corpus_name)
    chunks = chunk_corpus(corpus, chunk_chars=chunk_chars, max_chunks=max_chunks)
    return chunks
190
191
# Public API: the names exported by a star-import of this module.
__all__ = [
    "OutlierCandidate",
    "OutlierResult",
    "corpus_prompts",
    "mine_outliers",
]