| 1 | """Outlier-prompt miner — F11 / S17. |
| 2 | |
| 3 | Companion to :mod:`paraphrase_miner`. Where the paraphrase miner sharpens |
| 4 | a single ``(prompt, gold)`` case, the outlier miner answers a broader |
| 5 | question: "given a *pool* of candidate prompts, which ones produce the |
| 6 | biggest (and smallest) signal under my chosen probe?" |
| 7 | |
| 8 | Use cases: |
| 9 | |
| 10 | - **delta_kl outliers.** "Which of these 100 doc-derived prompts shifts |
| 11 | the model the most?" → gives a user the five prompts they should |
| 12 | paste into a future gate. Supported today. |
| 13 | - **leakage outliers.** "Which chunks of training text are most at |
| 14 | risk of verbatim recital?" The shipped ``leakage`` probe is |
| 15 | section-based rather than prompt-based, so it's handled by a |
| 16 | future sprint; the outlier miner falls back to a clean error when |
| 17 | asked for it today. |
| 18 | - **paraphrase_invariance outliers.** Case-structured, same story |
| 19 | as ``leakage``. Paired with the paraphrase miner when you want a |
| 20 | wider net than per-case exploration; direct outlier mining on |
| 21 | cases is future work. |
| 22 | |
| 23 | The miner runs the chosen probe once per candidate and ranks by |
| 24 | ``raw``. Probes that reject a single-prompt spec (e.g. ``min_prompts`` |
| 25 | gates) surface as ``None`` and are simply skipped from the ranking. |
| 26 | |
| 27 | No probe registration: this module is an evaluation tool, not a probe. |
| 28 | Output is a ranked dataclass list the CLI converts to a YAML fragment. |
| 29 | """ |
| 30 | |
| 31 | from __future__ import annotations |
| 32 | |
| 33 | from dataclasses import dataclass |
| 34 | from typing import TYPE_CHECKING, Any |
| 35 | |
| 36 | from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus |
| 37 | |
| 38 | if TYPE_CHECKING: |
| 39 | from dlm_sway.core.scoring import DifferentialBackend |
| 40 | |
| 41 | |
| 42 | @dataclass(frozen=True, slots=True) |
| 43 | class OutlierCandidate: |
| 44 | """One ranked prompt from the outlier-mining pool.""" |
| 45 | |
| 46 | prompt: str |
| 47 | raw: float |
| 48 | """The probe's ``raw`` value on this single prompt. Signed — a |
| 49 | negative raw on ``external_perplexity`` (forgetting) ranks in the |
| 50 | bottom-K; a positive raw on ``delta_kl`` ranks in the top-K.""" |
| 51 | index: int |
| 52 | """Position in the original candidate pool. Useful for reproducing |
| 53 | the pool when the source was a deterministic corpus slice.""" |
| 54 | |
| 55 | |
| 56 | @dataclass(frozen=True, slots=True) |
| 57 | class OutlierResult: |
| 58 | """Top-K and bottom-K candidates for one miner run.""" |
| 59 | |
| 60 | probe_kind: str |
| 61 | top: list[OutlierCandidate] |
| 62 | bottom: list[OutlierCandidate] |
| 63 | |
| 64 | |
def mine_outliers(
    *,
    probe_kind: str,
    candidate_prompts: list[str],
    backend: DifferentialBackend,
    top_k: int = 10,
    seed: int = 0,
) -> OutlierResult:
    """Run ``probe_kind`` once per candidate prompt and rank by signed ``raw``.

    Parameters
    ----------
    probe_kind:
        The probe registry key. Only ``"delta_kl"`` is scored today;
        every other kind yields no scored candidates and therefore an
        empty result.
    candidate_prompts:
        The pool to rank. Duplicates are kept (so a user feeding
        chunked corpus text sees every chunk's rank position), but the
        pool-size floor below counts *distinct* prompt texts.
    backend:
        Differential backend — handed to the probes through
        :class:`RunContext`.
    top_k:
        Number of candidates to return in each of ``top`` and
        ``bottom``. Must be positive.
    seed:
        Threaded into :class:`RunContext` so probes that pick
        randomly (e.g. bootstrap) are deterministic.

    Returns
    -------
    OutlierResult
        ``top`` holds the ``top_k`` highest-``raw`` candidates (highest
        first); ``bottom`` holds the ``top_k`` lowest-``raw`` candidates
        (lowest first). No candidate appears in both lists. Both lists
        are empty when the pool is empty or no candidate could be
        scored.

    Raises
    ------
    ValueError
        If ``top_k`` is not positive.
    SwayError
        If at least one candidate was scored but fewer than
        ``2 * top_k`` distinct prompts were.
    """
    if top_k <= 0:
        raise ValueError(f"top_k must be positive; got {top_k}")
    if not candidate_prompts:
        return OutlierResult(probe_kind=probe_kind, top=[], bottom=[])

    from dlm_sway.probes.base import RunContext, build_probe

    ctx = RunContext(backend=backend, seed=seed)
    scored: list[OutlierCandidate] = []
    for idx, candidate in enumerate(candidate_prompts):
        raw = _score_single_prompt(probe_kind, candidate, ctx, build_probe)
        if raw is None:
            continue
        scored.append(OutlierCandidate(prompt=candidate, raw=raw, index=idx))

    # Unsupported probe kinds score nothing. Return an empty result
    # rather than raising: the empty-result contract is load-bearing
    # for the probe-kind-not-supported UX, so the floor check below is
    # only applied once something was actually scored.
    if not scored:
        return OutlierResult(probe_kind=probe_kind, top=[], bottom=[])

    # F04 (Audit 03) — reject pools smaller than ``2 * top_k`` distinct
    # scored prompts. Below that floor ``top`` and ``bottom`` cannot
    # both be filled without sharing prompts, and the output loses the
    # outlier-vs-norm contrast the miner is supposed to surface.
    distinct_count = len({c.prompt for c in scored})
    required = 2 * top_k
    if distinct_count < required:
        from dlm_sway.core.errors import SwayError

        # Only suggest a --top-k that would actually pass the floor;
        # with a single distinct prompt no positive top_k can.
        if distinct_count >= 2:
            remedy = (
                f"Pass --top-k {distinct_count // 2} or supply "
                f"--from-corpus to widen the pool."
            )
        else:
            remedy = "Supply --from-corpus to widen the pool."
        raise SwayError(
            f"outlier miner pool has {distinct_count} distinct prompt(s), "
            f"below the 2·top_k={required} floor — ``top`` and ``bottom`` "
            f"lists would overlap. {remedy}"
        )

    # Top = most positive raw; bottom = most negative raw. These
    # differ for signed metrics (external_perplexity deltas can be
    # negative; delta_kl is ≥ 0 but the bottom-K still finds the
    # least-moving prompts).
    top = sorted(scored, key=lambda c: c.raw, reverse=True)[:top_k]

    # Both sorts are stable, so candidates with tied ``raw`` keep their
    # pool order in *either* direction; taking the head of two
    # independent sorts would put the same tied candidates in both
    # lists. Draw ``bottom`` only from candidates not already in
    # ``top`` — ``index`` is unique per candidate, and the floor above
    # guarantees at least ``top_k`` candidates remain.
    taken = {c.index for c in top}
    bottom = sorted(
        (c for c in scored if c.index not in taken),
        key=lambda c: c.raw,
    )[:top_k]
    return OutlierResult(probe_kind=probe_kind, top=top, bottom=bottom)
| 143 | |
| 144 | |
def _score_single_prompt(
    probe_kind: str,
    prompt: str,
    ctx: Any,
    build_probe: Any,
) -> float | None:
    """Score one prompt under the given probe kind; return ``raw`` or None.

    Only ``delta_kl`` is supported today — its ``prompts: list[str]``
    spec shape slots cleanly into per-prompt scoring. ``leakage`` and
    ``paraphrase_invariance`` have different spec architectures
    (sections / cases) and are future work, so any other ``probe_kind``
    returns ``None`` without building a probe.

    Parameters
    ----------
    probe_kind:
        The probe registry key requested by the caller.
    prompt:
        The single candidate prompt to score.
    ctx:
        Run context passed unchanged to ``probe.run``.
    build_probe:
        Factory called with the raw spec dict; expected to return a
        ``(probe, spec)`` pair.

    Returns
    -------
    float | None
        The probe's ``raw`` value as a float, or ``None`` when the
        probe kind is unsupported, the probe run raises, the probe
        reports no ``raw``, or ``raw`` is NaN (which cannot be ranked).
    """
    import logging
    import math

    if probe_kind != "delta_kl":
        return None

    # Minimal one-prompt spec for the supported probe kind.
    raw_spec: dict[str, Any] = {
        "kind": "delta_kl",
        "prompts": [prompt],
        "name": f"outlier_probe_{probe_kind}",
    }

    probe, spec = build_probe(raw_spec)
    try:
        result = probe.run(spec, ctx)
    except Exception:
        # Single-prompt runs can fail probe-specific guards. Treat that
        # as "no signal" and skip, but leave a trace so a systematic
        # failure (every candidate skipped) can be diagnosed.
        logging.getLogger(__name__).debug(
            "outlier miner: %s probe failed on a single-prompt run; skipping candidate",
            probe_kind,
            exc_info=True,
        )
        return None
    if result.raw is None:
        return None

    value = float(result.raw)
    # NaN compares false against everything, so it would silently
    # corrupt the caller's sort-based ranking; report it as no signal.
    if math.isnan(value):
        return None
    return value
| 181 | |
| 182 | |
def corpus_prompts(corpus_name: str, *, chunk_chars: int = 256, max_chunks: int = 64) -> list[str]:
    """Build a candidate-prompt pool from a packaged corpus.

    Loads the corpus identified by ``corpus_name`` with
    :func:`dlm_sway.probes._external_corpus.load_corpus` and hands it to
    :func:`dlm_sway.probes._external_corpus.chunk_corpus`, forwarding
    ``chunk_chars`` and ``max_chunks`` unchanged. Exists so the CLI's
    ``--from-corpus`` flag has a one-liner.
    """
    corpus = load_corpus(corpus_name)
    return chunk_corpus(corpus, chunk_chars=chunk_chars, max_chunks=max_chunks)
| 190 | |
| 191 | |
# Public API of this module; ``_score_single_prompt`` is intentionally
# left out because it is an internal helper of ``mine_outliers``.
__all__ = ["OutlierCandidate", "OutlierResult", "corpus_prompts", "mine_outliers"]