@@ -108,6 +108,31 @@ def mine_outliers( |
| 108 | 108 | continue |
| 109 | 109 | scored.append(OutlierCandidate(prompt=candidate, raw=raw, index=idx)) |
| 110 | 110 | |
| 111 | + # F04 (Audit 03) — reject pools smaller than ``2 * top_k`` distinct |
| 112 | + # scored prompts. Below that floor the "top" and "bottom" lists |
| 113 | + # end up overlapping (same prompt can appear in both) and the |
| 114 | + # output loses the outlier-vs-norm contrast the miner is supposed |
| 115 | + # to surface. The audit observed this on a 1-distinct-prompt pool |
| 116 | + # where the top and bottom lists both contained that single prompt. |
| 117 | + # |
| 118 | + # Apply AFTER scoring so unsupported probe_kinds (no prompts get |
| 119 | + # scored → scored=[]) return an empty OutlierResult cleanly |
| 120 | + # instead of raising. The empty-result contract is established by |
| 121 | + # pre-F04 tests and load-bearing for probe-kind-not-supported UX. |
| 122 | + if scored: |
| 123 | + distinct_count = len({c.prompt for c in scored}) |
| 124 | + required = 2 * top_k |
| 125 | + if distinct_count < required: |
| 126 | + from dlm_sway.core.errors import SwayError |
| 127 | + |
| 128 | + suggested = max(1, distinct_count // 2) |
| 129 | + raise SwayError( |
| 130 | + f"outlier miner pool has {distinct_count} distinct prompt(s), " |
| 131 | + f"below the 2·top_k={required} floor — ``top`` and ``bottom`` " |
| 132 | + f"lists would overlap. Pass --top-k {suggested} or supply " |
| 133 | + f"--from-corpus to widen the pool." |
| 134 | + ) |
| 135 | + |
| 111 | 136 | # Top = most positive raw; bottom = most negative raw. These |
| 112 | 137 | # differ for signed metrics (external_perplexity deltas can be |
| 113 | 138 | # negative; delta_kl is ≥ 0 but the bottom-K still finds the |