tenseleyflow/documentlanguagemodel / 72bb003


sway: convert in-tree subproject to git submodule pointing at tenseleyFlow/sway

Authored by espadonne
SHA: 72bb0030b72321dea3c66a2e6d7ce26e52c74550
Parents: 9da4019
Tree: e628ba5

83 changed files

Status  File  +  -
M .gitmodules 3 0
A sway 1 0
D sway/CHANGELOG.md 0 41
D sway/LICENSE 0 21
D sway/README.md 0 101
D sway/pyproject.toml 0 210
D sway/src/dlm_sway/__init__.py 0 42
D sway/src/dlm_sway/backends/__init__.py 0 118
D sway/src/dlm_sway/backends/dummy.py 0 257
D sway/src/dlm_sway/backends/hf.py 0 375
D sway/src/dlm_sway/backends/mlx.py 0 205
D sway/src/dlm_sway/cli/__init__.py 0 1
D sway/src/dlm_sway/cli/app.py 0 59
D sway/src/dlm_sway/cli/commands.py 0 396
D sway/src/dlm_sway/core/__init__.py 0 1
D sway/src/dlm_sway/core/determinism.py 0 97
D sway/src/dlm_sway/core/errors.py 0 65
D sway/src/dlm_sway/core/model.py 0 112
D sway/src/dlm_sway/core/result.py 0 139
D sway/src/dlm_sway/core/scoring.py 0 203
D sway/src/dlm_sway/core/sections.py 0 76
D sway/src/dlm_sway/integrations/__init__.py 0 1
D sway/src/dlm_sway/integrations/dlm/__init__.py 0 1
D sway/src/dlm_sway/integrations/dlm/autogen.py 0 191
D sway/src/dlm_sway/integrations/dlm/resolver.py 0 243
D sway/src/dlm_sway/probes/__init__.py 0 27
D sway/src/dlm_sway/probes/_calibration_pack.py 0 63
D sway/src/dlm_sway/probes/_divergence.py 0 102
D sway/src/dlm_sway/probes/adapter_ablation.py 0 193
D sway/src/dlm_sway/probes/adapter_revert.py 0 178
D sway/src/dlm_sway/probes/base.py 0 131
D sway/src/dlm_sway/probes/calibration_drift.py 0 135
D sway/src/dlm_sway/probes/delta_kl.py 0 121
D sway/src/dlm_sway/probes/leakage.py 0 194
D sway/src/dlm_sway/probes/null_adapter.py 0 144
D sway/src/dlm_sway/probes/paraphrase_invariance.py 0 148
D sway/src/dlm_sway/probes/preference_flip.py 0 140
D sway/src/dlm_sway/probes/prompt_collapse.py 0 159
D sway/src/dlm_sway/probes/section_internalization.py 0 189
D sway/src/dlm_sway/probes/style_fingerprint.py 0 179
D sway/src/dlm_sway/py.typed 0 0
D sway/src/dlm_sway/suite/__init__.py 0 1
D sway/src/dlm_sway/suite/loader.py 0 48
D sway/src/dlm_sway/suite/report.py 0 249
D sway/src/dlm_sway/suite/runner.py 0 136
D sway/src/dlm_sway/suite/score.py 0 106
D sway/src/dlm_sway/suite/spec.py 0 72
D sway/src/dlm_sway/visualize.py 0 137
D sway/tests/__init__.py 0 0
D sway/tests/conftest.py 0 29
D sway/tests/fixtures/__init__.py 0 0
D sway/tests/fixtures/tiny_model.py 0 53
D sway/tests/integration/__init__.py 0 0
D sway/tests/integration/conftest.py 0 10
D sway/tests/integration/test_hf_adapter_toggle.py 0 113
D sway/tests/unit/__init__.py 0 0
D sway/tests/unit/test_backend_dummy.py 0 102
D sway/tests/unit/test_backend_registry.py 0 133
D sway/tests/unit/test_cli.py 0 92
D sway/tests/unit/test_determinism.py 0 47
D sway/tests/unit/test_divergence.py 0 73
D sway/tests/unit/test_dlm_bridge.py 0 252
D sway/tests/unit/test_errors.py 0 55
D sway/tests/unit/test_model.py 0 78
D sway/tests/unit/test_null_calibration.py 0 123
D sway/tests/unit/test_probe_adapter_ablation.py 0 135
D sway/tests/unit/test_probe_adapter_revert.py 0 170
D sway/tests/unit/test_probe_base.py 0 69
D sway/tests/unit/test_probe_calibration_drift.py 0 57
D sway/tests/unit/test_probe_delta_kl.py 0 124
D sway/tests/unit/test_probe_leakage.py 0 109
D sway/tests/unit/test_probe_paraphrase_invariance.py 0 91
D sway/tests/unit/test_probe_preference_flip.py 0 161
D sway/tests/unit/test_probe_prompt_collapse.py 0 137
D sway/tests/unit/test_probe_section_internalization.py 0 94
D sway/tests/unit/test_probe_style_fingerprint.py 0 115
D sway/tests/unit/test_result.py 0 82
D sway/tests/unit/test_scoring.py 0 84
D sway/tests/unit/test_sections.py 0 35
D sway/tests/unit/test_suite_runner.py 0 134
D sway/tests/unit/test_suite_score_report.py 0 217
D sway/tests/unit/test_suite_spec.py 0 85
D sway/tests/unit/test_visualize.py 0 202
.gitmodules (modified)
@@ -5,3 +5,6 @@
 	# `scripts/bump-llama-cpp.sh build` writes under vendor/llama.cpp/build/
 	# which the submodule's own .gitignore covers.
 	ignore = untracked
+[submodule "sway"]
+	path = sway
+	url = https://github.com/tenseleyFlow/sway.git
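The commit doesn't record the commands that produced it, but a conversion of this shape typically looks like the following sketch (assuming the in-tree `sway/` tree already matches `tenseleyFlow/sway` at the pinned commit):

```bash
# Drop the in-tree copy from the index and working tree, then re-add
# the same path as a submodule pinned to a single upstream commit.
git rm -r sway
git submodule add https://github.com/tenseleyFlow/sway.git sway
git -C sway checkout 98ad9417c94e1bbeb97cf5e553878d7953513f69
git add .gitmodules sway
git commit -m "sway: convert in-tree subproject to git submodule pointing at tenseleyFlow/sway"
```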
sway (added)
@@ -0,0 +1,1 @@
+Subproject commit 98ad9417c94e1bbeb97cf5e553878d7953513f69
sway/CHANGELOG.md (deleted)
@@ -1,41 +0,0 @@
-# Changelog
-
-## 0.1.0.dev0 — 2026-04-20
-
-Initial pre-alpha. Full 11-primitive battery shipped.
-
-### Primitives
-
-- **Adherence**
-  - `delta_kl` — mean JS/KL divergence between base and fine-tuned next-token distributions
-  - `adapter_revert` — reversion under adversarial paraphrase (needs `dlm-sway[semsim]`)
-  - `prompt_collapse` — exponential-decay fit of divergence over context length
-- **Attribution**
-  - `section_internalization` *(flagship)* — per-section `effective_sis` with leak check
-  - `paraphrase_invariance` — memorization vs. generalization, intent-aware
-  - `preference_flip` — DPO/ORPO chosen/rejected margin inversion
-- **Calibration**
-  - `style_fingerprint` — 6-dim numpy-only stylistic shift vs. document
-  - `calibration_drift` — general-knowledge regression on a packaged 30-item pack
-  - `leakage` — greedy LCS recall + perturbation fragility
-- **Ablation**
-  - `adapter_ablation` *(signature primitive)* — λ-scaled divergence curve with linearity, saturation, overshoot metrics
-- **Baseline**
-  - `null_adapter` — stats scaffolding for z-score calibration (implementation pending)
-
-### Infrastructure
-
-- `DifferentialBackend` + `ScalableDifferentialBackend` protocols
-- HuggingFace + PEFT backend with `disable_adapter` / `set_adapter` toggling and LoRA-scale mutation
-- Dummy backend for unit tests (canned responses + linear-blend scalable mode)
-- YAML spec loader, composite score (four-category weighted), rich terminal + JSON + JUnit + Markdown reports
-- Typer CLI: `run`, `gate`, `check`, `diff`, `autogen`, `doctor`, `report`
-- `.dlm` bridge (`dlm-sway[dlm]`): resolver + full-battery autogen
-- Matplotlib visualizations (`dlm-sway[viz]`): SIS bar chart, ablation curve, KL histogram
-
-### Known gaps
-
-- Null-adapter baseline is scaffolded but its HF-level materialization (building random-init LoRAs at matched rank) is not yet wired — probes fall back to fixed thresholds until the next milestone.
-- Custom backend entry-point dispatch (`kind: custom`) is stubbed but not implemented.
-- MLX backend is registered as a future-milestone target; all MLX paths raise `BackendNotAvailableError`.
-- PyPI publication of the `dlm-sway` wheel is pending a clean CI release workflow.
sway/LICENSE (deleted)
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2026 Matt Wolffe
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
sway/README.md (deleted)
@@ -1,101 +0,0 @@
-# dlm-sway
-
-Differential testing for fine-tuned causal language models.
-
-**One question:** *did LoRA/QLoRA training actually change model behavior
-in a meaningful way, or is the model just defaulting to the pretrained
-base?*
-
-`dlm-sway` gives you a trustworthy, reproducible answer with eleven
-purpose-built primitives, each z-scored against a null-adapter baseline.
-No LLM judges. No external APIs. Deterministic on CPU where possible.
-
-## Install
-
-```bash
-pip install "dlm-sway[hf]"                # HuggingFace + PEFT backend
-pip install "dlm-sway[hf,style,semsim]"   # full primitive battery
-pip install "dlm-sway[all]"               # everything including optional viz
-pip install "dlm-sway[dlm]"               # auto-generate tests from a .dlm file
-```
-
-## 90-second smoke test
-
-```bash
-dlm-sway check path/to/adapter --base HuggingFaceTB/SmolLM2-135M-Instruct
-```
-
-Outputs a verdict in under a minute on CPU for small models: *your
-adapter is 4.2σ above noise* ✅ or *indistinguishable from a null
-adapter* ❌.
-
-## Full suite
-
-```yaml
-# sway.yaml
-version: 1
-models:
-  base: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct"}
-  ft:   {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct",
-         adapter: "./runs/adapter/v0003"}
-suite:
-  - {name: knows_concept, kind: dir,
-     prompt: "The Dunning-Kruger effect describes",
-     target: " a cognitive bias where",
-     distractor: " a programming language"}
-  - {name: no_reversion, kind: adapter_revert, paraphrases: 4}
-  - {name: section_attribution, kind: section_internalization}
-```
-
-```bash
-dlm-sway run sway.yaml              # full report to terminal + JSON
-dlm-sway gate sway.yaml --junit     # CI-friendly; non-zero on fail
-```
-
-## Why it exists
-
-Standard benchmarks (MMLU, HellaSwag) ask *"how good is this model?"*
-That's the wrong question after a targeted LoRA fine-tune on a small
-user-authored document. The right question is *"did the adapter actually
-move the model toward what I wrote?"* — and existing tools answer this
-poorly.
-
-`dlm-sway` answers it directly via eleven primitives across four
-categories:
-
-| Category      | Primitives                                            |
-|---------------|-------------------------------------------------------|
-| Adherence     | `delta_kl`, `adapter_revert`, `prompt_collapse`       |
-| Attribution   | `section_internalization`, `paraphrase_invariance`, `preference_flip` |
-| Calibration   | `style_fingerprint`, `calibration_drift`, `leakage`   |
-| Ablation      | `adapter_ablation` ← the signature primitive          |
-
-**The signature primitive.** `adapter_ablation` scales the LoRA additive
-term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} and measures the divergence
-curve. A healthy fine-tune shows a smooth, monotonic, non-saturated
-response. A degenerate one shows a step function or an overshoot-then-
-crash. Nobody else does this because nobody else gets this close to the
-adapter math.
-
-## The `.dlm` integration
-
-If you trained your adapter via the [DocumentLanguageModel
-project](https://github.com/tenseleyFlow/DocumentLanguageModel), sway
-can auto-generate a test suite from your document's sections:
-
-```bash
-pip install "dlm-sway[hf,dlm]"
-dlm-sway autogen path/to/doc.dlm -o sway.yaml
-dlm-sway run sway.yaml
-```
-
-Per-section attribution tells you *which* parts of your document
-actually moved the model — a kind of signal no other tool provides.
-
-## Status
-
-Pre-alpha. API will break. Version `0.1.0` is the first tag.
-
-## License
-
-MIT
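The README's λ-sweep is easy to picture in code. A minimal sketch using the dummy backend deleted later in this diff (which synthesizes a sharp base and a broad ft distribution on a shared support); `js_divergence` and the loop are illustrative names, not the shipped probe:

```python
import numpy as np

from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses


def js_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """Jensen-Shannon divergence between two aligned probability vectors."""
    m = 0.5 * (p + q)

    def kl(a: np.ndarray, b: np.ndarray) -> float:
        mask = a > 0
        return float(np.sum(a[mask] * np.log(a[mask] / b[mask])))

    return 0.5 * kl(p, m) + 0.5 * kl(q, m)


# Empty canned responses: the dummy backend synthesizes distributions.
backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
prompt = "The Dunning-Kruger effect describes"

with backend.as_base() as view:
    p = np.exp(view.next_token_dist(prompt).logprobs)

curve = []
for lam in (0.0, 0.25, 0.5, 0.75, 1.0, 1.25):
    with backend.as_scaled_adapter(lam) as view:
        q = np.exp(view.next_token_dist(prompt).logprobs)
    curve.append(js_divergence(p, q))  # top-k slices only; tail mass ignored
# Healthy adapters trace a smooth, monotone curve in lam; a step jump
# or overshoot-then-crash flags a degenerate fine-tune.
```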
sway/pyproject.toml (deleted)
@@ -1,210 +0,0 @@
-[project]
-name = "dlm-sway"
-version = "0.1.0.dev0"
-description = "Differential testing for fine-tuned causal LMs: did LoRA/QLoRA training actually change behavior, or is the model defaulting to the pretrained base?"
-readme = "README.md"
-requires-python = ">=3.11"
-license = { text = "MIT" }
-authors = [{ name = "Matt Wolffe", email = "mfwolffe@outlook.com" }]
-keywords = [
-    "lora",
-    "qlora",
-    "peft",
-    "fine-tuning",
-    "evaluation",
-    "llm",
-    "differential-testing",
-]
-classifiers = [
-    "Development Status :: 3 - Alpha",
-    "Intended Audience :: Developers",
-    "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: MIT License",
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-]
-
-# Core deps: spec loading, orchestration, reporting. No torch — a user
-# who only defines specs or writes a custom backend shouldn't pull 3 GB
-# of CUDA wheels.
-dependencies = [
-    "pydantic>=2.9",
-    "pyyaml>=6.0",
-    "typer>=0.12",
-    "rich>=13.7",
-    "numpy>=1.26",
-    "packaging>=24.0",
-]
-
-[project.optional-dependencies]
-# HuggingFace + PEFT scoring backend. The canonical path.
-hf = [
-    "torch>=2.4",
-    "transformers>=4.45",
-    "peft>=0.13",
-    "safetensors>=0.4",
-]
-# Apple Silicon inference. Env markers keep `uv sync --extra mlx` a no-op
-# on non-Apple hosts so Linux/CUDA contributors' wheel resolution stays
-# sane.
-mlx = [
-    "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'",
-    "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'",
-]
-# Stylistic fingerprinting (C1). spaCy models pull at runtime via
-# `python -m spacy download`.
-style = [
-    "spacy>=3.7",
-    "textstat>=0.7",
-    "nlpaug>=1.1",
-]
-# Semantic similarity (A2). MiniLM ~80 MB, CPU-friendly.
-semsim = [
-    "sentence-transformers>=3.0",
-]
-# Optional .dlm integration. Only imported inside dlm_sway.integrations.dlm.
-dlm = [
-    "dlm>=0.9",
-]
-# Visualization (P9).
-viz = [
-    "matplotlib>=3.8",
-]
-all = [
-    "torch>=2.4",
-    "transformers>=4.45",
-    "peft>=0.13",
-    "safetensors>=0.4",
-    "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'",
-    "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'",
-    "spacy>=3.7",
-    "textstat>=0.7",
-    "nlpaug>=1.1",
-    "sentence-transformers>=3.0",
-    "matplotlib>=3.8",
-]
-
-[project.scripts]
-dlm-sway = "dlm_sway.cli.app:main"
-
-[project.urls]
-Homepage = "https://github.com/tenseleyFlow/DocumentLanguageModel"
-Issues = "https://github.com/tenseleyFlow/DocumentLanguageModel/issues"
-
-[dependency-groups]
-dev = [
-    "pytest>=8.0",
-    "pytest-cov>=5.0",
-    "mypy>=1.11",
-    "ruff>=0.6",
-    "types-pyyaml>=6.0",
-    "hypothesis>=6.152.1",
-]
-
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
-
-[tool.hatch.build.targets.wheel]
-packages = ["src/dlm_sway"]
-
-# -------- ruff --------
-[tool.ruff]
-line-length = 100
-target-version = "py311"
-src = ["src", "tests"]
-
-[tool.ruff.lint]
-select = [
-    "E",    # pycodestyle errors
-    "F",    # pyflakes
-    "W",    # pycodestyle warnings
-    "I",    # isort
-    "UP",   # pyupgrade
-    "B",    # bugbear
-    "N",    # pep8-naming
-    "C4",   # comprehensions
-    "SIM",  # simplify
-    "PT",   # pytest
-    "RET",  # return
-    "ARG",  # unused args
-    "PTH",  # use pathlib
-    "TID",  # tidy imports
-]
-ignore = [
-    "E501",  # handled by formatter
-]
-
-[tool.ruff.lint.per-file-ignores]
-"tests/**/*.py" = ["ARG", "PT011", "SIM117"]
-# PyTorch's canonical `import torch.nn.functional as F` is universally
-# read, so we allow the naming exception in the HF backend only.
-"src/dlm_sway/backends/hf.py" = ["N812"]
-# The .dlm bridge is the one place allowed to import the ``dlm`` package.
-"src/dlm_sway/integrations/dlm/*.py" = ["TID251"]
-
-[tool.ruff.lint.flake8-tidy-imports.banned-api]
-# Hard architectural boundary: the `dlm` package is only importable
-# from inside the optional integration shim. This keeps dlm-sway
-# usable for anyone with just a HuggingFace base + PEFT adapter.
-"dlm".msg = "Import `dlm` only from dlm_sway.integrations.dlm (the optional extra)."
-
-[tool.ruff.format]
-quote-style = "double"
-indent-style = "space"
-
-# -------- mypy --------
-[tool.mypy]
-strict = true
-python_version = "3.11"
-packages = ["dlm_sway"]
-mypy_path = "src"
-warn_return_any = true
-warn_unused_ignores = true
-warn_redundant_casts = true
-no_implicit_optional = true
-disallow_untyped_decorators = true
-plugins = ["pydantic.mypy"]
-
-[tool.pydantic-mypy]
-init_forbid_extra = true
-init_typed = true
-warn_required_dynamic_aliases = true
-
-# Stubless ML ecosystem packages. Narrow boundaries in backends/* import
-# them explicitly; the rest of the codebase stays strict.
-[[tool.mypy.overrides]]
-module = [
-    "torch",
-    "torch.*",
-    "transformers.*",
-    "peft.*",
-    "safetensors.*",
-    "mlx.*",
-    "mlx_lm.*",
-    "sentence_transformers.*",
-    "spacy.*",
-    "textstat.*",
-    "nlpaug.*",
-    "matplotlib",
-    "matplotlib.*",
-    "huggingface_hub.*",
-    "dlm.*",
-]
-ignore_missing_imports = true
-disable_error_code = ["no-untyped-call"]
-
-# -------- pytest --------
-[tool.pytest.ini_options]
-testpaths = ["tests"]
-addopts = [
-    "-ra",
-    "-m", "not slow and not gpu and not online",
-]
-markers = [
-    "slow: expensive; deselected by default",
-    "gpu: requires CUDA; skipped on CPU/MPS runners",
-    "online: touches the network; skipped in offline CI",
-]
sway/src/dlm_sway/__init__.py (deleted)
@@ -1,42 +0,0 @@
-"""dlm-sway — differential testing for fine-tuned causal language models."""
-
-from __future__ import annotations
-
-from dlm_sway.core.errors import (
-    BackendNotAvailableError,
-    ProbeError,
-    SpecValidationError,
-    SwayError,
-)
-from dlm_sway.core.model import LoadedModel, Model, ModelSpec
-from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict
-from dlm_sway.core.scoring import (
-    DifferentialBackend,
-    NullCalibratedBackend,
-    RollingLogprob,
-    ScalableDifferentialBackend,
-    ScoringBackend,
-    TokenDist,
-)
-
-__all__ = [
-    "BackendNotAvailableError",
-    "DifferentialBackend",
-    "LoadedModel",
-    "Model",
-    "ModelSpec",
-    "NullCalibratedBackend",
-    "ProbeError",
-    "ProbeResult",
-    "RollingLogprob",
-    "ScalableDifferentialBackend",
-    "ScoringBackend",
-    "SpecValidationError",
-    "SuiteResult",
-    "SwayError",
-    "SwayScore",
-    "TokenDist",
-    "Verdict",
-]
-
-__version__ = "0.1.0.dev0"
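Consistent with the "no torch in core deps" note in the pyproject above, the root package re-exports only `core` symbols, so importing it stays framework-free. A quick sanity check (a sketch):

```python
import sys

import dlm_sway

print(dlm_sway.__version__)         # "0.1.0.dev0"
assert "torch" not in sys.modules   # heavy backends load only on demand
```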
sway/src/dlm_sway/backends/__init__.py (deleted)
@@ -1,118 +0,0 @@
-"""Scoring backends: HuggingFace (``hf``), MLX (``mlx``), dummy, custom.
-
-Backends are constructed from a :class:`~dlm_sway.core.model.ModelSpec`
-via :func:`build`. Heavy backends (HF, MLX) import their framework only
-on construction so ``import dlm_sway`` stays cheap for users who only
-touch the dummy backend or the spec loader.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-from dlm_sway.core.errors import SpecValidationError
-from dlm_sway.core.model import ModelSpec
-
-if TYPE_CHECKING:
-    from dlm_sway.core.scoring import DifferentialBackend
-
-
-def build(base_spec: ModelSpec, *, adapter_path: Path | None = None) -> DifferentialBackend:
-    """Materialize a differential backend from a model spec.
-
-    The adapter path typically comes from ``ft.adapter`` in the spec —
-    it's lifted to a keyword here so the same function can be used for
-    "differential" (base + adapter on one loaded model) or future
-    split-load paths.
-    """
-    effective_adapter = adapter_path if adapter_path is not None else base_spec.adapter
-
-    if base_spec.kind == "dummy":
-        # Dummy backend isn't really about the spec — it's for tests
-        # that pre-populate responses. Surface a loud error if someone
-        # tries to build it through the normal path.
-        raise SpecValidationError(
-            "kind='dummy' backends must be constructed directly via "
-            "DummyDifferentialBackend(base=..., ft=...); they cannot be "
-            "materialized from a ModelSpec."
-        )
-
-    if base_spec.kind == "hf":
-        if effective_adapter is None:
-            raise SpecValidationError(
-                "hf backend requires an adapter path (set `adapter:` on the ft model)"
-            )
-        from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
-
-        return HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter)
-
-    if base_spec.kind == "mlx":
-        if effective_adapter is None:
-            raise SpecValidationError(
-                "mlx backend requires an adapter path (set `adapter:` on the ft model; "
-                "must be an MLX .npz adapter — use dlm's peft→mlx converter if needed)"
-            )
-        from dlm_sway.backends.mlx import MLXDifferentialBackend
-
-        return MLXDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter)
-
-    if base_spec.kind == "custom":
-        return _load_custom(base_spec, effective_adapter)
-
-    raise SpecValidationError(f"unknown backend kind: {base_spec.kind!r}")
-
-
-def _load_custom(base_spec: ModelSpec, adapter: Path | None) -> DifferentialBackend:
-    """Dispatch to a user-supplied backend via ``entry_point='pkg.mod:Name'``.
-
-    The imported class is instantiated as ``Cls(base_spec=..., adapter_path=...)``
-    — the same signature as :class:`dlm_sway.backends.hf.HuggingFaceDifferentialBackend`
-    so authors can model their implementation on the built-in. The
-    result is runtime-checked against :class:`DifferentialBackend` so
-    protocol violations fail at construction, not deep inside a probe.
-    """
-    from dlm_sway.core.scoring import DifferentialBackend as DiffBackend
-
-    entry = base_spec.entry_point
-    if not entry:
-        raise SpecValidationError(
-            "kind='custom' requires an entry_point of the form 'pkg.module:ClassName'"
-        )
-    if ":" not in entry:
-        raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}")
-    module_path, _, class_name = entry.partition(":")
-    if not module_path or not class_name:
-        raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}")
-
-    import importlib
-
-    try:
-        module = importlib.import_module(module_path)
-    except ImportError as exc:
-        raise SpecValidationError(
-            f"custom backend: cannot import module {module_path!r}: {exc}"
-        ) from exc
-    cls = getattr(module, class_name, None)
-    if cls is None:
-        raise SpecValidationError(
-            f"custom backend: module {module_path!r} has no attribute {class_name!r}"
-        )
-
-    try:
-        instance = cls(base_spec=base_spec, adapter_path=adapter)
-    except TypeError as exc:
-        raise SpecValidationError(
-            f"custom backend {entry!r} constructor signature mismatch: {exc}. "
-            "Expected Cls(base_spec: ModelSpec, adapter_path: Path | None)"
-        ) from exc
-
-    if not isinstance(instance, DiffBackend):
-        raise SpecValidationError(
-            f"custom backend {entry!r} does not satisfy DifferentialBackend "
-            "(needs as_base() and as_finetuned() context managers)"
-        )
-    return instance
-
-
-__all__ = ["build"]
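`_load_custom` pins down the contract a `kind: custom` backend must meet (note the changelog marks this dispatch as stubbed). A sketch of a conforming skeleton — the module, class, and bodies below are invented for illustration:

```python
# my_backends/echo.py — referenced from a spec as:
#   {kind: custom, entry_point: "my_backends.echo:EchoBackend"}
from __future__ import annotations

from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path
from typing import Any

from dlm_sway.core.model import ModelSpec


class EchoBackend:
    """Skeleton matching the constructor + context-manager contract."""

    def __init__(self, *, base_spec: ModelSpec, adapter_path: Path | None) -> None:
        self._spec = base_spec
        self._adapter = adapter_path

    @contextmanager
    def as_base(self) -> Iterator[Any]:
        # A real implementation yields an object with generate / logprob_of /
        # rolling_logprob / next_token_dist (see _DummyView below for the shape).
        yield ...

    @contextmanager
    def as_finetuned(self) -> Iterator[Any]:
        yield ...
```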
sway/src/dlm_sway/backends/dummy.py (deleted)
@@ -1,257 +0,0 @@
-"""In-memory backend for unit tests.
-
-Deterministic, torchless, and trivially fast. Tests pass canned responses
-and canned score tables keyed by ``(mode, prompt, completion)``. The same
-backend instance serves as both ``as_base`` and ``as_finetuned`` — it
-switches an internal mode flag.
-
-Use it to drive every probe's unit test without loading a real model.
-For integration tests against a real PEFT adapter, see
-:class:`~dlm_sway.backends.hf.HuggingFaceDifferentialBackend`.
-"""
-
-from __future__ import annotations
-
-import math
-from collections.abc import Iterator
-from contextlib import contextmanager
-from dataclasses import dataclass, field
-from typing import Literal
-
-import numpy as np
-
-from dlm_sway.core.scoring import RollingLogprob, TokenDist
-
-Mode = Literal["base", "ft"]
-
-
-@dataclass(slots=True)
-class DummyResponses:
-    """Canned data for one mode (base or ft).
-
-    Callers populate one of these per mode and hand both to
-    :class:`DummyDifferentialBackend`.
-    """
-
-    generations: dict[str, str] = field(default_factory=dict)
-    """Prompt → canned completion. Lookup is exact-match."""
-    logprobs: dict[tuple[str, str], float] = field(default_factory=dict)
-    """``(prompt, completion) → sum logprob``. Default ``-10.0`` if missing."""
-    rolling: dict[str, RollingLogprob] = field(default_factory=dict)
-    """Text → canned :class:`RollingLogprob`."""
-    token_dists: dict[str, TokenDist] = field(default_factory=dict)
-    """Prompt → canned :class:`TokenDist`."""
-
-
-class _DummyView:
-    """The per-mode view yielded by ``as_base`` / ``as_finetuned``.
-
-    Implements :class:`~dlm_sway.core.model.Model` *and*
-    :class:`~dlm_sway.core.scoring.ScoringBackend` — i.e. the
-    ``ScoringModel`` intersection.
-    """
-
-    def __init__(self, mode: Mode, responses: DummyResponses) -> None:
-        self.id = mode
-        self._mode: Mode = mode
-        self._r = responses
-
-    # -- Model ---------------------------------------------------------
-    def generate(
-        self,
-        prompt: str,
-        *,
-        max_new_tokens: int,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
-        seed: int = 0,
-    ) -> str:
-        del max_new_tokens, temperature, top_p, seed  # canned; decoding is trivial.
-        try:
-            return self._r.generations[prompt]
-        except KeyError as exc:
-            raise KeyError(
-                f"dummy backend ({self._mode}): no canned generation for prompt {prompt!r}"
-            ) from exc
-
-    def close(self) -> None:
-        return None
-
-    # -- ScoringBackend ------------------------------------------------
-    def logprob_of(self, prompt: str, completion: str) -> float:
-        return self._r.logprobs.get((prompt, completion), -10.0)
-
-    def rolling_logprob(self, text: str) -> RollingLogprob:
-        if text in self._r.rolling:
-            return self._r.rolling[text]
-        # Synthesize a plausible rolling logprob so probes that just
-        # want a non-trivial value work without per-text configuration.
-        tokens = text.split()
-        n = max(len(tokens), 1)
-        per_tok = -2.0 if self._mode == "base" else -1.5
-        return RollingLogprob(
-            token_ids=np.arange(n, dtype=np.int64),
-            logprobs=np.full(max(n - 1, 0), per_tok, dtype=np.float32),
-            num_tokens=n,
-            total_logprob=per_tok * max(n - 1, 0),
-        )
-
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
-        del top_k
-        if prompt in self._r.token_dists:
-            return self._r.token_dists[prompt]
-        # Synthesize a sharp base / broad ft distribution so divergence
-        # probes see a non-zero signal without hand-rolled data.
-        vocab = 1000
-        k = 8
-        if self._mode == "base":
-            lp = np.array([-0.1] + [-5.0] * (k - 1), dtype=np.float32)
-        else:
-            # More uniform mass across the top-k tokens.
-            lp = np.full(k, -math.log(k), dtype=np.float32)
-        return TokenDist(
-            token_ids=np.arange(k, dtype=np.int64),
-            logprobs=lp,
-            vocab_size=vocab,
-            tail_logprob=math.log1p(-float(np.exp(lp).sum())) if np.exp(lp).sum() < 1 else 0.0,
-        )
-
-
-class _NullView(_DummyView):
-    """A dummy view that perturbs the base distribution with seeded noise.
-
-    Used by :meth:`DummyDifferentialBackend.as_null_adapter`. The
-    perturbation is small (matches an ``init_scale=0.02`` adapter) so
-    the null-vs-base divergence stays well below real-adapter territory
-    in probe tests.
-    """
-
-    def __init__(self, base_responses: DummyResponses, seed: int, init_scale: float) -> None:
-        super().__init__("base", base_responses)
-        self._seed = seed
-        self._init_scale = init_scale
-
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
-        base_dist = super().next_token_dist(prompt, top_k=top_k)
-        rng = np.random.default_rng(self._seed + hash(prompt) % 1_000_003)
-        noise = rng.normal(0.0, self._init_scale, size=base_dist.logprobs.shape).astype(np.float32)
-        new_lp = base_dist.logprobs + noise
-        # Re-normalize (within the top-k slice) so a valid distribution comes back.
-        max_lp = new_lp.max()
-        new_probs = np.exp(new_lp - max_lp)
-        new_probs /= new_probs.sum()
-        return TokenDist(
-            token_ids=base_dist.token_ids,
-            logprobs=np.log(new_probs).astype(np.float32),
-            vocab_size=base_dist.vocab_size,
-            tail_logprob=base_dist.tail_logprob,
-        )
-
-
-class _InterpolatedView(_DummyView):
-    """A dummy view where logits/dists are a lam-blend of base and ft.
-
-    Used by :meth:`DummyDifferentialBackend.as_scaled_adapter`.
-    Generation falls back to the ft view at lam>=0.5, base otherwise —
-    rounded because the dummy backend's generations are canned strings
-    with no notion of "how much".
-    """
-
-    def __init__(
-        self,
-        base_responses: DummyResponses,
-        ft_responses: DummyResponses,
-        lam: float,
-    ) -> None:
-        super().__init__(
-            "ft" if lam >= 0.5 else "base", ft_responses if lam >= 0.5 else base_responses
-        )
-        self._base_r = base_responses
-        self._ft_r = ft_responses
-        self._lam = lam
-
-    def logprob_of(self, prompt: str, completion: str) -> float:
-        base_v = self._base_r.logprobs.get((prompt, completion), -10.0)
-        ft_v = self._ft_r.logprobs.get((prompt, completion), -10.0)
-        return (1 - self._lam) * base_v + self._lam * ft_v
-
-    def next_token_dist(self, prompt: str, *, top_k: int = 256):  # type: ignore[no-untyped-def]
-        base_dist = _DummyView("base", self._base_r).next_token_dist(prompt, top_k=top_k)
-        ft_dist = _DummyView("ft", self._ft_r).next_token_dist(prompt, top_k=top_k)
-        # Both dists are on the same synthetic support when unseeded; blend
-        # their logprobs via log-space linear interpolation, which is a
-        # log-linear "tempered" mix and keeps normalization close enough.
-        lam = self._lam
-        blended_lp = (1 - lam) * base_dist.logprobs + lam * ft_dist.logprobs
-        return type(base_dist)(
-            token_ids=base_dist.token_ids,
-            logprobs=blended_lp,
-            vocab_size=base_dist.vocab_size,
-            tail_logprob=base_dist.tail_logprob,
-        )
-
-
-class DummyDifferentialBackend:
-    """Dummy implementation of
-    :class:`~dlm_sway.core.scoring.DifferentialBackend`.
-
-    Construction takes one :class:`DummyResponses` per mode. The two
-    modes are mutually exclusive — the backend enforces that callers
-    exit one view before entering the other, catching bugs in probes
-    that hold a stale view across a toggle.
-
-    Also implements
-    :class:`~dlm_sway.core.scoring.ScalableDifferentialBackend` with a
-    linear-blend between base and ft responses, so probes that need
-    ``as_scaled_adapter`` (N2 AdapterAblation) are unit-testable.
-    """
-
-    def __init__(self, *, base: DummyResponses, ft: DummyResponses) -> None:
-        self._base_r = base
-        self._ft_r = ft
-        self._base = _DummyView("base", base)
-        self._ft = _DummyView("ft", ft)
-        self._active: str | None = None
-
-    @contextmanager
-    def as_base(self) -> Iterator[_DummyView]:
-        self._enter("base")
-        try:
-            yield self._base
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_finetuned(self) -> Iterator[_DummyView]:
-        self._enter("ft")
-        try:
-            yield self._ft
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_scaled_adapter(self, lam: float) -> Iterator[_DummyView]:
-        self._enter(f"scaled({lam})")
-        try:
-            yield _InterpolatedView(self._base_r, self._ft_r, lam)
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_DummyView]:
-        self._enter(f"null({seed})")
-        try:
-            yield _NullView(self._base_r, seed=seed, init_scale=init_scale)
-        finally:
-            self._exit()
-
-    def _enter(self, mode: str) -> None:
-        if self._active is not None:
-            raise RuntimeError(
-                f"DifferentialBackend view already active ({self._active!r}); "
-                f"exit the current view before entering {mode!r}."
-            )
-        self._active = mode
-
-    def _exit(self) -> None:
-        self._active = None
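Usage follows directly from the class above; everything here is grounded in the code shown (the linear blend in `_InterpolatedView.logprob_of` gives the `-2.5` midpoint):

```python
from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses

base = DummyResponses(logprobs={("2+2=", " 4"): -4.0})
ft = DummyResponses(logprobs={("2+2=", " 4"): -1.0})
backend = DummyDifferentialBackend(base=base, ft=ft)

with backend.as_base() as m:
    lp_base = m.logprob_of("2+2=", " 4")   # -4.0
with backend.as_finetuned() as m:
    lp_ft = m.logprob_of("2+2=", " 4")     # -1.0
with backend.as_scaled_adapter(0.5) as m:
    lp_mid = m.logprob_of("2+2=", " 4")    # -2.5: linear blend of the two
```

Entering a second view without exiting the first raises the `RuntimeError` from `_enter`, which is exactly the stale-view bug class the docstring mentions.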
sway/src/dlm_sway/backends/hf.py (deleted)
@@ -1,375 +0,0 @@
-"""HuggingFace + PEFT differential backend.
-
-Loads the base once, attaches the LoRA adapter once, and toggles between
-"base" and "fine-tuned" views on the same module via PEFT's
-:meth:`~peft.PeftModel.disable_adapter` / :meth:`~peft.PeftModel.set_adapter`.
-
-This is the single most important backend in sway. Every numeric probe
-benefits from the shared-weights toggle — memory is halved compared to
-loading two copies, and KV-cache layouts stay aligned so pairwise KL math
-is straightforward.
-
-Heavy imports (``torch``, ``transformers``, ``peft``) are deferred until
-``HuggingFaceDifferentialBackend`` is actually instantiated so
-``import dlm_sway`` stays light for users of the dummy backend or spec
-validation.
-"""
-
-from __future__ import annotations
-
-from collections.abc import Iterator
-from contextlib import contextmanager
-from dataclasses import dataclass
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
-
-import numpy as np
-
-from dlm_sway.core.errors import BackendNotAvailableError, ProbeError
-from dlm_sway.core.model import ModelSpec
-from dlm_sway.core.scoring import RollingLogprob, TokenDist
-
-if TYPE_CHECKING:
-    from transformers import PreTrainedModel, PreTrainedTokenizerBase
-
-
-Device = Literal["cuda", "mps", "cpu"]
-
-
-def _detect_device() -> Device:
-    try:
-        import torch
-    except ImportError as exc:
-        raise BackendNotAvailableError("hf", extra="hf") from exc
-    if torch.cuda.is_available():
-        return "cuda"
-    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-        return "mps"
-    return "cpu"
-
-
-def _resolve_dtype(requested: str, device: Device) -> Any:
-    """Map the user's ``dtype`` preference to a torch dtype."""
-    import torch  # noqa: PLC0415 — lazy
-
-    if requested == "fp16":
-        return torch.float16
-    if requested == "bf16":
-        return torch.bfloat16
-    if requested == "fp32":
-        return torch.float32
-    # auto: bf16 on CUDA (Ampere+); fp16 on MPS; fp32 on CPU for numerical stability.
-    if device == "cuda" and torch.cuda.is_bf16_supported():
-        return torch.bfloat16
-    if device == "mps":
-        return torch.float16
-    return torch.float32
-
-
-def _require_hf() -> tuple[Any, Any, Any]:
-    """Import torch + transformers + peft, raising a friendly error if missing."""
-    try:
-        import torch
-        import transformers
-    except ImportError as exc:
-        raise BackendNotAvailableError("hf", extra="hf") from exc
-    try:
-        import peft
-    except ImportError as exc:
-        raise BackendNotAvailableError(
-            "hf", extra="hf", hint="peft is required for the adapter toggle."
-        ) from exc
-    return torch, transformers, peft
-
-
-# --- the view object ------------------------------------------------------
-
-
-@dataclass(slots=True)
-class _HFView:
-    """One side (base or ft) of a :class:`HuggingFaceDifferentialBackend`.
-
-    Both sides reuse the same underlying module; the difference is
-    whether the adapter is active.
-    """
-
-    id: str
-    _model: Any
-    _tokenizer: Any
-    _device: str
-    _pad_token_id: int
-
-    # -- Model ---------------------------------------------------------
-    def generate(
-        self,
-        prompt: str,
-        *,
-        max_new_tokens: int,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
-        seed: int = 0,
-    ) -> str:
-        import torch
-
-        torch.manual_seed(seed)
-        inputs = self._tokenizer(prompt, return_tensors="pt").to(self._device)
-        do_sample = temperature > 0.0
-        gen_kwargs: dict[str, Any] = {
-            "max_new_tokens": max_new_tokens,
-            "do_sample": do_sample,
-            "pad_token_id": self._pad_token_id,
-        }
-        if do_sample:
-            gen_kwargs["temperature"] = temperature
-            gen_kwargs["top_p"] = top_p
-        with torch.inference_mode():
-            out_ids = self._model.generate(**inputs, **gen_kwargs)
-        new_tokens = out_ids[0, inputs["input_ids"].shape[1] :]
-        return str(self._tokenizer.decode(new_tokens, skip_special_tokens=True))
-
-    def close(self) -> None:
-        return None
-
-    # -- ScoringBackend ------------------------------------------------
-    def logprob_of(self, prompt: str, completion: str) -> float:
-        import torch
-        import torch.nn.functional as F
-
-        prompt_ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device)
-        full_ids = self._tokenizer(prompt + completion, return_tensors="pt").input_ids.to(
-            self._device
-        )
-        if full_ids.shape[1] <= prompt_ids.shape[1]:
-            raise ProbeError(
-                "logprob_of",
-                f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})",
-            )
-        target_ids = full_ids[:, prompt_ids.shape[1] :]
-        with torch.inference_mode():
-            logits = self._model(full_ids).logits  # (1, T, V)
-        # Align: logit at position t predicts token at t+1. We want
-        # predictions for the completion slice.
-        shift_logits = logits[:, prompt_ids.shape[1] - 1 : -1, :]  # (1, C, V)
-        log_probs = F.log_softmax(shift_logits.float(), dim=-1)
-        gathered = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)
-        return float(gathered.sum().item())
-
-    def rolling_logprob(self, text: str) -> RollingLogprob:
-        import torch
-        import torch.nn.functional as F
-
-        ids = self._tokenizer(text, return_tensors="pt").input_ids.to(self._device)
-        if ids.shape[1] < 2:
-            return RollingLogprob(
-                token_ids=ids[0].cpu().numpy().astype(np.int64),
-                logprobs=np.array([], dtype=np.float32),
-                num_tokens=int(ids.shape[1]),
-                total_logprob=0.0,
-            )
-        with torch.inference_mode():
-            logits = self._model(ids).logits  # (1, T, V)
-        log_probs = F.log_softmax(logits[:, :-1].float(), dim=-1)  # predicts tokens 1..T
-        gathered = log_probs.gather(-1, ids[:, 1:].unsqueeze(-1)).squeeze(-1).squeeze(0)
-        return RollingLogprob(
-            token_ids=ids[0].cpu().numpy().astype(np.int64),
-            logprobs=gathered.cpu().numpy().astype(np.float32),
-            num_tokens=int(ids.shape[1]),
-            total_logprob=float(gathered.sum().item()),
-        )
-
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
-        import torch
-        import torch.nn.functional as F
-
-        ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device)
-        with torch.inference_mode():
-            logits = self._model(ids).logits[:, -1, :]  # (1, V)
-        log_probs = F.log_softmax(logits.float(), dim=-1).squeeze(0)
-        k = min(top_k, int(log_probs.shape[0]))
-        top = torch.topk(log_probs, k=k)
-        tail_mass = float(1.0 - torch.exp(top.values).sum().item())
-        tail_logprob = float(np.log(max(tail_mass, 1e-12))) if tail_mass > 1e-12 else 0.0
-        return TokenDist(
-            token_ids=top.indices.cpu().numpy().astype(np.int64),
-            logprobs=top.values.cpu().numpy().astype(np.float32),
-            vocab_size=int(log_probs.shape[0]),
-            tail_logprob=tail_logprob,
-        )
-
-
-# --- the backend -----------------------------------------------------------
-
-
-class HuggingFaceDifferentialBackend:
-    """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for HF+PEFT.
-
-    The adapter toggle relies on
-    :meth:`peft.PeftModel.disable_adapter` producing a context where the
-    forward pass skips the LoRA deltas, and
-    :meth:`peft.PeftModel.set_adapter` (or just exiting the disable
-    context) re-enabling them. A dedicated sanity test asserts that
-    these actually change logits on a fixture.
-    """
-
-    def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None:
-        torch, transformers, peft = _require_hf()
-        self._torch = torch
-        self._spec = base_spec
-        self._adapter_path = Path(adapter_path).expanduser().resolve()
-
-        device_str: Device = (
-            _detect_device() if base_spec.device == "auto" else base_spec.device  # type: ignore[assignment]
-        )
-        self._device: str = device_str
-        dtype = _resolve_dtype(base_spec.dtype, device_str)
-
-        tokenizer = transformers.AutoTokenizer.from_pretrained(
-            str(self._adapter_path)
-            if (self._adapter_path / "tokenizer_config.json").exists()
-            else base_spec.base,
-            trust_remote_code=base_spec.trust_remote_code,
-        )
-        if tokenizer.pad_token_id is None:
-            tokenizer.pad_token = tokenizer.eos_token
-
-        base_model = transformers.AutoModelForCausalLM.from_pretrained(
-            base_spec.base,
-            torch_dtype=dtype,
-            trust_remote_code=base_spec.trust_remote_code,
-        )
-        base_model.to(self._device)
-        peft_model = peft.PeftModel.from_pretrained(
-            base_model,
-            str(self._adapter_path),
-            is_trainable=False,
-        )
-        peft_model.eval()
-
-        self._tokenizer: PreTrainedTokenizerBase = tokenizer
-        self._peft_model: PreTrainedModel = peft_model
-        self._pad_token_id: int = int(tokenizer.pad_token_id)
-        self._active: str | None = None
-
-    # -- DifferentialBackend -------------------------------------------
-
-    @contextmanager
-    def as_base(self) -> Iterator[_HFView]:
-        self._enter("base")
-        try:
-            # peft.PeftModel.disable_adapter is a context manager; mypy
-            # mis-reads it as a Tensor on this transformers version.
-            with self._peft_model.disable_adapter():  # type: ignore[operator]
-                yield self._make_view("base")
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_finetuned(self) -> Iterator[_HFView]:
-        self._enter("ft")
-        try:
-            yield self._make_view("ft")
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_scaled_adapter(self, lam: float) -> Iterator[_HFView]:
-        """Temporarily multiply every LoRA layer's scaling factor by ``lam``.
-
-        Works by walking the PEFT module tree and mutating each
-        ``LoraLayer.scaling[adapter_name]`` in place. The original
-        scalings are restored when the context exits — or when an
-        exception propagates, to keep the model in a sane state.
-        """
-        self._enter(f"scaled({lam})")
-        # ``module`` is dynamic (peft LoraLayer subclass) — Any avoids
-        # mypy treating its ``.scaling`` as a Tensor when peft is loaded.
-        saved: list[tuple[Any, str, float]] = []
-        try:
-            import peft  # noqa: PLC0415 — already a hard dep of this backend
-
-            lora_cls = getattr(peft.tuners.lora, "LoraLayer", None)
-            if lora_cls is None:
-                raise RuntimeError("peft.tuners.lora.LoraLayer not found; check peft>=0.13 pin")
-            for module in self._peft_model.modules():
-                if not isinstance(module, lora_cls):
-                    continue
-                scaling = getattr(module, "scaling", None)
-                if not isinstance(scaling, dict):
-                    continue
-                for key, original in scaling.items():
-                    saved.append((module, key, float(original)))
-                    scaling[key] = float(original) * lam
-            yield self._make_view(f"scaled_{lam:.2f}")
-        finally:
-            for module, key, original in saved:
-                module.scaling[key] = original
-            self._exit()
-
-    @contextmanager
-    def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_HFView]:
-        """Temporarily replace every LoRA ``A``/``B`` tensor with random noise.
-
-        Same rank, alpha, and target modules as the real adapter — only
-        the weights differ. This is the denominator in every z-score
-        path: "how much signal does structural noise produce?"
-
-        Implementation walks the PEFT module tree for ``lora_A``/``lora_B``
-        parameters, saves a clone of each current value, overwrites in
-        place with a zero-mean Gaussian at ``init_scale``, and restores
-        on exit (including on exception).
-        """
-        import torch
-
-        self._enter(f"null({seed})")
-        gen = torch.Generator(device="cpu").manual_seed(int(seed))
-        saved: list[tuple[torch.nn.Parameter, torch.Tensor]] = []
-        try:
-            for pname, param in self._peft_model.named_parameters():
-                if not any(key in pname for key in ("lora_A", "lora_B")):
-                    continue
-                saved.append((param, param.detach().clone()))
-                with torch.no_grad():
-                    noise = torch.randn(
-                        *param.shape,
-                        generator=gen,
-                        dtype=torch.float32,
-                    ).to(dtype=param.dtype, device=param.device)
-                    param.copy_(noise * init_scale)
-            yield self._make_view(f"null_{seed}")
-        finally:
-            with torch.no_grad():
-                for param, original in saved:
-                    param.copy_(original)
-            self._exit()
-
-    def close(self) -> None:
-        """Release GPU memory. Safe to call more than once."""
-        if getattr(self, "_peft_model", None) is not None:
-            del self._peft_model
-        if self._torch.cuda.is_available():
-            self._torch.cuda.empty_cache()
-
-    # -- internals -----------------------------------------------------
-
-    def _make_view(self, mode: str) -> _HFView:
-        return _HFView(
-            id=mode,
-            _model=self._peft_model,
-            _tokenizer=self._tokenizer,
-            _device=self._device,
-            _pad_token_id=self._pad_token_id,
-        )
-
-    def _enter(self, mode: str) -> None:
-        if self._active is not None:
-            raise RuntimeError(
-                f"HuggingFaceDifferentialBackend view {self._active!r} already active; "
-                f"exit it before entering {mode!r}."
-            )
-        self._active = mode
-
-    def _exit(self) -> None:
-        self._active = None
-
-
-__all__ = ["HuggingFaceDifferentialBackend"]
sway/src/dlm_sway/backends/mlx.pydeleted
@@ -1,205 +0,0 @@
1
-"""MLX backend for Apple Silicon (darwin-arm64).
2
-
3
-Partial implementation covering the common case: a PEFT adapter that's
4
-already been converted to MLX's ``.npz`` format. Unlike the HF backend,
5
-MLX has no runtime ``disable_adapter`` context — adapters get fused into
6
-the linear layers at load time — so this backend keeps **both** a base
7
-model and an adapted model in memory. Fine for the small (<3B) models
8
-MLX is typically used with on Apple Silicon; document the cost clearly.
9
-
10
-If users point this backend at raw PEFT safetensors, ``mlx_lm.load``
11
-will refuse them with its own error. A future milestone can wire a
12
-PEFT-→-MLX converter; for now the contract is "bring your own .npz".
13
-"""
14
-
15
-from __future__ import annotations
16
-
17
-from collections.abc import Iterator
18
-from contextlib import contextmanager
19
-from dataclasses import dataclass
20
-from pathlib import Path
21
-from typing import TYPE_CHECKING, Any
22
-
23
-import numpy as np
24
-
25
-from dlm_sway.core.errors import BackendNotAvailableError, ProbeError
26
-from dlm_sway.core.model import ModelSpec
27
-from dlm_sway.core.scoring import RollingLogprob, TokenDist
28
-
29
-if TYPE_CHECKING:
30
-    pass
31
-
32
-
33
-def _require_mlx() -> tuple[Any, Any]:
34
-    try:
35
-        import mlx.core as mx
36
-        import mlx_lm
37
-    except ImportError as exc:
38
-        raise BackendNotAvailableError(
39
-            "mlx",
40
-            extra="mlx",
41
-            hint="MLX backend needs mlx + mlx-lm on darwin-arm64.",
42
-        ) from exc
43
-    return mx, mlx_lm
44
-
45
-
46
-@dataclass(slots=True)
47
-class _MLXView:
48
-    """One side (base or ft) of the MLX backend.
49
-
50
-    Both sides carry the same tokenizer (MLX stores it alongside the
51
-    converted model files, so sharing avoids double-loading).
52
-    """
53
-
54
-    id: str
55
-    _model: Any
56
-    _tokenizer: Any
57
-
58
-    def generate(
59
-        self,
60
-        prompt: str,
61
-        *,
62
-        max_new_tokens: int,
63
-        temperature: float = 0.0,
64
-        top_p: float = 1.0,
65
-        seed: int = 0,
66
-    ) -> str:
67
-        del seed  # mlx_lm.generate seeds via its own global state
68
-        _, mlx_lm = _require_mlx()
69
-        kwargs: dict[str, Any] = {"max_tokens": max_new_tokens, "verbose": False}
70
-        if temperature > 0.0:
71
-            kwargs["temp"] = temperature
72
-            kwargs["top_p"] = top_p
73
-        out = mlx_lm.generate(self._model, self._tokenizer, prompt=prompt, **kwargs)
74
-        return str(out)
75
-
76
-    def close(self) -> None:
77
-        return None
78
-
79
-    # -- ScoringBackend ------------------------------------------------
80
-
81
-    def _forward_logits(self, prompt: str) -> np.ndarray:
82
-        """Run the model once and return ``(seq_len, vocab)`` logits."""
83
-        mx, _ = _require_mlx()
84
-        input_ids = self._tokenizer.encode(prompt)
85
-        tokens = mx.array(input_ids)[None, :]  # (1, T)
86
-        out = self._model(tokens)
87
-        # mlx_lm models return an mx.array; convert to numpy for downstream math.
88
-        return np.asarray(out[0])
89
-
90
-    def logprob_of(self, prompt: str, completion: str) -> float:
91
-        input_ids = self._tokenizer.encode(prompt)
92
-        full_ids = self._tokenizer.encode(prompt + completion)
93
-        if len(full_ids) <= len(input_ids):
94
-            raise ProbeError(
95
-                "logprob_of",
96
-                f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})",
97
-            )
98
-        logits = self._forward_logits(prompt + completion)  # (T, V)
99
-        # Position t predicts token t+1 — slice off the last row and the prompt span.
100
-        shift = logits[len(input_ids) - 1 : -1, :]
101
-        target_ids = np.asarray(full_ids[len(input_ids) :], dtype=np.int64)
102
-        log_probs = _log_softmax(shift.astype(np.float64), axis=-1)
103
-        gathered = log_probs[np.arange(len(target_ids)), target_ids]
104
-        return float(gathered.sum())
105
-
106
-    def rolling_logprob(self, text: str) -> RollingLogprob:
107
-        ids = self._tokenizer.encode(text)
108
-        if len(ids) < 2:
109
-            return RollingLogprob(
110
-                token_ids=np.asarray(ids, dtype=np.int64),
111
-                logprobs=np.array([], dtype=np.float32),
112
-                num_tokens=len(ids),
113
-                total_logprob=0.0,
114
-            )
115
-        logits = self._forward_logits(text)
116
-        log_probs = _log_softmax(logits[:-1].astype(np.float64), axis=-1)
117
-        ids_arr = np.asarray(ids, dtype=np.int64)
118
-        gathered = log_probs[np.arange(len(ids) - 1), ids_arr[1:]]
119
-        return RollingLogprob(
120
-            token_ids=ids_arr,
121
-            logprobs=gathered.astype(np.float32),
122
-            num_tokens=len(ids),
123
-            total_logprob=float(gathered.sum()),
124
-        )
125
-
126
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
127
-        logits = self._forward_logits(prompt)
128
-        last_logits = logits[-1].astype(np.float64)
129
-        log_probs = _log_softmax(last_logits, axis=-1)
130
-        k = min(top_k, log_probs.shape[0])
131
-        # np.argpartition for top-k then sort the partition.
132
-        part = np.argpartition(log_probs, -k)[-k:]
133
-        top_ids = part[np.argsort(log_probs[part])[::-1]]
134
-        top_lp = log_probs[top_ids]
135
-        tail_mass = float(1.0 - np.exp(top_lp).sum())
136
-        tail_logprob = float(np.log(tail_mass)) if tail_mass > 1e-12 else 0.0
137
-        return TokenDist(
138
-            token_ids=top_ids.astype(np.int64),
139
-            logprobs=top_lp.astype(np.float32),
140
-            vocab_size=int(log_probs.shape[0]),
141
-            tail_logprob=tail_logprob,
142
-        )
143
-
144
-
145
-class MLXDifferentialBackend:
146
-    """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for MLX models.
147
-
148
-    Loads two copies of the same base model — one bare, one with the
149
-    adapter fused — because MLX has no runtime toggle. Memory cost: 2×
150
-    base weights. On typical Apple Silicon workloads with ≤3B models
151
-    this is acceptable.
152
-    """
153
-
154
-    def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None:
155
-        mx, mlx_lm = _require_mlx()
156
-        self._mx = mx
157
-        self._spec = base_spec
158
-        self._adapter_path = Path(adapter_path).expanduser().resolve()
159
-
160
-        # Load bare base (no adapter).
161
-        self._base_model, self._tokenizer = mlx_lm.load(base_spec.base)
162
-        # Load ft with adapter attached. ``adapter_path`` is mlx_lm's kwarg.
163
-        self._ft_model, _ = mlx_lm.load(base_spec.base, adapter_path=str(self._adapter_path))
164
-        self._active: str | None = None
165
-
166
-    @contextmanager
167
-    def as_base(self) -> Iterator[_MLXView]:
168
-        self._enter("base")
169
-        try:
170
-            yield _MLXView(id="base", _model=self._base_model, _tokenizer=self._tokenizer)
171
-        finally:
172
-            self._exit()
173
-
174
-    @contextmanager
175
-    def as_finetuned(self) -> Iterator[_MLXView]:
176
-        self._enter("ft")
177
-        try:
178
-            yield _MLXView(id="ft", _model=self._ft_model, _tokenizer=self._tokenizer)
179
-        finally:
180
-            self._exit()
181
-
182
-    def close(self) -> None:
183
-        """MLX reclaims memory when references drop; nothing to do here."""
184
-        return
185
-
186
-    def _enter(self, mode: str) -> None:
187
-        if self._active is not None:
188
-            raise RuntimeError(
189
-                f"MLXDifferentialBackend view {self._active!r} already active; "
190
-                f"exit it before entering {mode!r}."
191
-            )
192
-        self._active = mode
193
-
194
-    def _exit(self) -> None:
195
-        self._active = None
196
-
197
-
198
-def _log_softmax(x: np.ndarray, *, axis: int) -> np.ndarray:
199
-    x_max = np.max(x, axis=axis, keepdims=True)
200
-    y = x - x_max
201
-    log_sum = np.log(np.sum(np.exp(y), axis=axis, keepdims=True))
202
-    return np.asarray(y - log_sum, dtype=np.float64)
203
-
204
-
205
-__all__ = ["MLXDifferentialBackend"]
sway/src/dlm_sway/cli/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""Command-line interface (entry point: ``dlm-sway``)."""
sway/src/dlm_sway/cli/app.pydeleted
@@ -1,59 +0,0 @@
1
-"""dlm-sway CLI entry point.
2
-
3
-``pip install dlm-sway`` installs this module's :func:`main` as the
4
-``dlm-sway`` console script. Every subcommand is a thin wrapper around a
5
-library-level function so the CLI surface mirrors what programmatic
6
-callers get.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-import typer
12
-
13
-from dlm_sway import __version__
14
-from dlm_sway.cli import commands
15
-
16
-app = typer.Typer(
17
-    name="dlm-sway",
18
-    no_args_is_help=True,
19
-    add_completion=False,
20
-    help="Differential testing for fine-tuned causal language models.",
21
-)
22
-
23
-
24
-def _version_callback(value: bool) -> None:
25
-    if value:
26
-        typer.echo(f"dlm-sway {__version__}")
27
-        raise typer.Exit()
28
-
29
-
30
-@app.callback()
31
-def _root(
32
-    version: bool = typer.Option(  # noqa: B008 — typer pattern
33
-        False,
34
-        "--version",
35
-        callback=_version_callback,
36
-        is_eager=True,
37
-        help="Print version and exit.",
38
-    ),
39
-) -> None:
40
-    """Root callback; accepts ``--version``."""
41
-    del version
42
-
43
-
44
-app.command("run")(commands.run_cmd)
45
-app.command("gate")(commands.gate_cmd)
46
-app.command("check")(commands.check_cmd)
47
-app.command("diff")(commands.diff_cmd)
48
-app.command("autogen")(commands.autogen_cmd)
49
-app.command("doctor")(commands.doctor_cmd)
50
-app.command("report")(commands.report_cmd)
51
-
52
-
53
-def main() -> None:
54
-    """Script entry point registered in :file:`pyproject.toml`."""
55
-    app()
56
-
57
-
58
-if __name__ == "__main__":
59
-    main()
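The app object can be exercised without installing the console script; a sketch using Typer's bundled test runner (``typer.testing.CliRunner`` wraps click's runner):

    from typer.testing import CliRunner

    from dlm_sway.cli.app import app

    runner = CliRunner()
    result = runner.invoke(app, ["--version"])
    assert result.exit_code == 0
    assert "dlm-sway" in result.output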
sway/src/dlm_sway/cli/commands.pydeleted
@@ -1,396 +0,0 @@
1
-"""Command implementations for the ``dlm-sway`` CLI.
2
-
3
-Each function here is wired to a subcommand in :mod:`dlm_sway.cli.app`.
4
-Commands deliberately do as little as possible themselves — the real
5
-work lives in :mod:`dlm_sway.suite`, :mod:`dlm_sway.backends`, and the
6
-probes package.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-import json
12
-import sys
13
-from pathlib import Path
14
-from typing import Annotated, Any
15
-
16
-import typer
17
-from rich.console import Console
18
-
19
-from dlm_sway import __version__
20
-from dlm_sway.core.errors import SwayError
21
-from dlm_sway.core.result import SuiteResult, SwayScore, Verdict
22
-
23
-
24
-def run_cmd(
25
-    spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
26
-    json_out: Annotated[
27
-        Path | None,
28
-        typer.Option(
29
-            "--json",
30
-            "-j",
31
-            help="Write the JSON report to this path in addition to the terminal render.",
32
-        ),
33
-    ] = None,
34
-    markdown_out: Annotated[
35
-        Path | None,
36
-        typer.Option("--markdown", "-m", help="Write a markdown report to this path."),
37
-    ] = None,
38
-) -> None:
39
-    """Execute a suite and render a terminal report."""
40
-    try:
41
-        result, score_obj = _execute_spec(spec)
42
-    except SwayError as exc:
43
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
44
-        raise typer.Exit(code=2) from exc
45
-
46
-    from dlm_sway.suite import report
47
-
48
-    console = Console()
49
-    report.to_terminal(result, score_obj, console=console)
50
-
51
-    if json_out is not None:
52
-        json_out.write_text(report.to_json(result, score_obj), encoding="utf-8")
53
-        console.print(f"\n[dim]wrote JSON → {json_out}[/dim]")
54
-    if markdown_out is not None:
55
-        markdown_out.write_text(report.to_markdown(result, score_obj), encoding="utf-8")
56
-        console.print(f"[dim]wrote markdown → {markdown_out}[/dim]")
57
-
58
-
59
-def gate_cmd(
60
-    spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
61
-    junit_out: Annotated[
62
-        Path | None, typer.Option("--junit", help="Write JUnit XML for CI ingestion.")
63
-    ] = None,
64
-    coverage_threshold: Annotated[
65
-        float | None,
66
-        typer.Option(
67
-            "--threshold",
68
-            help="Override the spec's coverage_threshold. Exit non-zero below it.",
69
-        ),
70
-    ] = None,
71
-) -> None:
72
-    """Execute a suite and exit non-zero on failure (CI gate)."""
73
-    try:
74
-        result, score_obj = _execute_spec(spec)
75
-    except SwayError as exc:
76
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
77
-        raise typer.Exit(code=2) from exc
78
-
79
-    from dlm_sway.suite import report
80
-    from dlm_sway.suite.loader import load_spec as _load_spec
81
-
82
-    console = Console()
83
-    report.to_terminal(result, score_obj, console=console)
84
-
85
-    if junit_out is not None:
86
-        junit_out.write_text(report.to_junit(result, score_obj), encoding="utf-8")
87
-        console.print(f"[dim]wrote JUnit → {junit_out}[/dim]")
88
-
89
-    threshold = (
90
-        coverage_threshold
91
-        if coverage_threshold is not None
92
-        else _load_spec(spec).defaults.coverage_threshold
93
-    )
94
-    has_failures = any(p.verdict == Verdict.FAIL for p in result.probes)
95
-    below_threshold = score_obj.overall < threshold
96
-    if has_failures or below_threshold:
97
-        console.print(
98
-            f"\n[red]gate FAILED[/red] — overall={score_obj.overall:.2f} < {threshold:.2f}"
99
-            if below_threshold
100
-            else "\n[red]gate FAILED[/red] — at least one probe reported FAIL"
101
-        )
102
-        raise typer.Exit(code=1)
103
-    console.print(f"\n[green]gate passed[/green] — overall={score_obj.overall:.2f}")
104
-
105
-
106
-def check_cmd(
107
-    adapter: Annotated[Path, typer.Argument(help="Path to a PEFT adapter directory.")],
108
-    base: Annotated[str, typer.Option("--base", help="HuggingFace base model id or local path.")],
109
-    prompts: Annotated[
110
-        Path | None,
111
-        typer.Option(
112
-            "--prompts",
113
-            help="File with one prompt per line. Defaults to sway's built-in quick set.",
114
-        ),
115
-    ] = None,
116
-) -> None:
117
-    """<60s smoke test: "is this adapter doing anything at all?".
118
-
119
-    Runs A1 DeltaKL + C2 CalibrationDrift on a small prompt set. No
120
-    spec file required.
121
-    """
122
-    from dlm_sway.backends import build as build_backend
123
-    from dlm_sway.core.model import ModelSpec
124
-    from dlm_sway.suite import report
125
-    from dlm_sway.suite.runner import run as run_suite
126
-    from dlm_sway.suite.score import compute as compute_score
127
-    from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec
128
-
129
-    quick_prompts = _load_prompts(prompts) if prompts else _BUILTIN_QUICK_PROMPTS
130
-
131
-    base_spec = ModelSpec(base=base, kind="hf")
132
-    ft_spec = ModelSpec(base=base, kind="hf", adapter=adapter)
133
-    spec = SwaySpec(
134
-        version=1,
135
-        models=SuiteModels(base=base_spec, ft=ft_spec),
136
-        defaults=SuiteDefaults(seed=0),
137
-        suite=[
138
-            {
139
-                "name": "quick_delta_kl",
140
-                "kind": "delta_kl",
141
-                "prompts": list(quick_prompts),
142
-                "assert_mean_gte": 0.01,
143
-            },
144
-            {
145
-                "name": "quick_calibration",
146
-                "kind": "calibration_drift",
147
-                "items_limit": 10,
148
-            },
149
-        ],
150
-    )
151
-    try:
152
-        backend = build_backend(ft_spec)
153
-    except SwayError as exc:
154
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
155
-        raise typer.Exit(code=2) from exc
156
-
157
-    try:
158
-        result = run_suite(spec, backend, spec_path="<check>")
159
-    finally:
160
-        _close_if_possible(backend)
161
-    score_obj = compute_score(result)
162
-    report.to_terminal(result, score_obj, console=Console())
163
-
164
-
165
-def diff_cmd(
166
-    spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
167
-    adapter_a: Annotated[Path, typer.Option("--a", help="First adapter path.")],
168
-    adapter_b: Annotated[Path, typer.Option("--b", help="Second adapter path.")],
169
-) -> None:
170
-    """Run the same suite against two adapters and show per-probe deltas."""
171
-    from dlm_sway.backends import build as build_backend
172
-    from dlm_sway.suite.loader import load_spec
173
-    from dlm_sway.suite.runner import run as run_suite
174
-    from dlm_sway.suite.score import compute as compute_score
175
-
176
-    sway_spec = load_spec(spec)
177
-    console = Console()
178
-
179
-    def _score_for(adapter_path: Path) -> tuple[float, dict[str, float]]:
180
-        ft_spec = sway_spec.models.ft.model_copy(update={"adapter": adapter_path})
181
-        backend = build_backend(ft_spec)
182
-        try:
183
-            result = run_suite(sway_spec, backend, spec_path=str(spec))
184
-        finally:
185
-            _close_if_possible(backend)
186
-        scored = compute_score(result)
187
-        per_probe = {p.name: (p.score or 0.0) for p in result.probes}
188
-        return scored.overall, per_probe
189
-
190
-    try:
191
-        overall_a, per_a = _score_for(adapter_a)
192
-        overall_b, per_b = _score_for(adapter_b)
193
-    except SwayError as exc:
194
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
195
-        raise typer.Exit(code=2) from exc
196
-
197
-    console.print(f"[bold]overall[/bold]  A: {overall_a:.2f}   B: {overall_b:.2f}")
198
-    console.print()
199
-    console.print("[bold]per-probe[/bold] (A → B, Δ):")
200
-    for name in sorted(per_a.keys() | per_b.keys()):
201
-        a = per_a.get(name, 0.0)
202
-        b = per_b.get(name, 0.0)
203
-        delta = b - a
204
-        sign = "+" if delta >= 0 else ""
205
-        console.print(f"  {name:<30}  {a:.2f}  →  {b:.2f}   ({sign}{delta:+.2f})")
206
-
207
-
208
-def autogen_cmd(
209
-    dlm_path: Annotated[Path, typer.Argument(help="Path to a .dlm file.")],
210
-    out: Annotated[
211
-        Path,
212
-        typer.Option("--out", "-o", help="Where to write the generated sway.yaml."),
213
-    ] = Path("sway.yaml"),
214
-) -> None:
215
-    """Generate a sway.yaml from a .dlm file (requires dlm-sway[dlm])."""
216
-    import importlib
217
-
218
-    try:
219
-        autogen_mod = importlib.import_module("dlm_sway.integrations.dlm.autogen")
220
-    except ImportError as exc:
221
-        typer.secho(
222
-            "dlm integration not installed — run: pip install 'dlm-sway[dlm]'",
223
-            err=True,
224
-            fg=typer.colors.RED,
225
-        )
226
-        raise typer.Exit(code=2) from exc
227
-
228
-    try:
229
-        autogen_mod.write_sway_yaml(dlm_path, out)
230
-    except SwayError as exc:
231
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
232
-        raise typer.Exit(code=2) from exc
233
-
234
-    typer.echo(f"wrote {out}")
235
-
236
-
237
-def doctor_cmd() -> None:
238
-    """Print backend availability and version info."""
239
-    console = Console()
240
-    console.print(f"[bold]dlm-sway[/bold] {__version__}")
241
-    console.print(f"  python:    {sys.version.split()[0]}")
242
-    console.print(f"  platform:  {sys.platform}")
243
-    console.print()
244
-
245
-    console.print("[bold]backends[/bold]")
246
-    console.print(
247
-        f"  hf:        {_probe_import('torch')} {_probe_import('transformers')} {_probe_import('peft')}"
248
-    )
249
-    console.print(f"  mlx:       {_probe_import('mlx')} {_probe_import('mlx_lm')}")
250
-    console.print(f"  semsim:    {_probe_import('sentence_transformers')}")
251
-    console.print(
252
-        f"  style+:    {_probe_import('spacy')} {_probe_import('textstat')} {_probe_import('nlpaug')}"
253
-    )
254
-    console.print(f"  dlm:       {_probe_import('dlm')}")
255
-    console.print(f"  viz:       {_probe_import('matplotlib')}")
256
-
257
-
258
-def report_cmd(
259
-    result_json: Annotated[Path, typer.Argument(help="Path to a saved result JSON.")],
260
-    format: Annotated[
261
-        str, typer.Option("--format", help="Output format: terminal, md, junit, json.")
262
-    ] = "terminal",
263
-) -> None:
264
-    """Re-render a previously saved run (for history tracking / dashboards)."""
265
-    raw: dict[str, Any] = json.loads(result_json.read_text(encoding="utf-8"))
266
-    fmt = format.lower()
267
-    if fmt == "json":
268
-        typer.echo(json.dumps(raw, indent=2, sort_keys=True))
269
-        return
270
-    if fmt in {"md", "markdown"}:
271
-        # A file-level re-render needs the dataclasses back; simplest is
272
-        # to synthesize a minimal markdown from the JSON directly.
273
-        typer.echo(_render_markdown_from_json(raw))
274
-        return
275
-    if fmt == "junit":
276
-        typer.echo(_render_junit_from_json(raw))
277
-        return
278
-    # Default: terminal-ish one-liner summary.
279
-    score: dict[str, Any] = raw.get("score", {})
280
-    typer.echo(f"overall: {score.get('overall', 0.0):.2f}  [{score.get('band', '?')}]")
281
-    probes: list[dict[str, Any]] = raw.get("probes", [])
282
-    for p in probes:
283
-        typer.echo(
284
-            f"  {p['name']:<30}  {p['verdict']:<6}  "
285
-            f"{(p.get('score') or 0.0):.2f}  {p.get('message', '')[:60]}"
286
-        )
287
-
288
-
289
-# -- helpers -----------------------------------------------------------
290
-
291
-
292
-_BUILTIN_QUICK_PROMPTS: tuple[str, ...] = (
293
-    "The quick brown fox",
294
-    "Once upon a time",
295
-    "The answer to the question is",
296
-    "One important lesson is",
297
-    "In my opinion,",
298
-    "The first step is to",
299
-    "Remember that",
300
-    "A common mistake is",
301
-)
302
-
303
-
304
-def _load_prompts(path: Path) -> tuple[str, ...]:
305
-    return tuple(
306
-        line.strip() for line in path.read_text(encoding="utf-8").splitlines() if line.strip()
307
-    )
308
-
309
-
310
-def _execute_spec(path: Path) -> tuple[SuiteResult, SwayScore]:
311
-    """Load a spec, build a backend, run the suite, fold scores. Shared
312
-    by ``run`` and ``gate``. Picks up .dlm-derived sections when the
313
-    spec's ``dlm_source`` is set."""
314
-    from dlm_sway.backends import build as build_backend
315
-    from dlm_sway.suite.loader import load_spec
316
-    from dlm_sway.suite.runner import run as run_suite
317
-    from dlm_sway.suite.score import compute as compute_score
318
-
319
-    spec = load_spec(path)
320
-    sections = None
321
-    doc_text = None
322
-    if spec.dlm_source is not None:
323
-        import importlib
324
-
325
-        try:
326
-            resolver = importlib.import_module("dlm_sway.integrations.dlm.resolver")
327
-            handle = resolver.resolve_dlm(Path(spec.dlm_source))
328
-            sections = handle.sections
329
-            doc_text = handle.doc_text
330
-        except ImportError:
331
-            # Honoring dlm_source is best-effort — probes that need
332
-            # sections will SKIP with a pointer at the extra.
333
-            sections = None
334
-    backend = build_backend(spec.models.ft)
335
-    try:
336
-        result = run_suite(spec, backend, spec_path=str(path), sections=sections, doc_text=doc_text)
337
-    finally:
338
-        _close_if_possible(backend)
339
-    score_obj = compute_score(result)
340
-    return result, score_obj
341
-
342
-
343
-def _close_if_possible(backend: object) -> None:
344
-    close = getattr(backend, "close", None)
345
-    if callable(close):
346
-        close()
347
-
348
-
349
-def _probe_import(name: str) -> str:
350
-    import importlib
351
-
352
-    try:
353
-        mod = importlib.import_module(name)
354
-    except ImportError:
355
-        return f"[red]{name}: missing[/red]"
356
-    ver = getattr(mod, "__version__", "installed")
357
-    return f"[green]{name}: {ver}[/green]"
358
-
359
-
360
-def _render_markdown_from_json(raw: dict[str, Any]) -> str:
361
-    score: dict[str, Any] = raw.get("score", {})
362
-    lines: list[str] = [
363
-        "# dlm-sway report",
364
-        "",
365
-        f"**Overall:** {score.get('overall', 0.0):.2f} (`{score.get('band', '?')}`)  ",
366
-        f"**Base:** `{raw.get('base_model_id', '?')}`  ",
367
-        f"**Adapter:** `{raw.get('adapter_id', '?')}`  ",
368
-        "",
369
-        "## Probes",
370
-        "",
371
-        "| name | kind | verdict | score |",
372
-        "|---|---|---|---:|",
373
-    ]
374
-    probes: list[dict[str, Any]] = raw.get("probes", [])
375
-    for p in probes:
376
-        lines.append(
377
-            f"| {p['name']} | `{p['kind']}` | {p['verdict']} | {(p.get('score') or 0.0):.2f} |"
378
-        )
379
-    return "\n".join(lines)
380
-
381
-
382
-def _render_junit_from_json(raw: dict[str, Any]) -> str:
383
-    """Minimal JUnit renderer from a saved JSON (useful for report --format junit)."""
384
-    import xml.etree.ElementTree as ET
385
-
386
-    probes: list[dict[str, Any]] = raw.get("probes", [])
387
-    testsuite = ET.Element("testsuite", {"name": "dlm-sway", "tests": str(len(probes))})
388
-    for p in probes:
389
-        tc = ET.SubElement(testsuite, "testcase", {"classname": p["kind"], "name": p["name"]})
390
-        if p["verdict"] == "fail":
391
-            ET.SubElement(tc, "failure", {"message": p.get("message", "")})
392
-        elif p["verdict"] == "error":
393
-            ET.SubElement(tc, "error", {"message": p.get("message", "")})
394
-        elif p["verdict"] == "skip":
395
-            ET.SubElement(tc, "skipped", {"message": p.get("message", "")})
396
-    return ET.tostring(testsuite, encoding="unicode")
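To make the saved-JSON shape concrete, a sketch of feeding a minimal payload through ``_render_markdown_from_json`` (every field value below is invented for illustration):

    raw = {
        "score": {"overall": 0.72, "band": "healthy"},
        "base_model_id": "HuggingFaceTB/SmolLM2-135M-Instruct",
        "adapter_id": "out/adapter",
        "probes": [
            {"name": "delta_kl_doc", "kind": "delta_kl", "verdict": "pass", "score": 0.81},
        ],
    }
    print(_render_markdown_from_json(raw))
    # ...renders the header block plus one table row:
    # | delta_kl_doc | `delta_kl` | pass | 0.81 |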
sway/src/dlm_sway/core/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""Core abstractions: protocols, results, errors, determinism."""
sway/src/dlm_sway/core/determinism.pydeleted
@@ -1,97 +0,0 @@
1
-"""Deterministic-execution helper.
2
-
3
-Mirrors ``dlm.train.determinism.seed_everything`` so running the same
4
-suite twice on the same host produces the same :class:`ProbeResult`
5
-payloads. The dlm project treats determinism as a contract; sway takes
6
-the same posture for scoring operations.
7
-
8
-Generation is allowed to use non-deterministic attention kernels when
9
-``temperature > 0``, because a deterministic sampled generation is a
10
-contradiction. Scoring (logprobs, rolling logprobs, next-token dists)
11
-always runs under ``torch.use_deterministic_algorithms(True)``.
12
-"""
13
-
14
-from __future__ import annotations
15
-
16
-import os
17
-import random
18
-from dataclasses import dataclass
19
-from typing import Literal
20
-
21
-DeterminismClass = Literal["strict", "best_effort", "loose"]
22
-
23
-
24
-@dataclass(frozen=True, slots=True)
25
-class DeterminismSummary:
26
-    """What seeding actually accomplished, for logging in the report."""
27
-
28
-    class_: DeterminismClass
29
-    seed: int
30
-    notes: tuple[str, ...] = ()
31
-
32
-
33
-def seed_everything(seed: int, *, strict: bool = True) -> DeterminismSummary:
34
-    """Seed every RNG sway's probes touch and flip backend flags.
35
-
36
-    Idempotent — safe to call repeatedly with the same seed.
37
-
38
-    Parameters
39
-    ----------
40
-    seed:
41
-        The seed. Callers typically use the value from ``sway.yaml``'s
42
-        ``defaults.seed`` (default 0).
43
-    strict:
44
-        If ``True`` (the default), request deterministic CUDA algorithms
45
-        and set ``CUBLAS_WORKSPACE_CONFIG``. Scoring probes need this;
46
-        generation-only runs can set it ``False``.
47
-
48
-    Returns
49
-    -------
50
-    :class:`DeterminismSummary` with a classification:
51
-
52
-    - ``"strict"`` — deterministic algorithms active, no warnings.
53
-    - ``"best_effort"`` — platform doesn't support full determinism
54
-      (MPS, some CPU kernels).
55
-    - ``"loose"`` — seeded but deterministic algorithms refused.
56
-    """
57
-
58
-    notes: list[str] = []
59
-    clazz: DeterminismClass = "best_effort"
60
-
61
-    # Env vars must come first — torch reads them at cuBLAS init.
62
-    if strict:
63
-        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
64
-
65
-    random.seed(seed)
66
-
67
-    # numpy is a hard dep; safe to seed unconditionally.
68
-    import numpy as np
69
-
70
-    np.random.seed(seed)
71
-
72
-    try:
73
-        import torch  # noqa: PLC0415 — lazy: torch is an optional extra.
74
-    except ModuleNotFoundError:
75
-        notes.append("torch not installed; seeded python + numpy only")
76
-        return DeterminismSummary(class_="best_effort", seed=seed, notes=tuple(notes))
77
-
78
-    torch.manual_seed(seed)
79
-    if torch.cuda.is_available():
80
-        torch.cuda.manual_seed_all(seed)
81
-        clazz = "strict"
82
-    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
83
-        clazz = "best_effort"
84
-        notes.append("MPS: bit-identical across runs is best-effort")
85
-    else:
86
-        clazz = "best_effort"
87
-        notes.append("CPU-only backend: strict determinism depends on BLAS impl")
88
-
89
-    if strict:
90
-        try:
91
-            torch.use_deterministic_algorithms(True, warn_only=True)
92
-            torch.backends.cudnn.benchmark = False
93
-        except Exception as exc:  # noqa: BLE001 — torch raises a naked Exception
94
-            clazz = "loose"
95
-            notes.append(f"deterministic algorithms refused: {exc}")
96
-
97
-    return DeterminismSummary(class_=clazz, seed=seed, notes=tuple(notes))
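A quick sketch of the call-site pattern; the printed classification depends on the host (the comment values follow directly from the code above):

    from dlm_sway.core.determinism import seed_everything

    summary = seed_everything(0, strict=True)
    print(summary.class_)        # "strict" on CUDA, "best_effort" on MPS/CPU-only
    for note in summary.notes:   # e.g. "MPS: bit-identical across runs is best-effort"
        print("note:", note)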
sway/src/dlm_sway/core/errors.pydeleted
@@ -1,65 +0,0 @@
1
-"""Exception hierarchy for dlm-sway.
2
-
3
-Every error sway raises inherits from :class:`SwayError` so callers can
4
-catch the whole family with a single ``except``. Subclasses carry enough
5
-context (spec paths, probe names, missing extras) for the CLI to render
6
-actionable messages without the caller having to introspect an exception
7
-chain.
8
-"""
9
-
10
-from __future__ import annotations
11
-
12
-
13
-class SwayError(Exception):
14
-    """Root of the dlm-sway exception hierarchy."""
15
-
16
-
17
-class SpecValidationError(SwayError):
18
-    """A ``sway.yaml`` (or equivalent) failed pydantic validation.
19
-
20
-    Parameters
21
-    ----------
22
-    message:
23
-        Human-readable summary of what went wrong.
24
-    source:
25
-        Path or identifier of the spec being validated, if known.
26
-    """
27
-
28
-    def __init__(self, message: str, *, source: str | None = None) -> None:
29
-        super().__init__(message)
30
-        self.source = source
31
-
32
-    def __str__(self) -> str:
33
-        base = super().__str__()
34
-        return f"{self.source}: {base}" if self.source else base
35
-
36
-
37
-class BackendNotAvailableError(SwayError):
38
-    """A requested backend's optional dependencies aren't installed.
39
-
40
-    The CLI turns this into a pointed ``pip install dlm-sway[<extra>]``
41
-    hint; programmatic callers can read :attr:`extra` directly.
42
-    """
43
-
44
-    def __init__(self, backend: str, *, extra: str, hint: str | None = None) -> None:
45
-        message = (
46
-            f"backend {backend!r} unavailable — install the extra: pip install 'dlm-sway[{extra}]'"
47
-        )
48
-        if hint:
49
-            message = f"{message}\n{hint}"
50
-        super().__init__(message)
51
-        self.backend = backend
52
-        self.extra = extra
53
-
54
-
55
-class ProbeError(SwayError):
56
-    """A probe failed to *execute* (as opposed to failing its assertion).
57
-
58
-    Distinct from a ``verdict=FAIL`` result — assertion failures are
59
-    normal and reported via :class:`ProbeResult`. This is for genuine
60
-    bugs: missing sections, mismatched tokenizers, NaN logits.
61
-    """
62
-
63
-    def __init__(self, probe: str, message: str) -> None:
64
-        super().__init__(f"probe {probe!r}: {message}")
65
-        self.probe = probe
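A sketch of the single-``except`` pattern the hierarchy is designed for (the raised error is constructed by hand purely for illustration):

    from dlm_sway.core.errors import BackendNotAvailableError, SwayError

    try:
        raise BackendNotAvailableError("mlx", extra="mlx")
    except SwayError as exc:
        # One handler catches the whole family; subclass data stays reachable.
        if isinstance(exc, BackendNotAvailableError):
            print(f"install hint extra: {exc.extra}")
        print(exc)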
sway/src/dlm_sway/core/model.pydeleted
@@ -1,112 +0,0 @@
1
-"""The :class:`Model` abstraction and :class:`ModelSpec` user-facing config.
2
-
3
-Probes operate on objects that satisfy :class:`Model` (for generation)
4
-and :class:`~dlm_sway.core.scoring.ScoringBackend` (for logit-level
5
-access). Backends return concrete instances of both — they are
6
-deliberately separate Protocols because not every backend exposes logits
7
-(e.g. an Ollama HTTP backend would implement ``Model`` but not
8
-``ScoringBackend``).
9
-
10
-The user-facing surface is :class:`ModelSpec`, a pydantic model that
11
-describes how to materialize a base + adapter pair. No ``.dlm``
12
-concepts live at this layer — those belong in
13
-:mod:`dlm_sway.integrations.dlm`.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-from dataclasses import dataclass
19
-from pathlib import Path
20
-from typing import Any, Literal, Protocol, runtime_checkable
21
-
22
-from pydantic import BaseModel, ConfigDict, Field
23
-
24
-BackendKind = Literal["hf", "mlx", "dummy", "custom"]
25
-"""Registered scoring-backend kinds.
26
-
27
-``custom`` is an escape hatch — the runner looks up an entry point when
28
-it sees ``custom`` in a spec.
29
-"""
30
-
31
-
32
-class ModelSpec(BaseModel):
33
-    """How to materialize one model (base or fine-tuned)."""
34
-
35
-    model_config = ConfigDict(extra="forbid", frozen=True)
36
-
37
-    kind: BackendKind = "hf"
38
-    base: str
39
-    """HuggingFace repo id (``HuggingFaceTB/SmolLM2-135M-Instruct``) or
40
-    a local path to a model directory."""
41
-
42
-    adapter: Path | None = None
43
-    """Path to a PEFT adapter directory (containing ``adapter_config.json``
44
-    and ``adapter_model.safetensors``). ``None`` → base-only model."""
45
-
46
-    dtype: Literal["auto", "fp16", "bf16", "fp32"] = "auto"
47
-    device: str = "auto"
48
-    """``"auto"`` chooses CUDA → MPS → CPU in that order."""
49
-
50
-    trust_remote_code: bool = False
51
-    """HuggingFace ``trust_remote_code`` passthrough. Off by default —
52
-    the user must opt in explicitly, matching sway's no-surprises
53
-    posture."""
54
-
55
-    entry_point: str | None = Field(default=None)
56
-    """Required when ``kind='custom'``. Import path like
57
-    ``mypkg.mybackend:MyBackend``."""
58
-
59
-
60
-@dataclass(frozen=True, slots=True)
61
-class LoadedModel:
62
-    """A materialized model plus the tokenizer that produced it.
63
-
64
-    Returned by backend ``load()`` methods. Probes usually don't touch
65
-    this directly — they go through the :class:`Model` /
66
-    :class:`~dlm_sway.core.scoring.ScoringBackend` Protocols.
67
-    """
68
-
69
-    id: str
70
-    """Stable handle: ``"base"`` or ``"ft"`` typically."""
71
-    spec: ModelSpec
72
-    model: Any
73
-    """Framework-native handle (torch ``nn.Module``, MLX array module …).
74
-
75
-    Typed as ``Any`` because the frameworks themselves ship unstubbed.
76
-    Backend implementations narrow this at their boundary."""
77
-    tokenizer: Any
78
-    meta: dict[str, Any]
79
-    """Backend-captured metadata: device, dtype, adapter version, bytes
80
-    on disk, num trainable params. Surfaced in the suite report."""
81
-
82
-
83
-@runtime_checkable
84
-class Model(Protocol):
85
-    """Minimum interface for text generation.
86
-
87
-    Implemented by backend-wrapped model objects. Probes that need logits
88
-    also require :class:`~dlm_sway.core.scoring.ScoringBackend`.
89
-    """
90
-
91
-    id: str
92
-
93
-    def generate(
94
-        self,
95
-        prompt: str,
96
-        *,
97
-        max_new_tokens: int,
98
-        temperature: float = 0.0,
99
-        top_p: float = 1.0,
100
-        seed: int = 0,
101
-    ) -> str:
102
-        """Generate a completion.
103
-
104
-        Defaults (``temperature=0``, ``top_p=1``) are greedy-decode for
105
-        reproducibility. Callers wanting sampled output must pass
106
-        non-defaults *and* a seed.
107
-        """
108
-        ...
109
-
110
-    def close(self) -> None:
111
-        """Release any resources held by this model."""
112
-        ...
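A construction sketch; the repo id and adapter path are illustrative. Because the spec is frozen with ``extra="forbid"``, updates go through pydantic's ``model_copy`` (as ``diff_cmd`` does) rather than attribute assignment:

    from pathlib import Path

    from dlm_sway.core.model import ModelSpec

    ft = ModelSpec(
        base="HuggingFaceTB/SmolLM2-135M-Instruct",  # kind defaults to "hf"
        adapter=Path("out/adapter"),
        dtype="bf16",
    )
    other = ft.model_copy(update={"adapter": Path("out/adapter-b")})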
sway/src/dlm_sway/core/result.pydeleted
@@ -1,139 +0,0 @@
1
-"""Probe and suite result types.
2
-
3
-Every numeric probe ultimately returns a :class:`ProbeResult`. The suite
4
-runner collects them into a :class:`SuiteResult` and the scorer folds
5
-that into a single :class:`SwayScore` with transparent per-component
6
-weights.
7
-
8
-These dataclasses are deliberately plain — no pydantic — because they
9
-cross probe/backend boundaries hundreds of times per run and a free
10
-``model_validate`` on every construction would dominate the runtime of
11
-cheap probes.
12
-"""
13
-
14
-from __future__ import annotations
15
-
16
-from dataclasses import dataclass, field
17
-from datetime import UTC, datetime
18
-from enum import StrEnum
19
-from typing import Any
20
-
21
-
22
-class Verdict(StrEnum):
23
-    """Outcome of a single probe against its assertion."""
24
-
25
-    PASS = "pass"
26
-    FAIL = "fail"
27
-    WARN = "warn"
28
-    SKIP = "skip"
29
-    ERROR = "error"
30
-
31
-
32
-@dataclass(frozen=True, slots=True)
33
-class ProbeResult:
34
-    """The result of running one probe.
35
-
36
-    Attributes
37
-    ----------
38
-    name:
39
-        User-facing name from the spec (unique within a suite).
40
-    kind:
41
-        Probe discriminator (``delta_kl``, ``section_internalization`` …).
42
-    verdict:
43
-        Pass / fail / warn / skip / error.
44
-    score:
45
-        Normalized [0, 1] score. ``sigmoid(z_vs_null / 3)`` for numeric
46
-        probes; 1.0 / 0.0 for binary ones. ``None`` for :attr:`Verdict.SKIP`.
47
-    raw:
48
-        The raw metric value (e.g. KL=0.083). Probe-specific units.
49
-    z_score:
50
-        Standard deviations above the null-adapter baseline. ``None``
51
-        when no null calibration was run.
52
-    base_value:
53
-        The metric evaluated on the base model, when meaningful.
54
-    ft_value:
55
-        The metric evaluated on the fine-tuned model, when meaningful.
56
-    evidence:
57
-        Small structured payload for the report — prompts, example
58
-        completions, per-section breakdowns. Kept bounded (<10 KB) so
59
-        suite JSON stays under a megabyte.
60
-    message:
61
-        One-line diagnostic. Surfaces in the terminal report.
62
-    duration_s:
63
-        Wall time to execute.
64
-    """
65
-
66
-    name: str
67
-    kind: str
68
-    verdict: Verdict
69
-    score: float | None
70
-    raw: float | None = None
71
-    z_score: float | None = None
72
-    base_value: float | None = None
73
-    ft_value: float | None = None
74
-    evidence: dict[str, Any] = field(default_factory=dict)
75
-    message: str = ""
76
-    duration_s: float = 0.0
77
-
78
-
79
-@dataclass(frozen=True, slots=True)
80
-class SuiteResult:
81
-    """A full run of a sway.yaml suite."""
82
-
83
-    spec_path: str
84
-    started_at: datetime
85
-    finished_at: datetime
86
-    base_model_id: str
87
-    adapter_id: str
88
-    sway_version: str
89
-    probes: tuple[ProbeResult, ...] = ()
90
-    null_stats: dict[str, dict[str, float]] = field(default_factory=dict)
91
-    """Per-primitive null-adapter baseline stats (mean, std, runs). Used
92
-    to turn raw metrics into z-scores when rendering the report."""
93
-
94
-    @property
95
-    def wall_seconds(self) -> float:
96
-        return (self.finished_at - self.started_at).total_seconds()
97
-
98
-
99
-# Component weights for the composite score. Overridable in sway.yaml.
100
-DEFAULT_COMPONENT_WEIGHTS: dict[str, float] = {
101
-    "adherence": 0.30,
102
-    "attribution": 0.35,
103
-    "calibration": 0.20,
104
-    "ablation": 0.15,
105
-}
106
-
107
-
108
-@dataclass(frozen=True, slots=True)
109
-class SwayScore:
110
-    """Composite score with a transparent per-component breakdown."""
111
-
112
-    overall: float
113
-    components: dict[str, float]
114
-    weights: dict[str, float] = field(default_factory=lambda: dict(DEFAULT_COMPONENT_WEIGHTS))
115
-    band: str = ""
116
-    findings: tuple[str, ...] = ()
117
-
118
-    @staticmethod
119
-    def band_for(overall: float) -> str:
120
-        """Map a score to a human-readable band.
121
-
122
-        Bands (from the plan):
123
-          - <0.3  : indistinguishable from noise
124
-          - 0.3–0.6 : partial fit
125
-          - 0.6–0.85: healthy
126
-          - >0.85 : suspiciously good (possible overfit / memorization)
127
-        """
128
-        if overall < 0.3:
129
-            return "noise"
130
-        if overall < 0.6:
131
-            return "partial"
132
-        if overall <= 0.85:
133
-            return "healthy"
134
-        return "suspicious"
135
-
136
-
137
-def utcnow() -> datetime:
138
-    """Timezone-aware UTC timestamp (used by the runner)."""
139
-    return datetime.now(UTC)
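For a concrete feel for the weights, a sketch that folds hand-picked component scores with ``DEFAULT_COMPONENT_WEIGHTS``. The real fold lives in ``dlm_sway.suite.score`` and may differ; the plain weighted sum here is an assumption:

    from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS, SwayScore

    components = {"adherence": 0.8, "attribution": 0.7, "calibration": 0.6, "ablation": 0.5}
    overall = sum(DEFAULT_COMPONENT_WEIGHTS[k] * v for k, v in components.items())
    print(f"{overall:.2f}", SwayScore.band_for(overall))  # 0.68 healthy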
sway/src/dlm_sway/core/scoring.pydeleted
@@ -1,203 +0,0 @@
1
-"""Scoring protocols: logprobs, next-token distributions, differential toggling.
2
-
3
-Scoring is **separate** from generation because not every backend can
4
-provide logits. Every numeric sway probe depends on at least one of
5
-three operations:
6
-
7
-1. ``logprob_of(prompt, completion)`` — score a completion against a
8
-   prompt (A1, B2, B3, C2, …).
9
-2. ``rolling_logprob(text)`` — perplexity over a piece of text (B1,
10
-   C2).
11
-3. ``next_token_dist(prompt, top_k)`` — the raw next-token distribution
12
-   at a single position (A1, N2).
13
-
14
-The :class:`DifferentialBackend` is the key performance primitive:
15
-both base and fine-tuned views share the same loaded weights and KV
16
-cache layout, toggled via PEFT's :meth:`set_adapter` /
17
-:meth:`disable_adapter`. A naive "load twice" implementation would
18
-double memory and halve throughput.
19
-"""
20
-
21
-from __future__ import annotations
22
-
23
-from contextlib import AbstractContextManager
24
-from dataclasses import dataclass, field
25
-from typing import Protocol, runtime_checkable
26
-
27
-import numpy as np
28
-from numpy.typing import NDArray
29
-
30
-from dlm_sway.core.model import Model
31
-
32
-
33
-@dataclass(frozen=True, slots=True)
34
-class RollingLogprob:
35
-    """Per-token logprobs over a piece of text, plus summary stats.
36
-
37
-    Attributes
38
-    ----------
39
-    token_ids:
40
-        The tokenizer output for ``text``. Length ``N``.
41
-    logprobs:
42
-        ``log p(token_i | token_<i)`` for each position i ≥ 1. Length
43
-        ``N-1``.
44
-    num_tokens:
45
-        ``N`` — included for convenience; ``len(token_ids)``.
46
-    total_logprob:
47
-        Sum of :attr:`logprobs`.
48
-    """
49
-
50
-    token_ids: NDArray[np.int64]
51
-    logprobs: NDArray[np.float32]
52
-    num_tokens: int
53
-    total_logprob: float
54
-
55
-    @property
56
-    def mean_logprob(self) -> float:
57
-        n = self.logprobs.size
58
-        return float(self.total_logprob / n) if n else 0.0
59
-
60
-    @property
61
-    def perplexity(self) -> float:
62
-        """``exp(-mean_logprob)``. Base-e, natural perplexity."""
63
-        return float(np.exp(-self.mean_logprob))
64
-
65
-
66
-@dataclass(frozen=True, slots=True)
67
-class TokenDist:
68
-    """A (possibly top-k truncated) next-token probability distribution.
69
-
70
-    For KL / JS divergence probes, sway needs matched distributions
71
-    across base and fine-tuned views. The runner is responsible for
72
-    aligning ``top_k`` token slices between two ``TokenDist`` objects
73
-    before handing them to divergence math.
74
-    """
75
-
76
-    token_ids: NDArray[np.int64]
77
-    """Token ids, descending by probability. Length ``k``."""
78
-    logprobs: NDArray[np.float32]
79
-    """Log-probabilities for :attr:`token_ids`. Length ``k``."""
80
-    vocab_size: int
81
-    """Full vocab size — needed to renormalize top-k truncated slices."""
82
-    tail_logprob: float = field(default=0.0)
83
-    """log of (1 - sum of exp(logprobs[:k])); 0 if top_k covers the full vocab."""
84
-
85
-
86
-@runtime_checkable
87
-class ScoringBackend(Protocol):
88
-    """Logit-level access to a loaded model."""
89
-
90
-    def logprob_of(self, prompt: str, completion: str) -> float:
91
-        """Sum of log-probabilities of ``completion`` tokens given ``prompt``.
92
-
93
-        The prompt is *not* scored; only the completion contributes. The
94
-        value is in nats (natural log). Longer completions are
95
-        monotonically more negative — callers normalize by length if
96
-        they need a rate.
97
-        """
98
-        ...
99
-
100
-    def rolling_logprob(self, text: str) -> RollingLogprob:
101
-        """Compute per-token logprobs for the whole of ``text``.
102
-
103
-        Equivalent to lm-eval's ``loglikelihood_rolling``. Used for
104
-        perplexity comparison on held-out content (B1 SIS, C2).
105
-        """
106
-        ...
107
-
108
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
109
-        """Next-token distribution at the position after ``prompt``.
110
-
111
-        Truncated to ``top_k`` for memory; callers doing divergence math
112
-        over the top-k slice accept the (typically negligible) error vs
113
-        full-vocab KL.
114
-        """
115
-        ...
116
-
117
-
118
-@runtime_checkable
119
-class DifferentialBackend(Protocol):
120
-    """A backend that holds base + fine-tuned views on a single loaded model.
121
-
122
-    The idiomatic usage is::
123
-
124
-        with backend.as_base() as base_view:
125
-            p_base = base_view.next_token_dist(prompt)
126
-        with backend.as_finetuned() as ft_view:
127
-            p_ft = ft_view.next_token_dist(prompt)
128
-
129
-    Implementations toggle PEFT adapters via
130
-    :meth:`peft.PeftModel.set_adapter` / :meth:`disable_adapter`.
131
-
132
-    Invariant: the two views must **not** be simultaneously usable. A
133
-    caller holding a ``base_view`` after entering the ``as_finetuned``
134
-    context is a programmer error and implementations MUST detect and
135
-    raise.
136
-    """
137
-
138
-    def as_base(self) -> AbstractContextManager[_ScoringModel]: ...
139
-
140
-    def as_finetuned(self) -> AbstractContextManager[_ScoringModel]: ...
141
-
142
-
143
-@runtime_checkable
144
-class ScalableDifferentialBackend(DifferentialBackend, Protocol):
145
-    """A differential backend that can also scale the LoRA additive term.
146
-
147
-    LoRA applies ``W + (alpha/r) · B @ A`` to a base weight matrix. This
148
-    protocol exposes a context manager that temporarily multiplies that
149
-    additive term by ``lam`` for everything inside the ``with`` block.
150
-
151
-    ``lam = 0.0`` is equivalent to :meth:`as_base`.
152
-    ``lam = 1.0`` is equivalent to :meth:`as_finetuned`.
153
-    ``lam = 1.25`` overshoots — useful for N2 AdapterAblation's
154
-    response-curve measurement.
155
-
156
-    Only the HF backend ships an implementation in v0.1. Probes that
157
-    need scaling check via ``isinstance(backend, ScalableDifferentialBackend)``
158
-    at runtime and SKIP gracefully when unavailable.
159
-    """
160
-
161
-    def as_scaled_adapter(self, lam: float) -> AbstractContextManager[_ScoringModel]: ...
162
-
163
-
164
-@runtime_checkable
165
-class NullCalibratedBackend(DifferentialBackend, Protocol):
166
-    """A differential backend that can produce a "null adapter" view.
167
-
168
-    A null adapter has the *same structure* (rank, alpha, target modules)
169
-    as the real adapter but with weights drawn from a zero-mean Gaussian.
170
-    Running probes against this view yields the baseline "how much
171
-    signal does random noise produce" distribution — the denominator in
172
-    every numeric probe's z-score.
173
-
174
-    The context manager takes a ``seed`` so calibration runs can be
175
-    reproduced and multiple independent null samples can be drawn to
176
-    estimate ``std``.
177
-
178
-    Implementations MUST restore the real adapter on exit, including
179
-    on exceptions, so a caller can freely interleave null and real
180
-    calibrations within the same backend lifetime.
181
-    """
182
-
183
-    def as_null_adapter(
184
-        self, seed: int, *, init_scale: float = 0.02
185
-    ) -> AbstractContextManager[_ScoringModel]: ...
186
-
187
-
188
-# Helper Protocol for type-checking the yielded context object: it
189
-# must satisfy both Model and ScoringBackend. mypy doesn't support
190
-# intersection types, so we spell it out explicitly.
191
-@runtime_checkable
192
-class _ScoringModel(Model, ScoringBackend, Protocol):
193
-    """A Model that also exposes ScoringBackend."""
194
-
195
-    ...
196
-
197
-
198
-ScoringModel = _ScoringModel
199
-"""Public alias for the intersection ``Model & ScoringBackend``.
200
-
201
-Exported for backend and probe implementations that need to annotate
202
-variables of this combined type.
203
-"""
sway/src/dlm_sway/core/sections.pydeleted
@@ -1,76 +0,0 @@
1
-"""Minimal section contract for attribution probes.
2
-
3
-The flagship B1 ``section_internalization`` probe needs *structured*
4
-input — a section has an id, a kind, content text, and possibly some
5
-Q/A pairs or chosen/rejected triples. sway defines this shape here so
6
-the probes stay oblivious to the upstream (``.dlm`` parser, custom
7
-loaders, synthetic test fixtures).
8
-
9
-Field names are aligned with :mod:`dlm.doc.sections` but this module
10
-does not import ``dlm`` — the bridge at
11
-:mod:`dlm_sway.integrations.dlm` does the adaptation.
12
-"""
13
-
14
-from __future__ import annotations
15
-
16
-from dataclasses import dataclass, field
17
-from typing import Literal
18
-
19
-SectionKind = Literal["prose", "instruction", "preference"]
20
-
21
-
22
-@dataclass(frozen=True, slots=True)
23
-class SectionProbe:
24
-    """A ``(prompt, gold)`` pair lifted from an INSTRUCTION section."""
25
-
26
-    prompt: str
27
-    gold: str
28
-
29
-
30
-@dataclass(frozen=True, slots=True)
31
-class SectionPreference:
32
-    """A ``(prompt, chosen, rejected)`` triple from a PREFERENCE section."""
33
-
34
-    prompt: str
35
-    chosen: str
36
-    rejected: str
37
-
38
-
39
-@dataclass(frozen=True, slots=True)
40
-class Section:
41
-    """One typed chunk of a training document.
42
-
43
-    Attributes
44
-    ----------
45
-    id:
46
-        Content-addressed identifier. ``.dlm`` uses a 16-hex-char
47
-        sha256 prefix; sway doesn't enforce a format.
48
-    kind:
49
-        Discriminator for which of :attr:`probes` /
50
-        :attr:`preferences` / :attr:`content` is the primary signal.
51
-    content:
52
-        Raw section text. Always populated; used by the rolling-PPL
53
-        path for PROSE sections.
54
-    probes:
55
-        For INSTRUCTION: parsed Q/A pairs. Empty tuple for others.
56
-    preferences:
57
-        For PREFERENCE: parsed chosen/rejected triples. Empty otherwise.
58
-    tag:
59
-        Optional free-form label for the section (e.g., "intro",
60
-        "api-reference"). Surfaces in per-section reports.
61
-    """
62
-
63
-    id: str
64
-    kind: SectionKind
65
-    content: str
66
-    probes: tuple[SectionProbe, ...] = field(default_factory=tuple)
67
-    preferences: tuple[SectionPreference, ...] = field(default_factory=tuple)
68
-    tag: str | None = None
69
-
70
-
71
-def filter_kinds(
72
-    sections: tuple[Section, ...], kinds: tuple[SectionKind, ...]
73
-) -> tuple[Section, ...]:
74
-    """Return only sections whose ``kind`` matches one of ``kinds``."""
75
-    allow = set(kinds)
76
-    return tuple(s for s in sections if s.kind in allow)
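A synthetic-fixture sketch showing the shape probes consume (id and text invented):

    from dlm_sway.core.sections import Section, SectionProbe, filter_kinds

    sec = Section(
        id="a1b2c3d4e5f60718",  # any string; .dlm happens to use sha256 prefixes
        kind="instruction",
        content="Q: What is sway?\nA: A differential test harness.",
        probes=(SectionProbe(prompt="What is sway?", gold="A differential test harness."),),
    )
    assert filter_kinds((sec,), ("instruction",)) == (sec,)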
sway/src/dlm_sway/integrations/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""Optional integrations with upstream fine-tuning tools."""
sway/src/dlm_sway/integrations/dlm/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""DLM project integration. Imports the ``dlm`` package; requires ``dlm-sway[dlm]``."""
sway/src/dlm_sway/integrations/dlm/autogen.pydeleted
@@ -1,191 +0,0 @@
1
-"""Auto-generate a ``sway.yaml`` from a ``.dlm`` document.
2
-
3
-Walks the parsed sections and emits one entry per primitive sway ships:
4
-the full 11-primitive battery wired up against the document's own
5
-content. The result is a YAML artifact the user commits alongside their
6
-``.dlm`` and diffs in PRs.
7
-
8
-The generated spec includes a ``dlm_source`` field that the suite loader
9
-uses to pick up :class:`~dlm_sway.core.sections.Section` data at run
10
-time — probes that need sections (B1, B3, C3) then work against the
11
-typed structure instead of re-parsing text.
12
-"""
13
-
14
-from __future__ import annotations
15
-
16
-from pathlib import Path
17
-from typing import Any
18
-
19
-import yaml
20
-
21
-from dlm_sway.core.errors import SwayError
22
-from dlm_sway.core.sections import Section
23
-from dlm_sway.integrations.dlm.resolver import DlmHandle, resolve_dlm
24
-
25
-
26
-def write_sway_yaml(dlm_path: Path, out: Path) -> None:
27
-    """Resolve the .dlm, build a spec dict, write it as YAML to ``out``."""
28
-    handle = resolve_dlm(dlm_path)
29
-    if handle.adapter_path is None:
30
-        raise SwayError(
31
-            f"{dlm_path}: no trained adapter found at ~/.dlm/store/{handle.dlm_id}/adapter; "
32
-            "train the document with `dlm train` before generating a sway suite."
33
-        )
34
-    spec = build_spec_dict(handle, dlm_source=str(dlm_path.resolve()))
35
-    out.write_text(yaml.safe_dump(spec, sort_keys=False), encoding="utf-8")
36
-
37
-
38
-def build_spec_dict(handle: DlmHandle, *, dlm_source: str | None = None) -> dict[str, Any]:
39
-    """Build a sway.yaml-shaped dict from a :class:`DlmHandle`."""
40
-    base_spec = {"kind": "hf", "base": handle.base_model}
41
-    ft_spec = {
42
-        "kind": "hf",
43
-        "base": handle.base_model,
44
-        "adapter": str(handle.adapter_path) if handle.adapter_path else None,
45
-    }
46
-    spec: dict[str, Any] = {
47
-        "version": 1,
48
-        "models": {"base": base_spec, "ft": ft_spec},
49
-        "defaults": {"seed": 0, "differential": True},
50
-        "suite": _build_suite(handle.sections),
51
-    }
52
-    if dlm_source is not None:
53
-        spec["dlm_source"] = dlm_source
54
-    return spec
55
-
56
-
57
-def _build_suite(sections: tuple[Section, ...]) -> list[dict[str, Any]]:
58
-    """Assemble the full probe battery for the given sections.
59
-
60
-    The ordering matters: ``null_adapter`` first so every downstream
61
-    probe's z-score threshold has stats to consult.
62
-    """
63
-    instruction_probes: list[tuple[str, str]] = [
64
-        (p.prompt, p.gold) for s in sections if s.kind == "instruction" for p in s.probes
65
-    ]
66
-    prose_prompts: list[str] = []
67
-    for s in sections:
68
-        if s.kind == "prose" and s.content.strip():
69
-            # Use the section's leading sentence as a natural completion prompt.
70
-            first_sentence = s.content.split(".")[0].strip()
71
-            if first_sentence:
72
-                prose_prompts.append(first_sentence + ".")
73
-
74
-    kl_prompts = [q for q, _ in instruction_probes][:16] or prose_prompts[:16]
75
-    style_prompts = prose_prompts[:8] or [q for q, _ in instruction_probes][:8]
76
-
77
-    suite: list[dict[str, Any]] = []
78
-
79
-    # Baseline calibration — always first.
80
-    suite.append({"name": "null_baseline", "kind": "null_adapter", "runs": 3})
81
-
82
-    # Adherence.
83
-    if kl_prompts:
84
-        suite.append(
85
-            {
86
-                "name": "delta_kl_doc",
87
-                "kind": "delta_kl",
88
-                "prompts": kl_prompts,
89
-                "assert_mean_gte": 0.02,
90
-            }
91
-        )
92
-    if instruction_probes:
93
-        suite.append(
94
-            {
95
-                "name": "revert_check",
96
-                "kind": "adapter_revert",
97
-                "cases": [
98
-                    {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)}
99
-                    for q, a in instruction_probes[:8]
100
-                ],
101
-                "assert_revert_rate_lt": 0.3,
102
-            }
103
-        )
104
-    if kl_prompts:
105
-        suite.append(
106
-            {
107
-                "name": "prompt_collapse",
108
-                "kind": "prompt_collapse",
109
-                "prompts": kl_prompts[:4],
110
-                "context_lengths": [0, 256, 512, 1024],
111
-                "assert_half_life_tokens": 300,
112
-            }
113
-        )
114
-
115
-    # Attribution.
116
-    if len(sections) >= 2:
117
-        suite.append(
118
-            {
119
-                "name": "section_attribution",
120
-                "kind": "section_internalization",
121
-                "per_section_threshold": 0.05,
122
-            }
123
-        )
124
-    if instruction_probes:
125
-        suite.append(
126
-            {
127
-                "name": "paraphrase_invariance",
128
-                "kind": "paraphrase_invariance",
129
-                "cases": [
130
-                    {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)}
131
-                    for q, a in instruction_probes[:6]
132
-                ],
133
-            }
134
-        )
135
-    has_preferences = any(s.kind == "preference" and s.preferences for s in sections)
136
-    if has_preferences:
137
-        suite.append(
138
-            {
139
-                "name": "preference_flip",
140
-                "kind": "preference_flip",
141
-                "assert_flip_rate_gte": 0.7,
142
-            }
143
-        )
144
-
145
-    # Calibration.
146
-    if style_prompts:
147
-        suite.append(
148
-            {
149
-                "name": "style_shift",
150
-                "kind": "style_fingerprint",
151
-                "prompts": style_prompts,
152
-            }
153
-        )
154
-    suite.append({"name": "general_knowledge", "kind": "calibration_drift"})
155
-    if any(s.kind == "prose" for s in sections):
156
-        suite.append(
157
-            {
158
-                "name": "verbatim_leak",
159
-                "kind": "leakage",
160
-                "prefix_chars": 128,
161
-                "continuation_chars": 256,
162
-            }
163
-        )
164
-
165
-    # Signature ablation — goes last because it's the most expensive.
166
-    if kl_prompts:
167
-        suite.append(
168
-            {
169
-                "name": "adapter_ablation",
170
-                "kind": "adapter_ablation",
171
-                "prompts": kl_prompts[:6],
172
-                "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
173
-            }
174
-        )
175
-
176
-    return suite
177
-
178
-
179
-def _auto_paraphrases(prompt: str) -> list[str]:
180
-    """Small, deterministic paraphrase set used when authors don't supply one.
181
-
182
-    Purely heuristic — good enough to detect "did the model memorize the
183
-    exact wording". Real paraphrase generation lives behind the
184
-    ``semsim`` extra.
185
-    """
186
-    variants: list[str] = []
187
-    stripped = prompt.rstrip("?. ")
188
-    variants.append(f"Could you explain: {stripped}?")
189
-    variants.append(f"I'd like to know — {stripped}.")
190
-    variants.append(f"Please describe: {stripped}.")
191
-    return variants[:3]
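Tracing ``_auto_paraphrases`` by hand makes the heuristic concrete:

    >>> _auto_paraphrases("What does the adapter change?")
    ['Could you explain: What does the adapter change?',
     "I'd like to know — What does the adapter change.",
     'Please describe: What does the adapter change.']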
sway/src/dlm_sway/integrations/dlm/resolver.pydeleted
@@ -1,243 +0,0 @@
1
-"""Resolve a ``.dlm`` file to the artifacts sway needs.
2
-
3
-Imports ``dlm.*`` — requires the ``dlm-sway[dlm]`` extra. Everything
4
-outside this package is oblivious to dlm's internal shape; the bridge
5
-is the only place that knows, e.g., that a dlm section carries a
6
-``kind`` field named ``type`` or that adapters live at
7
-``adapter/versions/vNNNN/``.
8
-"""
9
-
10
-from __future__ import annotations
11
-
12
-import hashlib
13
-from dataclasses import dataclass
14
-from pathlib import Path
15
-
16
-from dlm_sway.core.errors import SwayError
17
-from dlm_sway.core.sections import (
18
-    Section,
19
-    SectionKind,
20
-    SectionPreference,
21
-    SectionProbe,
22
-)
23
-
24
-
25
-@dataclass(frozen=True, slots=True)
26
-class DlmHandle:
27
-    """Everything the sway bridge pulls out of a ``.dlm`` file.
28
-
29
-    Attributes
30
-    ----------
31
-    dlm_id:
32
-        Stable identifier from the frontmatter.
33
-    base_model:
34
-        Either a HF id (``qwen2.5-1.5b``) or an ``hf:org/name`` escape
35
-        hatch, taken verbatim from the frontmatter.
36
-    adapter_path:
37
-        Directory containing the current trained PEFT adapter (resolved
38
-        via dlm's own ``StorePath.for_dlm``). ``None`` if the document
39
-        hasn't been trained yet.
40
-    sections:
41
-        Typed sections ready for sway's probes.
42
-    doc_text:
43
-        Concatenated raw content of all sections. Used by probes that
44
-        need a whole-document stylistic reference (C1).
45
-    """
46
-
47
-    dlm_id: str
48
-    base_model: str
49
-    adapter_path: Path | None
50
-    sections: tuple[Section, ...]
51
-    doc_text: str
52
-
53
-
54
-def resolve_dlm(dlm_path: Path) -> DlmHandle:
55
-    """Parse ``dlm_path`` and return a :class:`DlmHandle`.
56
-
57
-    Raises :class:`~dlm_sway.core.errors.SwayError` with a clear message
58
-    when the file is malformed or when the resolved adapter path doesn't
59
-    exist on disk.
60
-    """
61
-    try:
62
-        from dlm.doc.parser import parse_file as dlm_parse_file
63
-    except ImportError as exc:
64
-        raise SwayError("dlm package not installed — run: pip install 'dlm-sway[dlm]'") from exc
65
-
66
-    parsed = dlm_parse_file(dlm_path)
67
-    fm = parsed.frontmatter
68
-    sections = tuple(_translate_section(s) for s in parsed.sections)
69
-    doc_text = "\n\n".join(s.content for s in sections)
70
-
71
-    adapter_path = _resolve_adapter_path(fm.dlm_id)
72
-    base_hf_id = _resolve_base_model_to_hf_id(fm.base_model)
73
-
74
-    return DlmHandle(
75
-        dlm_id=fm.dlm_id,
76
-        base_model=base_hf_id,
77
-        adapter_path=adapter_path,
78
-        sections=sections,
79
-        doc_text=doc_text,
80
-    )
81
-
82
-
83
-def _resolve_base_model_to_hf_id(base_model: str) -> str:
84
-    """Translate dlm's base-model *key* to a HuggingFace repo id.
85
-
86
-    dlm's frontmatter stores registry keys like ``smollm2-135m`` which
87
-    resolve to ``HuggingFaceTB/SmolLM2-135M-Instruct``. sway's backends
88
-    call ``AutoModelForCausalLM.from_pretrained`` directly and need the
89
-    HF id. The ``hf:org/name`` escape hatch passes through unchanged.
90
-    """
91
-    if base_model.startswith("hf:"):
92
-        return base_model[len("hf:") :]
93
-    try:
94
-        from dlm.base_models import resolve as resolve_base
95
-    except ImportError:
96
-        return base_model
97
-    try:
98
-        spec = resolve_base(base_model)
99
-    except Exception:  # noqa: BLE001 — unknown dlm errors
100
-        return base_model
101
-    hf_id = getattr(spec, "hf_id", None)
102
-    return str(hf_id) if hf_id else base_model
103
-
104
-
105
-def _resolve_adapter_path(dlm_id: str) -> Path | None:
106
-    """Locate the current adapter directory for ``dlm_id``.
107
-
108
-    Uses dlm's module-level ``for_dlm`` helper if available, else falls
109
-    back to the canonical ``~/.dlm/store/<dlm_id>/adapter/current.txt``
110
-    pointer. Returns ``None`` if no adapter has been trained yet.
111
-    """
112
-    # Primary path: use dlm's own store-path helpers.
113
-    try:
114
-        from dlm.store.paths import for_dlm as _for_dlm
115
-    except ImportError:
116
-        _for_dlm = None
117
-
118
-    if _for_dlm is not None:
119
-        try:
120
-            store = _for_dlm(dlm_id)
121
-        except Exception:  # noqa: BLE001 — unknown dlm exception shapes
122
-            store = None
123
-        if store is not None:
124
-            try:
125
-                resolved = store.resolve_current_adapter()
126
-            except (AttributeError, FileNotFoundError):
127
-                resolved = None
128
-            if resolved is not None and Path(resolved).exists():
129
-                return Path(resolved)
130
-
131
-    # Manual fallback. The ``current.txt`` pointer is relative to the
132
-    # **store root**, not to current.txt's parent dir — so go up one level.
133
-    import os
134
-
135
-    home = Path(os.environ.get("DLM_HOME", "~/.dlm")).expanduser()
136
-    store_root = home / "store" / dlm_id
137
-    current_file = store_root / "adapter" / "current.txt"
138
-    if current_file.exists():
139
-        pointer = current_file.read_text(encoding="utf-8").strip()
140
-        candidate = (store_root / pointer).resolve()
141
-        if candidate.exists():
142
-            return candidate
143
-    return None
144
-
145
-
146
-def _translate_section(dlm_section: object) -> Section:
147
-    """Adapt a ``dlm.doc.sections.Section`` to sway's section type.
148
-
149
-    dlm's Section dataclass uses the attribute name ``type`` (not
150
-    ``kind``) and stores instruction/preference content as raw markdown
151
-    — dlm ships dedicated parsers (``parse_instruction_body``,
152
-    ``parse_preference_body``) that we reuse here so any future dlm
153
-    syntax additions land in sway for free.
154
-    """
155
-    # dlm's current attribute is ``type``; older revisions used ``kind``.
156
-    kind_raw = getattr(dlm_section, "type", getattr(dlm_section, "kind", None))
157
-    kind = _normalize_kind(kind_raw)
158
-    content = str(getattr(dlm_section, "content", ""))
159
-    section_id = str(
160
-        getattr(dlm_section, "section_id", None)
161
-        or getattr(dlm_section, "id", None)
162
-        or _content_hash(content)
163
-    )
164
-    tag = getattr(dlm_section, "tag", None)
165
-
166
-    probes: tuple[SectionProbe, ...] = ()
167
-    preferences: tuple[SectionPreference, ...] = ()
168
-    if kind == "instruction":
169
-        probes = tuple(_parse_instruction(content, section_id=section_id))
170
-    elif kind == "preference":
171
-        preferences = tuple(_parse_preference(content, section_id=section_id))
172
-
173
-    return Section(
174
-        id=section_id,
175
-        kind=kind,
176
-        content=content,
177
-        probes=probes,
178
-        preferences=preferences,
179
-        tag=tag if isinstance(tag, str) else None,
180
-    )
181
-
182
-
183
-def _normalize_kind(raw: object) -> SectionKind:
184
-    """Map dlm's SectionType/str to sway's lowercase kind."""
185
-    if raw is None:
186
-        return "prose"
187
-    value = str(raw).lower()
188
-    # dlm uses uppercase StrEnum values like "PROSE"; normalize.
189
-    if value.endswith("prose") or "prose" in value:
190
-        return "prose"
191
-    if "instruction" in value:
192
-        return "instruction"
193
-    if "preference" in value:
194
-        return "preference"
195
-    return "prose"
196
-
197
-
198
-def _parse_instruction(content: str, *, section_id: str) -> list[SectionProbe]:
199
-    """Pull (Q, A) pairs out of a dlm INSTRUCTION section body.
200
-
201
-    Delegates to dlm's own ``parse_instruction_body`` so syntax additions
202
-    land in sway without code changes here. Falls back to an empty list
203
-    on parse errors — the probe will fail gracefully.
204
-    """
205
-    try:
206
-        from dlm.data.instruction_parser import parse_instruction_body
207
-    except ImportError:
208
-        return []
209
-    try:
210
-        pairs = parse_instruction_body(content, section_id=section_id)
211
-    except Exception:  # noqa: BLE001 — dlm raises InstructionParseError
212
-        return []
213
-    out: list[SectionProbe] = []
214
-    for p in pairs:
215
-        q = getattr(p, "question", getattr(p, "prompt", ""))
216
-        a = getattr(p, "answer", getattr(p, "gold", ""))
217
-        if q and a:
218
-            out.append(SectionProbe(prompt=str(q), gold=str(a)))
219
-    return out
220
-
221
-
222
-def _parse_preference(content: str, *, section_id: str) -> list[SectionPreference]:
223
-    """Pull (prompt, chosen, rejected) triples out of a PREFERENCE body."""
224
-    try:
225
-        from dlm.data.preference_parser import parse_preference_body
226
-    except ImportError:
227
-        return []
228
-    try:
229
-        triples = parse_preference_body(content, section_id=section_id)
230
-    except Exception:  # noqa: BLE001 — dlm raises PreferenceParseError
231
-        return []
232
-    out: list[SectionPreference] = []
233
-    for t in triples:
234
-        p = str(getattr(t, "prompt", ""))
235
-        c = str(getattr(t, "chosen", ""))
236
-        rej = str(getattr(t, "rejected", ""))
237
-        if p and c and rej:
238
-            out.append(SectionPreference(prompt=p, chosen=c, rejected=rej))
239
-    return out
240
-
241
-
242
-def _content_hash(content: str) -> str:
243
-    return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
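
The manual fallback above encodes a store-layout contract that is easy to get wrong. A standalone sketch of the resolution, assuming the canonical ``adapter/versions/vNNNN/`` layout from the module docstring (all paths here are synthetic):

import tempfile
from pathlib import Path

store_root = Path(tempfile.mkdtemp()) / "store" / "my-doc"
adapter_dir = store_root / "adapter" / "versions" / "v0003"
adapter_dir.mkdir(parents=True)
# The current.txt pointer is relative to the store root, not to adapter/.
current_file = store_root / "adapter" / "current.txt"
current_file.write_text("adapter/versions/v0003\n", encoding="utf-8")

pointer = current_file.read_text(encoding="utf-8").strip()
candidate = (store_root / pointer).resolve()
assert candidate == adapter_dir.resolve()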
sway/src/dlm_sway/probes/__init__.py deleted
@@ -1,27 +0,0 @@
1
-"""Probe primitives. Each module in this package implements one primitive.
2
-
3
-Importing this package eagerly imports every probe module so their
4
-``__init_subclass__`` hooks populate the registry. If you're hitting
5
-"unknown probe kind" from :func:`dlm_sway.probes.base.build_probe`, the
6
-fix is to ``import dlm_sway.probes`` before building the probe — which
7
-this ``__init__`` does for you.
8
-"""
9
-
10
-from __future__ import annotations
11
-
12
-# Register every shipped probe with the central registry by importing
13
-# its module. Order is not load-bearing for registration but matches the
14
-# categorical grouping in :mod:`dlm_sway.core.result`.
15
-from dlm_sway.probes import (  # noqa: F401 — imports register the probes
16
-    adapter_ablation,
17
-    adapter_revert,
18
-    calibration_drift,
19
-    delta_kl,
20
-    leakage,
21
-    null_adapter,
22
-    paraphrase_invariance,
23
-    preference_flip,
24
-    prompt_collapse,
25
-    section_internalization,
26
-    style_fingerprint,
27
-)
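
A minimal sketch of the registration contract this import block enforces (assumes the dlm-sway package, now vendored via the submodule, is importable):

import dlm_sway.probes  # noqa: F401 — side effect: populates the registry

from dlm_sway.probes.base import build_probe, registry

print(sorted(registry()))  # ['adapter_ablation', 'adapter_revert', ...]
probe, spec = build_probe(
    {"name": "kl", "kind": "delta_kl", "prompts": ["The capital of France is"]}
)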
sway/src/dlm_sway/probes/_calibration_pack.py deleted
@@ -1,63 +0,0 @@
1
-"""A small, built-in general-knowledge probe pack for C2.
2
-
3
-Each item is a ``(prompt, gold)`` pair where ``gold`` is the next few
4
-tokens a competent base model should assign high probability to. The
5
-items are deliberately *factually trivial* — the point isn't "does the
6
-model know this?" but "did the fine-tune forget this?" — so the pack
7
-skews toward grade-school geography, chemistry, arithmetic, and
8
-high-frequency idiom.
9
-
10
-A real v1.0 will ship a 200-item pack sliced from TriviaQA + SQuAD +
11
-OpenBookQA. This 30-item seed lets the probe ship today and catches the
12
-most egregious over-fit cases.
13
-"""
14
-
15
-from __future__ import annotations
16
-
17
-from typing import Final
18
-
19
-CalibrationItem = tuple[str, str]
20
-
21
-BUILT_IN_PACK: Final[tuple[CalibrationItem, ...]] = (
22
-    # Geography
23
-    ("The capital of France is", " Paris"),
24
-    ("The capital of Japan is", " Tokyo"),
25
-    ("The largest ocean on Earth is the", " Pacific"),
26
-    ("Mount Everest is located on the border of Nepal and", " China"),
27
-    ("The longest river in South America is the", " Amazon"),
28
-    # Natural sciences
29
-    ("Water freezes at zero degrees", " Celsius"),
30
-    ("The chemical symbol for gold is", " Au"),
31
-    ("Light travels faster than", " sound"),
32
-    ("Plants convert sunlight into energy through", " photosynthesis"),
33
-    ("The Earth orbits around the", " Sun"),
34
-    # Arithmetic
35
-    ("Two plus two equals", " four"),
36
-    ("Ten times ten equals", " one hundred"),
37
-    ("Half of one hundred is", " fifty"),
38
-    ("A dozen means", " twelve"),
39
-    # Language and idiom
40
-    ("A rose by any other name would smell as", " sweet"),
41
-    ("To be or not to be, that is the", " question"),
42
-    ("The early bird catches the", " worm"),
43
-    ("Actions speak louder than", " words"),
44
-    ("A picture is worth a thousand", " words"),
45
-    # History
46
-    ("World War II ended in the year", " 1945"),
47
-    ("The first president of the United States was", " George Washington"),
48
-    ("The Berlin Wall fell in", " 1989"),
49
-    # Biology
50
-    ("Humans have twenty", " fingers and toes"),
51
-    ("The human body has two", " lungs"),
52
-    ("Blood is pumped through the body by the", " heart"),
53
-    # Technology
54
-    ("HTML stands for HyperText", " Markup Language"),
55
-    ("The World Wide Web was invented by Tim", " Berners-Lee"),
56
-    # Miscellaneous
57
-    ("One year has", " 365 days"),
58
-    ("A week has seven", " days"),
59
-    ("There are seven colors in a", " rainbow"),
60
-)
61
-"""30 items covering geography, science, arithmetic, language, history,
62
-biology, and technology. Pulled from public-domain grade-school facts so
63
-there's no licensing concern about shipping with the wheel."""
sway/src/dlm_sway/probes/_divergence.py deleted
@@ -1,102 +0,0 @@
1
-"""Shared math for divergence-based probes.
2
-
3
-Extracted so :mod:`delta_kl`, :mod:`adapter_ablation`, and any future
4
-probe operating on next-token distributions reuse the same aligned-
5
-top-k KL / JS computation. Having one implementation keeps the numerical
6
-treatment consistent across the report.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-import math
12
-from typing import Literal
13
-
14
-import numpy as np
15
-from numpy.typing import NDArray
16
-
17
-from dlm_sway.core.scoring import TokenDist
18
-
19
-Divergence = Literal["kl", "js"]
20
-
21
-
22
-def aligned_probs(
23
-    base: TokenDist, ft: TokenDist
24
-) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
25
-    """Return aligned probability vectors over the union of top-k tokens.
26
-
27
-    Two ``TokenDist`` objects may surface different top-k indices if
28
-    the two models disagree about the hot tokens. We build a shared
29
-    support — ``union(base.token_ids, ft.token_ids)`` — and slot the
30
-    known probabilities in. Unknown entries fall back to the
31
-    per-distribution tail mass divided across the missing tokens,
32
-    which is the maximum-entropy completion under the truncation.
33
-    """
34
-    union_ids = np.union1d(base.token_ids, ft.token_ids)
35
-    k = int(union_ids.size)
36
-
37
-    base_probs = _to_support(base, union_ids, k)
38
-    ft_probs = _to_support(ft, union_ids, k)
39
-
40
-    # Normalize in case of floating noise from the fill-in.
41
-    base_probs /= base_probs.sum()
42
-    ft_probs /= ft_probs.sum()
43
-    return base_probs, ft_probs
44
-
45
-
46
-def _to_support(dist: TokenDist, support: NDArray[np.int64], k: int) -> NDArray[np.float64]:
47
-    probs = np.exp(dist.logprobs.astype(np.float64))
48
-    out = np.zeros(k, dtype=np.float64)
49
-    known_mass = float(probs.sum())
50
-    tail_mass = max(0.0, 1.0 - known_mass)
51
-
52
-    id_to_idx = {int(tok): idx for idx, tok in enumerate(support.tolist())}
53
-    missing = 0
54
-    for tok, p in zip(dist.token_ids.tolist(), probs.tolist(), strict=True):
55
-        i = id_to_idx.get(int(tok))
56
-        if i is None:
57
-            # Shouldn't happen given union construction.
58
-            missing += 1
59
-            continue
60
-        out[i] = float(p)
61
-
62
-    # Spread the tail mass over the support entries that this dist
63
-    # doesn't explicitly provide. Size of that set:
64
-    n_unknown = int((out == 0.0).sum()) - missing
65
-    if n_unknown > 0 and tail_mass > 0.0:
66
-        per = tail_mass / n_unknown
67
-        out[out == 0.0] = per
68
-
69
-    return out
70
-
71
-
72
-def kl(p: NDArray[np.float64], q: NDArray[np.float64]) -> float:
73
-    """KL(p || q) in nats. Robust to zeros in p (treated as 0·log0 = 0)."""
74
-    mask = p > 0.0
75
-    safe_q = np.where(q > 0.0, q, 1e-12)
76
-    return float(np.sum(p[mask] * (np.log(p[mask]) - np.log(safe_q[mask]))))
77
-
78
-
79
-def js(p: NDArray[np.float64], q: NDArray[np.float64]) -> float:
80
-    """Jensen-Shannon divergence. Symmetric, bounded in [0, ln 2] (nats).
81
-
82
-    The upper bound makes JS a nicer default for thresholding than raw
83
-    KL — a user doesn't need to know their specific model's KL scale to
84
-    pick a threshold.
85
-    """
86
-    m = 0.5 * (p + q)
87
-    return 0.5 * kl(p, m) + 0.5 * kl(q, m)
88
-
89
-
90
-def divergence(base: TokenDist, ft: TokenDist, kind: Divergence = "js") -> float:
91
-    """Compute KL or JS between two ``TokenDist`` on a shared support."""
92
-    p, q = aligned_probs(base, ft)
93
-    if kind == "js":
94
-        return js(p, q)
95
-    if kind == "kl":
96
-        return kl(q, p)  # KL(ft || base) — "how much does ft diverge from base"
97
-    raise ValueError(f"unknown divergence kind: {kind!r}")
98
-
99
-
100
-def js_ln2() -> float:
101
-    """Upper bound on JS in nats. Useful for normalization."""
102
-    return math.log(2.0)
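
A self-contained numeric check of the divergence math above — the ``p``/``q`` vectors stand in for already-aligned distributions, so no dlm_sway imports are needed:

import math
import numpy as np

def kl(p, q):
    # KL(p || q) in nats; zeros in p contribute 0·log 0 = 0.
    mask = p > 0.0
    safe_q = np.where(q > 0.0, q, 1e-12)
    return float(np.sum(p[mask] * (np.log(p[mask]) - np.log(safe_q[mask]))))

def js(p, q):
    m = 0.5 * (p + q)
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.2, 0.7])
assert 0.0 <= js(p, q) <= math.log(2.0)   # JS is bounded by ln 2
assert abs(js(p, q) - js(q, p)) < 1e-12   # and symmetric
print(f"JS = {js(p, q):.4f} nats (bound = {math.log(2.0):.4f})")  # ≈ 0.2531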
sway/src/dlm_sway/probes/adapter_ablation.py deleted
@@ -1,193 +0,0 @@
1
-"""N2 AdapterAblation — the sway signature primitive.
2
-
3
-Scales the LoRA additive term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25}
4
-and measures the mean divergence from the base distribution at each
5
-step. Characterizes the resulting response curve with three shape metrics:
6
-
7
-- **linearity**: R² of a linear fit on ``(λ, mean_div)``. High means
8
-  the adapter's effect scales predictably; low means it's "all or
9
-  nothing" (degenerate).
10
-- **saturation_lambda**: the smallest λ at which divergence reaches
11
-  90% of the λ=1 value. Too low (<0.3) means the adapter fires at
12
-  partial strength — fragile. Too high (>1.0) means the adapter is
13
-  under-trained.
14
-- **overshoot**: divergence at λ=1.25 divided by λ=1.0. >1.05 is the
15
-  healthy "pushing past 1 still moves the model" signal. An overshoot
16
-  below 1.0 suggests collapse.
17
-
18
-This is the single novel primitive that no generic eval harness
19
-provides — sway's position next to the adapter math makes it possible.
20
-
21
-Requires the backend to implement
22
-:class:`~dlm_sway.core.scoring.ScalableDifferentialBackend`. Probes
23
-SKIP gracefully on backends that don't.
24
-"""
25
-
26
-from __future__ import annotations
27
-
28
-from typing import Literal
29
-
30
-import numpy as np
31
-from pydantic import Field
32
-
33
-from dlm_sway.core.result import ProbeResult, Verdict
34
-from dlm_sway.core.scoring import ScalableDifferentialBackend
35
-from dlm_sway.probes._divergence import Divergence, divergence
36
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
37
-
38
-
39
-class AdapterAblationSpec(ProbeSpec):
40
-    kind: Literal["adapter_ablation"] = "adapter_ablation"
41
-    prompts: list[str] = Field(default_factory=list)
42
-    lambdas: list[float] = Field(
43
-        default_factory=lambda: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
44
-        min_length=3,
45
-    )
46
-    divergence: Divergence = "js"
47
-    top_k: int | None = None
48
-    assert_linearity_gte: float = 0.85
49
-    assert_saturation_between: tuple[float, float] = (0.3, 1.05)
50
-    assert_overshoot_gte: float = 1.02
51
-
52
-
53
-class AdapterAblationProbe(Probe):
54
-    kind = "adapter_ablation"
55
-    spec_cls = AdapterAblationSpec
56
-    category = "ablation"
57
-
58
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
59
-        assert isinstance(spec, AdapterAblationSpec)
60
-        if not spec.prompts:
61
-            return ProbeResult(
62
-                name=spec.name,
63
-                kind=spec.kind,
64
-                verdict=Verdict.ERROR,
65
-                score=None,
66
-                message="no prompts provided",
67
-            )
68
-        if not isinstance(ctx.backend, ScalableDifferentialBackend):
69
-            return ProbeResult(
70
-                name=spec.name,
71
-                kind=spec.kind,
72
-                verdict=Verdict.SKIP,
73
-                score=None,
74
-                message=(
75
-                    "backend does not implement ScalableDifferentialBackend — "
76
-                    "adapter ablation requires LoRA-scale access"
77
-                ),
78
-            )
79
-
80
-        top_k = spec.top_k if spec.top_k is not None else ctx.top_k
81
-
82
-        # Reference distribution at the smallest λ (0.0 by default: adapter scaled to zero → base).
83
-        lam_zero = min(spec.lambdas)
84
-        per_lambda: list[float] = []
85
-        for lam in spec.lambdas:
86
-            divs_for_lam: list[float] = []
87
-            for prompt in spec.prompts:
88
-                with ctx.backend.as_scaled_adapter(lam_zero) as ref:
89
-                    ref_dist = ref.next_token_dist(prompt, top_k=top_k)
90
-                with ctx.backend.as_scaled_adapter(lam) as scaled:
91
-                    scaled_dist = scaled.next_token_dist(prompt, top_k=top_k)
92
-                divs_for_lam.append(divergence(ref_dist, scaled_dist, kind=spec.divergence))
93
-            per_lambda.append(float(np.mean(divs_for_lam)))
94
-
95
-        lambdas_arr = np.asarray(spec.lambdas, dtype=np.float64)
96
-        divs_arr = np.asarray(per_lambda, dtype=np.float64)
97
-
98
-        linearity = _r_squared(lambdas_arr, divs_arr)
99
-        saturation_lambda = _saturation_lambda(lambdas_arr, divs_arr)
100
-        overshoot = _overshoot(lambdas_arr, divs_arr)
101
-
102
-        # Pass when all three shape metrics land in their healthy bands.
103
-        sat_lo, sat_hi = spec.assert_saturation_between
104
-        ok_lin = linearity >= spec.assert_linearity_gte
105
-        ok_sat = saturation_lambda is not None and sat_lo <= saturation_lambda <= sat_hi
106
-        ok_over = overshoot >= spec.assert_overshoot_gte
107
-        verdict = Verdict.PASS if (ok_lin and ok_sat and ok_over) else Verdict.FAIL
108
-
109
-        lin_score = max(0.0, min(1.0, linearity / max(spec.assert_linearity_gte, 1e-6)))
110
-        over_score = max(0.0, min(1.0, (overshoot - 1.0) / 0.2))
111
-        sat_score = 1.0 if ok_sat else 0.3
112
-        score = 0.4 * lin_score + 0.3 * sat_score + 0.3 * over_score
113
-
114
-        return ProbeResult(
115
-            name=spec.name,
116
-            kind=spec.kind,
117
-            verdict=verdict,
118
-            score=score,
119
-            raw=linearity,
120
-            evidence={
121
-                "lambdas": spec.lambdas,
122
-                "mean_divergence_per_lambda": per_lambda,
123
-                "linearity": linearity,
124
-                "saturation_lambda": saturation_lambda,
125
-                "overshoot": overshoot,
126
-                "passed_linearity": ok_lin,
127
-                "passed_saturation": ok_sat,
128
-                "passed_overshoot": ok_over,
129
-                "weight": spec.weight,
130
-            },
131
-            message=(
132
-                f"R²={linearity:.2f}, sat_λ={saturation_lambda:.2f} "
133
-                f"({'in' if ok_sat else 'out of'} band), overshoot={overshoot:.2f}"
134
-                if saturation_lambda is not None
135
-                else f"R²={linearity:.2f}, saturation undetected, overshoot={overshoot:.2f}"
136
-            ),
137
-        )
138
-
139
-
140
-def _r_squared(x: np.ndarray, y: np.ndarray) -> float:
141
-    """Coefficient of determination for a linear fit of ``y`` on ``x``."""
142
-    if x.size < 2:
143
-        return 0.0
144
-    xm = float(x.mean())
145
-    ym = float(y.mean())
146
-    denom = float(((x - xm) ** 2).sum())
147
-    if denom == 0.0:
148
-        return 0.0
149
-    slope = float(((x - xm) * (y - ym)).sum()) / denom
150
-    intercept = ym - slope * xm
151
-    y_pred = slope * x + intercept
152
-    ss_res = float(((y - y_pred) ** 2).sum())
153
-    ss_tot = float(((y - ym) ** 2).sum())
154
-    if ss_tot == 0.0:
155
-        return 1.0
156
-    return max(0.0, 1.0 - ss_res / ss_tot)
157
-
158
-
159
-def _saturation_lambda(lambdas: np.ndarray, divs: np.ndarray) -> float | None:
160
-    """Smallest λ ≤ 1.0 at which divergence reaches 90% of div(λ=1)."""
161
-    # Locate the index of λ=1.0 (or the closest entry ≤ 1.0).
162
-    candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0]
163
-    if candidates.size == 0:
164
-        # Fall back to the largest λ ≤ 1.0.
165
-        mask = lambdas <= 1.0
166
-        if not mask.any():
167
-            return None
168
-        idx1 = int(np.argmax(lambdas * mask))
169
-    else:
170
-        idx1 = int(candidates[0])
171
-    target = 0.9 * float(divs[idx1])
172
-    if target <= 0:
173
-        return None
174
-    for lam, d in zip(lambdas[: idx1 + 1], divs[: idx1 + 1], strict=False):
175
-        if d >= target:
176
-            return float(lam)
177
-    return None
178
-
179
-
180
-def _overshoot(lambdas: np.ndarray, divs: np.ndarray) -> float:
181
-    """``div(λ_max) / div(λ=1)``. Returns 1.0 if λ_max ≤ 1.0."""
182
-    idx_max = int(np.argmax(lambdas))
183
-    candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0]
184
-    if candidates.size == 0:
185
-        return 1.0
186
-    idx1 = int(candidates[0])
187
-    if idx_max == idx1:
188
-        return 1.0
189
-    d1 = float(divs[idx1])
190
-    dmax = float(divs[idx_max])
191
-    if d1 <= 0:
192
-        return 1.0
193
-    return dmax / d1
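
A worked example of the three shape metrics on a toy, healthy-looking response curve, recomputed with the same formulas the helpers above use (the divergence values are illustrative):

import numpy as np

lambdas = np.array([0.0, 0.25, 0.5, 0.75, 1.0, 1.25])
divs = np.array([0.0, 0.05, 0.11, 0.16, 0.21, 0.24])  # toy mean JS per λ

# linearity: R² of a linear fit of divs on lambdas
slope, intercept = np.polyfit(lambdas, divs, 1)
resid = divs - (slope * lambdas + intercept)
r2 = 1.0 - float((resid ** 2).sum()) / float(((divs - divs.mean()) ** 2).sum())

# saturation λ: smallest λ ≤ 1.0 where div ≥ 90% of div(λ=1)
target = 0.9 * divs[4]
sat = float(lambdas[:5][divs[:5] >= target][0])

# overshoot: div(λ=1.25) / div(λ=1.0)
overshoot = float(divs[5] / divs[4])

print(f"R²={r2:.3f}, sat_λ={sat}, overshoot={overshoot:.2f}")
# ≈ R²=0.992, sat_λ=1.0, overshoot=1.14 — linear, saturates late, pushes past 1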
sway/src/dlm_sway/probes/adapter_revert.py deleted
@@ -1,178 +0,0 @@
1
-"""A2 AdapterRevert — does the fine-tuned model drift back to base under pressure?
2
-
3
-For each test case the user provides a prompt, a "gold" answer (the
4
-adapter's intended response), and one or more adversarial paraphrases of
5
-the prompt. We generate base-model and ft-model completions on every
6
-paraphrase and ask: does the ft output cluster semantically with the
7
-base's output (revert) or with the gold (adhere)?
8
-
9
-Signal: ``revert_rate`` = fraction of (case, paraphrase) pairs where
10
-``cos(ft, base) > cos(ft, gold)``. A healthy fine-tune holds below 25%.
11
-
12
-Needs sentence embeddings. Without the ``semsim`` extra installed the
13
-probe returns :attr:`Verdict.SKIP` with a pip hint — deterministic
14
-n-gram fallbacks don't carry semantic equivalence reliably enough to
15
-drive a revert decision, and we'd rather be honest than lossy.
16
-"""
17
-
18
-from __future__ import annotations
19
-
20
-from typing import Any, Literal
21
-
22
-from pydantic import BaseModel, ConfigDict, Field
23
-
24
-from dlm_sway.core.errors import BackendNotAvailableError
25
-from dlm_sway.core.result import ProbeResult, Verdict
26
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
27
-
28
-
29
-class AdapterRevertCase(BaseModel):
30
-    """One revert test case."""
31
-
32
-    model_config = ConfigDict(extra="forbid", frozen=True)
33
-
34
-    prompt: str
35
-    gold: str
36
-    """What the adapter is supposed to produce."""
37
-    paraphrases: list[str] = Field(default_factory=list, min_length=1)
38
-    """At least one paraphrase is required — revert is observed under
39
-    reframing, not on the original prompt."""
40
-
41
-
42
-class AdapterRevertSpec(ProbeSpec):
43
-    kind: Literal["adapter_revert"] = "adapter_revert"
44
-    cases: list[AdapterRevertCase] = Field(default_factory=list)
45
-    max_new_tokens: int = 64
46
-    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
47
-    """HF id of the embedder. Default is ~80 MB, CPU-friendly."""
48
-    base_gold_similarity_cap: float = 0.75
49
-    """Skip pairs where base and gold are trivially similar — those
50
-    can't distinguish revert from adherence, and including them would
51
-    inflate the revert rate with noise."""
52
-    assert_revert_rate_lt: float = 0.25
53
-
54
-
55
-class AdapterRevertProbe(Probe):
56
-    kind = "adapter_revert"
57
-    spec_cls = AdapterRevertSpec
58
-    category = "adherence"
59
-
60
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
61
-        assert isinstance(spec, AdapterRevertSpec)
62
-        if not spec.cases:
63
-            return ProbeResult(
64
-                name=spec.name,
65
-                kind=spec.kind,
66
-                verdict=Verdict.ERROR,
67
-                score=None,
68
-                message="no cases provided",
69
-            )
70
-
71
-        try:
72
-            embed = _load_embedder(spec.embedding_model)
73
-        except BackendNotAvailableError as exc:
74
-            return ProbeResult(
75
-                name=spec.name,
76
-                kind=spec.kind,
77
-                verdict=Verdict.SKIP,
78
-                score=None,
79
-                message=str(exc),
80
-            )
81
-
82
-        import numpy as np
83
-
84
-        total = 0
85
-        reverts = 0
86
-        dropped_trivial = 0
87
-        per_case: list[dict[str, Any]] = []
88
-        for case in spec.cases:
89
-            gold_vec = embed([case.gold])[0]
90
-            for pp in case.paraphrases:
91
-                with ctx.backend.as_base() as bv:
92
-                    base_gen = bv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
93
-                with ctx.backend.as_finetuned() as fv:
94
-                    ft_gen = fv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
95
-                vecs = embed([base_gen, ft_gen])
96
-                base_vec, ft_vec = vecs[0], vecs[1]
97
-                base_gold = _cosine(base_vec, gold_vec)
98
-                if base_gold > spec.base_gold_similarity_cap:
99
-                    dropped_trivial += 1
100
-                    continue
101
-                cos_ft_base = _cosine(ft_vec, base_vec)
102
-                cos_ft_gold = _cosine(ft_vec, gold_vec)
103
-                total += 1
104
-                if cos_ft_base > cos_ft_gold:
105
-                    reverts += 1
106
-                per_case.append(
107
-                    {
108
-                        "prompt": pp[:80],
109
-                        "cos_ft_base": cos_ft_base,
110
-                        "cos_ft_gold": cos_ft_gold,
111
-                        "reverted": cos_ft_base > cos_ft_gold,
112
-                    }
113
-                )
114
-
115
-        if total == 0:
116
-            return ProbeResult(
117
-                name=spec.name,
118
-                kind=spec.kind,
119
-                verdict=Verdict.WARN,
120
-                score=0.5,
121
-                message=(
122
-                    f"all {dropped_trivial} cases had base≈gold (> "
123
-                    f"{spec.base_gold_similarity_cap}) — no separable signal"
124
-                ),
125
-                evidence={"dropped_trivial": dropped_trivial, "weight": spec.weight},
126
-            )
127
-
128
-        rate = reverts / total
129
-        verdict = Verdict.PASS if rate < spec.assert_revert_rate_lt else Verdict.FAIL
130
-        score = max(0.0, 1.0 - rate / max(spec.assert_revert_rate_lt, 1e-6))
131
-        score = float(np.clip(score, 0.0, 1.0))
132
-
133
-        return ProbeResult(
134
-            name=spec.name,
135
-            kind=spec.kind,
136
-            verdict=verdict,
137
-            score=score,
138
-            raw=rate,
139
-            evidence={
140
-                "revert_rate": rate,
141
-                "reverts": reverts,
142
-                "total": total,
143
-                "dropped_trivial": dropped_trivial,
144
-                "per_case": per_case[:8],  # cap to keep JSON bounded
145
-                "weight": spec.weight,
146
-            },
147
-            message=f"revert_rate={rate:.2%} (reverts={reverts}/{total}, dropped_trivial={dropped_trivial})",
148
-        )
149
-
150
-
151
-def _load_embedder(model_id: str):  # type: ignore[no-untyped-def]
152
-    """Return a callable ``list[str] -> np.ndarray`` over encoded vectors."""
153
-    try:
154
-        from sentence_transformers import SentenceTransformer
155
-    except ImportError as exc:
156
-        raise BackendNotAvailableError(
157
-            "adapter_revert",
158
-            extra="semsim",
159
-            hint="adapter_revert relies on sentence embeddings.",
160
-        ) from exc
161
-    st = SentenceTransformer(model_id)
162
-
163
-    def _embed(texts: list[str]):  # type: ignore[no-untyped-def]
164
-        return st.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
165
-
166
-    return _embed
167
-
168
-
169
-def _cosine(a: Any, b: Any) -> float:
170
-    import numpy as np
171
-
172
-    av = np.asarray(a, dtype=np.float64)
173
-    bv = np.asarray(b, dtype=np.float64)
174
-    na = float(np.linalg.norm(av))
175
-    nb = float(np.linalg.norm(bv))
176
-    if na == 0.0 or nb == 0.0:
177
-        return 0.0
178
-    return float(np.dot(av, bv) / (na * nb))
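
The revert decision in miniature, on toy 2-D "embeddings" — the real probe uses sentence-transformer vectors, but the rule is exactly this comparison:

import numpy as np

def cosine(a, b):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(a @ b) / denom if denom else 0.0

gold = [1.0, 0.0]   # what the adapter should say
base = [0.0, 1.0]   # what the base model says
ft   = [0.2, 0.9]   # ft output hugging the base direction

reverted = cosine(ft, base) > cosine(ft, gold)
print(reverted)  # True — this (case, paraphrase) pair counts as a revert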
sway/src/dlm_sway/probes/base.py deleted
@@ -1,131 +0,0 @@
1
-"""Probe abstract base + per-kind registry.
2
-
3
-The registry is the extension point. Adding a new probe means:
4
-
5
-1. Subclass :class:`ProbeSpec` with a unique ``kind`` field (Literal).
6
-2. Subclass :class:`Probe` setting ``kind`` and ``spec_cls``.
7
-3. Importing the probe module at least once (its subclass hook registers
8
-   itself).
9
-
10
-The runner uses :func:`build_probe` to map each raw spec dict to a
11
-``(Probe, ProbeSpec)`` pair. Validation errors are turned into
12
-:class:`~dlm_sway.core.errors.SpecValidationError` with the probe name
13
-as the source so error messages localize to the offending entry.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-from abc import ABC, abstractmethod
19
-from dataclasses import dataclass, field
20
-from typing import Any, ClassVar
21
-
22
-from pydantic import BaseModel, ConfigDict, ValidationError
23
-
24
-from dlm_sway.core.errors import SpecValidationError
25
-from dlm_sway.core.result import ProbeResult
26
-from dlm_sway.core.scoring import DifferentialBackend
27
-from dlm_sway.core.sections import Section
28
-
29
-
30
-class ProbeSpec(BaseModel):
31
-    """Common fields for every probe's spec entry in ``sway.yaml``."""
32
-
33
-    model_config = ConfigDict(extra="forbid", frozen=True)
34
-
35
-    name: str
36
-    """Unique within a suite; surfaces in the report."""
37
-    kind: str
38
-    """Discriminator — must match a registered :class:`Probe` subclass."""
39
-    enabled: bool = True
40
-    """If ``False`` the runner records a :class:`~dlm_sway.core.result.Verdict.SKIP`."""
41
-    weight: float = 1.0
42
-    """Weight inside the probe's component (adherence / attribution / …)."""
43
-
44
-
45
-@dataclass(frozen=True, slots=True)
46
-class RunContext:
47
-    """What a probe can read beyond its own spec.
48
-
49
-    Probes should receive exactly what they need and nothing more; fat
50
-    contexts encourage coupling between unrelated probes.
51
-
52
-    Attributes
53
-    ----------
54
-    backend:
55
-        The differential backend holding base + fine-tuned views.
56
-    seed:
57
-        Seed for deterministic probe RNGs (paraphrase sampling, etc).
58
-    top_k:
59
-        Default truncation for next-token distributions.
60
-    sections:
61
-        Optional list of typed sections (populated by the .dlm bridge;
62
-        ``None`` when sway is invoked against bare HF+PEFT).
63
-    doc_text:
64
-        Raw document text, if available.
65
-    null_stats:
66
-        Null-adapter baseline stats for z-score calibration, keyed by
67
-        probe *kind*. Populated by the runner after it's executed the
68
-        ``null_adapter`` probe (if configured).
69
-    """
70
-
71
-    backend: DifferentialBackend
72
-    seed: int = 0
73
-    top_k: int = 256
74
-    sections: tuple[Section, ...] | None = None
75
-    doc_text: str | None = None
76
-    null_stats: dict[str, dict[str, float]] = field(default_factory=dict)
77
-
78
-
79
-_REGISTRY: dict[str, type[Probe]] = {}
80
-
81
-
82
-class Probe(ABC):
83
-    """Concrete probe. One instance per probe spec in the suite."""
84
-
85
-    kind: ClassVar[str]
86
-    """The string used in ``sway.yaml``'s ``kind`` field."""
87
-    spec_cls: ClassVar[type[ProbeSpec]]
88
-    """The pydantic model class that validates this probe's spec."""
89
-    category: ClassVar[str] = "adherence"
90
-    """One of: ``adherence``, ``attribution``, ``calibration``,
91
-    ``ablation``, ``baseline``. Drives composite scoring."""
92
-
93
-    def __init_subclass__(cls, **kwargs: Any) -> None:
94
-        super().__init_subclass__(**kwargs)
95
-        # The abstract class itself has no `kind`; skip registration.
96
-        if "kind" not in cls.__dict__:
97
-            return
98
-        kind = cls.kind
99
-        if kind in _REGISTRY:
100
-            raise ValueError(f"duplicate probe kind {kind!r}: {_REGISTRY[kind]!r} vs {cls!r}")
101
-        _REGISTRY[kind] = cls
102
-
103
-    @abstractmethod
104
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: ...
105
-
106
-
107
-def registry() -> dict[str, type[Probe]]:
108
-    """Read-only view of registered probes."""
109
-    return dict(_REGISTRY)
110
-
111
-
112
-def build_probe(raw: dict[str, Any]) -> tuple[Probe, ProbeSpec]:
113
-    """Validate a raw YAML probe entry and return (Probe instance, spec)."""
114
-    kind = raw.get("kind")
115
-    if not isinstance(kind, str):
116
-        raise SpecValidationError(
117
-            "probe entry missing string 'kind' field",
118
-            source=str(raw.get("name", "<unknown>")),
119
-        )
120
-    if kind not in _REGISTRY:
121
-        known = ", ".join(sorted(_REGISTRY))
122
-        raise SpecValidationError(
123
-            f"unknown probe kind {kind!r} (registered: {known})",
124
-            source=str(raw.get("name", "<unknown>")),
125
-        )
126
-    probe_cls = _REGISTRY[kind]
127
-    try:
128
-        spec = probe_cls.spec_cls.model_validate(raw)
129
-    except ValidationError as exc:
130
-        raise SpecValidationError(str(exc), source=str(raw.get("name", "<unknown>"))) from exc
131
-    return probe_cls(), spec
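
The three-step extension recipe from the module docstring, sketched end to end (assumes the dlm-sway package is importable; ``noop`` is a made-up probe kind for illustration):

from typing import Literal

from dlm_sway.core.result import ProbeResult, Verdict
from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, build_probe

class NoopSpec(ProbeSpec):                  # step 1: spec with a unique kind
    kind: Literal["noop"] = "noop"

class NoopProbe(Probe):                     # step 2: set kind + spec_cls
    kind = "noop"
    spec_cls = NoopSpec

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        return ProbeResult(name=spec.name, kind=spec.kind,
                           verdict=Verdict.PASS, score=1.0, message="ok")

# step 3: defining the class ran __init_subclass__, so it is registered:
probe, spec = build_probe({"name": "demo", "kind": "noop"})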
sway/src/dlm_sway/probes/calibration_drift.py deleted
@@ -1,135 +0,0 @@
1
-"""C2 CalibrationDrift — did we break general knowledge while fitting the doc?
2
-
3
-The classic small-doc fine-tune failure mode: the adapter learned the
4
-document so well that it forgot the world. C2 catches this by scoring
5
-base and ft on a packaged set of general-knowledge completions (the
6
-``BUILT_IN_PACK`` — a 30-item seed of public-domain grade-school facts)
7
-and flagging items whose per-token logprob regressed significantly.
8
-
9
-A healthy fine-tune: some items drift slightly (mild confidence shift,
10
-normal), but essentially none regress below a nat of slack. An over-fit
11
-fine-tune: 20%+ of items regress, the adapter has torched its ability
12
-to answer anything outside the document.
13
-
14
-Pass when ``fraction_regressed < assert_fraction_regressed_lt`` AND
15
-``mean_delta_nats >= assert_mean_delta_gte``. Both thresholds default
16
-to values that trigger on genuine damage but tolerate normal drift.
17
-"""
18
-
19
-from __future__ import annotations
20
-
21
-import statistics
22
-from typing import Literal
23
-
24
-from pydantic import Field
25
-
26
-from dlm_sway.core.result import ProbeResult, Verdict
27
-from dlm_sway.probes._calibration_pack import BUILT_IN_PACK
28
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
29
-
30
-
31
-class CalibrationItemSpec(ProbeSpec):
32
-    """Not used directly — documents the shape of an item override."""
33
-
34
-    kind: Literal["__calibration_item"] = "__calibration_item"
35
-    prompt: str = ""
36
-    gold: str = ""
37
-
38
-
39
-class CalibrationDriftSpec(ProbeSpec):
40
-    kind: Literal["calibration_drift"] = "calibration_drift"
41
-    pack: Literal["builtin"] = "builtin"
42
-    """Source of items. ``"builtin"`` uses :data:`BUILT_IN_PACK`. Custom
43
-    packs will ship via a file reference in a later milestone."""
44
-    items_limit: int | None = None
45
-    """If set, truncate the pack to this many items (for fast runs)."""
46
-    assert_fraction_regressed_lt: float = 0.15
47
-    assert_mean_delta_gte: float = -0.5
48
-    """Mean per-token logprob delta (ft − base) across the pack. Slightly
49
-    negative is tolerable; deeply negative is not."""
50
-    regression_nats: float = 1.0
51
-    """How many nats worse an item must get to count as regressed."""
52
-    items: list[tuple[str, str]] = Field(default_factory=list)
53
-    """Optional inline override of the packaged items."""
54
-
55
-
56
-class CalibrationDriftProbe(Probe):
57
-    kind = "calibration_drift"
58
-    spec_cls = CalibrationDriftSpec
59
-    category = "calibration"
60
-
61
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
62
-        assert isinstance(spec, CalibrationDriftSpec)
63
-        items = list(spec.items) if spec.items else list(BUILT_IN_PACK)
64
-        if spec.items_limit is not None:
65
-            items = items[: spec.items_limit]
66
-        if not items:
67
-            return ProbeResult(
68
-                name=spec.name,
69
-                kind=spec.kind,
70
-                verdict=Verdict.ERROR,
71
-                score=None,
72
-                message="no calibration items",
73
-            )
74
-
75
-        deltas: list[float] = []
76
-        regressed = 0
77
-        worst: list[dict[str, float | str]] = []
78
-
79
-        for prompt, gold in items:
80
-            tokens = _token_estimate(gold)  # already clamped to ≥ 1
81
-            with ctx.backend.as_base() as b:
82
-                lp_base = b.logprob_of(prompt, gold) / tokens
83
-            with ctx.backend.as_finetuned() as f:
84
-                lp_ft = f.logprob_of(prompt, gold) / tokens
85
-            delta = lp_ft - lp_base
86
-            deltas.append(delta)
87
-            if delta < -spec.regression_nats:
88
-                regressed += 1
89
-                worst.append({"prompt": prompt, "gold": gold, "delta": delta})
90
-
91
-        # Surface the worst offenders — up to 5.
92
-        worst.sort(key=lambda d: float(d["delta"]))
93
-        worst = worst[:5]
94
-
95
-        frac_regressed = regressed / len(items)
96
-        mean_delta = statistics.fmean(deltas)
97
-
98
-        passed = (
99
-            frac_regressed < spec.assert_fraction_regressed_lt
100
-            and mean_delta >= spec.assert_mean_delta_gte
101
-        )
102
-        verdict = Verdict.PASS if passed else Verdict.FAIL
103
-        # Score: 1.0 at zero regression + zero drift, declining with either.
104
-        regress_component = max(
105
-            0.0, 1.0 - frac_regressed / max(spec.assert_fraction_regressed_lt, 1e-6)
106
-        )
107
-        drift_component = max(0.0, min(1.0, (mean_delta + 1.0) / 1.5))
108
-        score = 0.6 * regress_component + 0.4 * drift_component
109
-
110
-        return ProbeResult(
111
-            name=spec.name,
112
-            kind=spec.kind,
113
-            verdict=verdict,
114
-            score=score,
115
-            raw=frac_regressed,
116
-            base_value=None,
117
-            ft_value=mean_delta,
118
-            evidence={
119
-                "fraction_regressed": frac_regressed,
120
-                "mean_delta_nats": mean_delta,
121
-                "regressed_count": regressed,
122
-                "total_items": len(items),
123
-                "worst_offenders": worst,
124
-                "regression_nats_threshold": spec.regression_nats,
125
-                "weight": spec.weight,
126
-            },
127
-            message=(
128
-                f"{regressed}/{len(items)} items regressed >{spec.regression_nats:.1f} nats "
129
-                f"(frac={frac_regressed:.1%}), mean_delta={mean_delta:+.3f} nats/tok"
130
-            ),
131
-        )
132
-
133
-
134
-def _token_estimate(s: str) -> int:
135
-    return max(1, len(s) // 4)
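
A numeric sketch of the pass criterion under the default thresholds — the deltas below are made-up per-token logprob changes (ft − base) over a ten-item pack:

import statistics

deltas = [0.1, -0.2, 0.0, -1.4, 0.05, -0.1, 0.2, -0.3, 0.0, -0.05]
regression_nats = 1.0
regressed = sum(d < -regression_nats for d in deltas)   # 1 item (−1.4)
frac_regressed = regressed / len(deltas)                # 0.10
mean_delta = statistics.fmean(deltas)                   # −0.17

passed = frac_regressed < 0.15 and mean_delta >= -0.5
print(passed)  # True — one bad item and mild drift is tolerated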
sway/src/dlm_sway/probes/delta_kl.py deleted
@@ -1,121 +0,0 @@
1
-"""A1 DeltaKL — the simplest adherence probe.
2
-
3
-For each prompt, compute the JS (default) or KL divergence between the
4
-base and fine-tuned model's next-token distributions at the position
5
-after the prompt. Aggregate across prompts with a mean.
6
-
7
-*What it tells you:* whether the adapter is distinguishable from the base
8
-on things the document cares about. A zero-divergence result is a red
9
-flag — the adapter is ignored.
10
-
11
-*What it can't tell you:* whether the change is semantically *correct*.
12
-Direction and correctness are what :mod:`preference_flip`, :mod:`adapter_revert`,
13
-and the attribution probes cover.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-import statistics
19
-from typing import Literal
20
-
21
-from pydantic import Field
22
-
23
-from dlm_sway.core.result import ProbeResult, Verdict
24
-from dlm_sway.probes._divergence import Divergence, divergence, js_ln2
25
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
26
-from dlm_sway.probes.null_adapter import get_null_stats
27
-
28
-
29
-class DeltaKLSpec(ProbeSpec):
30
-    """Spec for ``kind: delta_kl``."""
31
-
32
-    kind: Literal["delta_kl"] = "delta_kl"
33
-    prompts: list[str] = Field(default_factory=list)
34
-    """Inline prompts. Must be non-empty at run time; the .dlm bridge
35
-    (:mod:`dlm_sway.integrations.dlm.autogen`) generates and inlines
36
-    these when the suite is auto-built."""
37
-    divergence: Divergence = "js"
38
-    top_k: int | None = None
39
-    """Override the suite-wide ``top_k``. ``None`` → use ``ctx.top_k``."""
40
-    assert_mean_gte: float = 0.02
41
-    """Fixed-threshold pass criterion when no null stats are available."""
42
-    assert_z_gte: float = 3.0
43
-    """Z-score pass criterion against the null-adapter baseline, when it
44
-    exists. The more principled metric — prefer this over the raw
45
-    threshold."""
46
-
47
-
48
-class DeltaKLProbe(Probe):
49
-    """The canonical "is the adapter changing anything?" probe."""
50
-
51
-    kind = "delta_kl"
52
-    spec_cls = DeltaKLSpec
53
-    category = "adherence"
54
-
55
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
56
-        assert isinstance(spec, DeltaKLSpec)
57
-        if not spec.prompts:
58
-            return ProbeResult(
59
-                name=spec.name,
60
-                kind=spec.kind,
61
-                verdict=Verdict.ERROR,
62
-                score=None,
63
-                message="no prompts provided (inline 'prompts' was empty)",
64
-            )
65
-
66
-        top_k = spec.top_k if spec.top_k is not None else ctx.top_k
67
-        divergences: list[float] = []
68
-        for prompt in spec.prompts:
69
-            with ctx.backend.as_base() as base_view:
70
-                base_dist = base_view.next_token_dist(prompt, top_k=top_k)
71
-            with ctx.backend.as_finetuned() as ft_view:
72
-                ft_dist = ft_view.next_token_dist(prompt, top_k=top_k)
73
-            divergences.append(divergence(base_dist, ft_dist, kind=spec.divergence))
74
-
75
-        raw_mean = statistics.fmean(divergences)
76
-        raw_max = max(divergences)
77
-
78
-        # Null-adapter calibration wins when available.
79
-        null = get_null_stats(ctx, spec.kind)
80
-        z = None
81
-        if null is not None and null.get("std", 0.0) > 0.0:
82
-            z = (raw_mean - null["mean"]) / null["std"]
83
-            verdict = Verdict.PASS if z >= spec.assert_z_gte else Verdict.FAIL
84
-            message = f"mean {spec.divergence}={raw_mean:.4f}, z={z:+.2f}σ vs null"
85
-        else:
86
-            verdict = Verdict.PASS if raw_mean >= spec.assert_mean_gte else Verdict.FAIL
87
-            message = (
88
-                f"mean {spec.divergence}={raw_mean:.4f} "
89
-                f"({'≥' if verdict == Verdict.PASS else '<'} {spec.assert_mean_gte})"
90
-            )
91
-
92
-        # Normalized score for composite: JS is bounded by ln(2), so
93
-        # sigmoid-ish on (z, or raw / bound) keeps the number in [0, 1].
94
-        if z is not None:
95
-            score = _sigmoid(z / 3.0)
96
-        else:
97
-            bound = js_ln2() if spec.divergence == "js" else 1.0
98
-            score = min(1.0, raw_mean / bound) if bound > 0.0 else 0.0
99
-
100
-        return ProbeResult(
101
-            name=spec.name,
102
-            kind=spec.kind,
103
-            verdict=verdict,
104
-            score=score,
105
-            raw=raw_mean,
106
-            z_score=z,
107
-            evidence={
108
-                "divergence_kind": spec.divergence,
109
-                "per_prompt": divergences,
110
-                "max": raw_max,
111
-                "num_prompts": len(spec.prompts),
112
-                "weight": spec.weight,
113
-            },
114
-            message=message,
115
-        )
116
-
117
-
118
-def _sigmoid(x: float) -> float:
119
-    import math
120
-
121
-    return 1.0 / (1.0 + math.exp(-x))
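
How the two verdict paths above behave side by side, as plain arithmetic (all values are illustrative):

import math

raw_mean = 0.08                      # mean JS across prompts
null = {"mean": 0.01, "std": 0.01}   # stats from a null_adapter run

z = (raw_mean - null["mean"]) / null["std"]      # 7.0 → PASS (z ≥ 3)
score = 1.0 / (1.0 + math.exp(-z / 3.0))         # ≈ 0.91

# Without null stats, fall back to the fixed threshold:
fallback_pass = raw_mean >= 0.02                       # True
fallback_score = min(1.0, raw_mean / math.log(2.0))    # ≈ 0.115
print(z, round(score, 2), fallback_pass, round(fallback_score, 3))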
sway/src/dlm_sway/probes/leakage.py deleted
@@ -1,194 +0,0 @@
1
-"""C3 LeakageSusceptibility — can the fine-tuned model recite training text verbatim?
2
-
3
-For each PROSE section, take the first ``prefix_chars`` as a trigger and
4
-greedy-generate a continuation. Measure how much of the actual section
5
-continuation the model recovers (via LCS ratio). Also re-run under
6
-small prefix perturbations (typo, case flip, punctuation change) and
7
-report the **fragility** — a genuinely generalized model degrades
8
-smoothly under perturbation; a memorizer drops off a cliff.
9
-
10
-Default pass: ``greedy_recall < 0.5``. That default is tuned for the
11
-common "don't leak my document" use case. Sections tagged ``intent:
12
-memorize`` invert the interpretation — the .dlm bridge handles that
13
-flip at spec-generation time.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-import difflib
19
-import statistics
20
-from typing import Literal
21
-
22
-from pydantic import Field
23
-
24
-from dlm_sway.core.result import ProbeResult, Verdict
25
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
26
-
27
-PerturbationKind = Literal["typo", "case_flip", "drop_punct"]
28
-
29
-
30
-def _default_perturbations() -> list[PerturbationKind]:
31
-    return ["typo", "case_flip", "drop_punct"]
32
-
33
-
34
-class LeakageSusceptibilitySpec(ProbeSpec):
35
-    kind: Literal["leakage"] = "leakage"
36
-    prefix_chars: int = 128
37
-    continuation_chars: int = 256
38
-    max_new_tokens: int = 96
39
-    perturbations: list[PerturbationKind] = Field(default_factory=_default_perturbations)
40
-    assert_recall_lt: float = 0.5
41
-    """Default anti-leak gate: pass when verbatim recall is modest. Invert
42
-    by bumping this to ``>1.0`` when intentional memorization is desired."""
43
-    min_fragility: float = 0.3
44
-    """Fragility = (clean - perturbed) / max(clean, eps). A low value
45
-    with high recall indicates true memorization; a high value suggests
46
-    the model generalized and recall was incidental."""
47
-
48
-
49
-class LeakageSusceptibilityProbe(Probe):
50
-    kind = "leakage"
51
-    spec_cls = LeakageSusceptibilitySpec
52
-    category = "calibration"
53
-
54
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
55
-        assert isinstance(spec, LeakageSusceptibilitySpec)
56
-        if ctx.sections is None:
57
-            return ProbeResult(
58
-                name=spec.name,
59
-                kind=spec.kind,
60
-                verdict=Verdict.SKIP,
61
-                score=None,
62
-                message="no sections in context — provide via the .dlm bridge",
63
-            )
64
-        prose = [s for s in ctx.sections if s.kind == "prose" and s.content.strip()]
65
-        if not prose:
66
-            return ProbeResult(
67
-                name=spec.name,
68
-                kind=spec.kind,
69
-                verdict=Verdict.SKIP,
70
-                score=None,
71
-                message="no PROSE sections to test for leakage",
72
-            )
73
-
74
-        clean_recalls: list[float] = []
75
-        perturbed_recalls: list[float] = []
76
-        per_section: list[dict[str, float | str]] = []
77
-
78
-        with ctx.backend.as_finetuned() as ft:
79
-            for s in prose:
80
-                prefix = s.content[: spec.prefix_chars]
81
-                target = s.content[spec.prefix_chars : spec.prefix_chars + spec.continuation_chars]
82
-                if not target.strip():
83
-                    continue
84
-                clean_gen = ft.generate(prefix, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
85
-                clean = _lcs_ratio(clean_gen, target)
86
-                clean_recalls.append(clean)
87
-
88
-                per_sec_perturbed: list[float] = []
89
-                for perturbation in spec.perturbations:
90
-                    perturbed_prefix = _perturb(prefix, perturbation)
91
-                    perturbed_gen = ft.generate(
92
-                        perturbed_prefix,
93
-                        max_new_tokens=spec.max_new_tokens,
94
-                        seed=ctx.seed,
95
-                    )
96
-                    per_sec_perturbed.append(_lcs_ratio(perturbed_gen, target))
97
-                mean_pert = statistics.fmean(per_sec_perturbed) if per_sec_perturbed else clean
98
-                perturbed_recalls.append(mean_pert)
99
-
100
-                per_section.append(
101
-                    {
102
-                        "section_id": s.id,
103
-                        "clean_recall": clean,
104
-                        "perturbed_recall": mean_pert,
105
-                        "fragility": _fragility(clean, mean_pert),
106
-                    }
107
-                )
108
-
109
-        if not clean_recalls:
110
-            return ProbeResult(
111
-                name=spec.name,
112
-                kind=spec.kind,
113
-                verdict=Verdict.SKIP,
114
-                score=None,
115
-                message="no PROSE sections had scorable continuations",
116
-            )
117
-
118
-        mean_clean = statistics.fmean(clean_recalls)
119
-        mean_pert = statistics.fmean(perturbed_recalls)
120
-        mean_fragility = _fragility(mean_clean, mean_pert)
121
-
122
-        verdict = (
123
-            Verdict.PASS
124
-            if mean_clean < spec.assert_recall_lt or mean_fragility >= spec.min_fragility
125
-            else Verdict.FAIL
126
-        )
127
-        # Score: 1.0 at zero recall, declining as recall approaches threshold.
128
-        recall_score = max(0.0, min(1.0, 1.0 - mean_clean / max(spec.assert_recall_lt, 1e-6)))
129
-        # Bonus: high fragility is good (genuine generalization).
130
-        fragility_bonus = min(1.0, max(0.0, mean_fragility / max(spec.min_fragility, 1e-6)))
131
-        score = 0.7 * recall_score + 0.3 * fragility_bonus
132
-
133
-        return ProbeResult(
134
-            name=spec.name,
135
-            kind=spec.kind,
136
-            verdict=verdict,
137
-            score=score,
138
-            raw=mean_clean,
139
-            base_value=None,
140
-            ft_value=mean_fragility,
141
-            evidence={
142
-                "mean_clean_recall": mean_clean,
143
-                "mean_perturbed_recall": mean_pert,
144
-                "mean_fragility": mean_fragility,
145
-                "per_section": per_section[:10],
146
-                "weight": spec.weight,
147
-            },
148
-            message=(
149
-                f"greedy_recall={mean_clean:.2f} "
150
-                f"(perturbed={mean_pert:.2f}, fragility={mean_fragility:.2f})"
151
-            ),
152
-        )
153
-
154
-
155
-# -- helpers -----------------------------------------------------------
156
-
157
-
158
-def _lcs_ratio(generated: str, target: str) -> float:
159
-    """Longest common subsequence ratio via difflib.
160
-
161
-    Returns 0 for empty inputs, 1.0 for identical strings. difflib's
162
-    ``ratio`` is a gestalt similarity; close enough to a true LCS for
163
-    our purposes and has no external deps.
164
-    """
165
-    if not generated or not target:
166
-        return 0.0
167
-    return difflib.SequenceMatcher(None, generated, target).ratio()
168
-
169
-
170
-def _perturb(text: str, kind: str) -> str:
171
-    """Apply a deterministic textual perturbation."""
172
-    if not text:
173
-        return text
174
-    if kind == "typo":
175
-        # Swap the first two characters; trivial typo the model must reconstruct.
176
-        if len(text) < 2:
177
-            return text
178
-        return text[1] + text[0] + text[2:]
179
-    if kind == "case_flip":
180
-        # Flip case of the first alpha char.
181
-        for i, ch in enumerate(text):
182
-            if ch.isalpha():
183
-                flipped = ch.lower() if ch.isupper() else ch.upper()
184
-                return text[:i] + flipped + text[i + 1 :]
185
-        return text
186
-    if kind == "drop_punct":
187
-        return "".join(ch for ch in text if ch not in ".,;:!?-—")
188
-    raise ValueError(f"unknown perturbation: {kind!r}")
189
-
190
-
191
-def _fragility(clean: float, perturbed: float) -> float:
192
-    if clean <= 0.0:
193
-        return 0.0
194
-    return max(0.0, (clean - perturbed) / clean)
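
The recall/fragility pairing on toy strings — difflib's gestalt ratio stands in for LCS exactly as in ``_lcs_ratio`` above:

import difflib

def lcs_ratio(generated: str, target: str) -> float:
    if not generated or not target:
        return 0.0
    return difflib.SequenceMatcher(None, generated, target).ratio()

target = "the adapter internalized the document's key claims"
clean_gen = "the adapter internalized the document's key claims"  # verbatim
pert_gen = "the model summarizes the main points of the text"     # post-typo

clean = lcs_ratio(clean_gen, target)            # 1.0 — full verbatim recall
pert = lcs_ratio(pert_gen, target)              # much lower
fragility = max(0.0, (clean - pert) / clean)    # high → memorizer signature
print(f"clean={clean:.2f} perturbed={pert:.2f} fragility={fragility:.2f}")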
sway/src/dlm_sway/probes/null_adapter.py deleted
@@ -1,144 +0,0 @@
1
-"""Null-adapter baseline probe.
2
-
3
-Every numeric primitive reports its raw metric *and* a z-score against a
4
-null-adapter distribution. This probe is the runtime engine that
5
-establishes that distribution — it builds random-init "null" adapters
6
-(structurally identical to the real adapter but with weights drawn from
7
-a Gaussian) and measures how much signal they produce.
8
-
9
-The resulting ``(mean, std, n)`` per kind is attached to this probe's
10
-``evidence["null_stats"]``. The runner picks it up and threads it into
11
-:attr:`RunContext.null_stats`, where every downstream probe can read it
12
-and turn a raw metric into a z-score.
13
-
14
-Backends that don't implement :class:`~dlm_sway.core.scoring.NullCalibratedBackend`
15
-cause this probe to :attr:`Verdict.SKIP` — downstream probes fall back
16
-to their fixed thresholds in that case.
17
-"""
18
-
19
-from __future__ import annotations
20
-
21
-import statistics
22
-from typing import Literal
23
-
24
-from pydantic import Field
25
-
26
-from dlm_sway.core.result import ProbeResult, Verdict
27
-from dlm_sway.core.scoring import NullCalibratedBackend
28
-from dlm_sway.probes._divergence import divergence
29
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
30
-
31
-
32
-class NullAdapterSpec(ProbeSpec):
33
-    """Spec for ``kind: null_adapter``.
34
-
35
-    Authors place this probe **first** in the suite so its output
36
-    populates :attr:`RunContext.null_stats` before subsequent probes
37
-    consult it.
38
-    """
39
-
40
-    kind: Literal["null_adapter"] = "null_adapter"
41
-    runs: int = Field(default=3, ge=1, le=10)
42
-    """Number of independent null adapters to evaluate. Three is the
43
-    smallest that yields a usable std; more is better but quickly
44
-    dominates suite runtime."""
45
-    prompts: list[str] = Field(default_factory=list)
46
-    """Prompt set for null calibration. Keep small — calibration runs
47
-    ``2 × runs × len(prompts)`` forward passes (base + null per prompt). 4–8 prompts is typical.
48
-    If empty, a minimal built-in prompt set is used so the probe
49
-    always produces stats."""
50
-    init_scale: float = 0.02
51
-    """Stddev of the zero-mean Gaussian used to fill lora_A/lora_B."""
52
-    seed_base: int = 1000
53
-    """First seed; successive runs use ``seed_base + run_idx``."""
54
-
55
-
56
-_DEFAULT_PROMPTS: tuple[str, ...] = (
57
-    "The quick brown fox",
58
-    "Once upon a time",
59
-    "In this document we explain",
60
-    "The key takeaway is",
61
-    "An important point to remember",
62
-)
63
-
64
-
65
-class NullAdapterProbe(Probe):
66
-    """Populate ``ctx.null_stats``; report a :attr:`Verdict.PASS` verdict itself.
67
-
68
-    The probe never fails on its own terms — its *job* is calibration.
69
-    Downstream probes pick up :attr:`RunContext.null_stats` keyed by
70
-    probe kind (``delta_kl``, ``adapter_ablation`` …) and use the
71
-    populated mean/std to z-score their own raw metrics.
72
-    """
73
-
74
-    kind = "null_adapter"
75
-    spec_cls = NullAdapterSpec
76
-    category = "baseline"
77
-
78
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
79
-        assert isinstance(spec, NullAdapterSpec)
80
-        if not isinstance(ctx.backend, NullCalibratedBackend):
81
-            return ProbeResult(
82
-                name=spec.name,
83
-                kind=spec.kind,
84
-                verdict=Verdict.SKIP,
85
-                score=None,
86
-                message=(
87
-                    "backend does not implement NullCalibratedBackend — "
88
-                    "numeric probes will fall back to fixed thresholds"
89
-                ),
90
-            )
91
-        prompts = list(spec.prompts) or list(_DEFAULT_PROMPTS)
92
-
93
-        per_seed_means: list[float] = []
94
-        for run_idx in range(spec.runs):
95
-            seed = spec.seed_base + run_idx
96
-            per_prompt: list[float] = []
97
-            for prompt in prompts:
98
-                with ctx.backend.as_base() as base_view:
99
-                    base_dist = base_view.next_token_dist(prompt, top_k=ctx.top_k)
100
-                with ctx.backend.as_null_adapter(seed, init_scale=spec.init_scale) as null_view:
101
-                    null_dist = null_view.next_token_dist(prompt, top_k=ctx.top_k)
102
-                per_prompt.append(divergence(base_dist, null_dist, kind="js"))
103
-            per_seed_means.append(statistics.fmean(per_prompt) if per_prompt else 0.0)
104
-
105
-        mean = statistics.fmean(per_seed_means)
106
-        std = statistics.pstdev(per_seed_means) if len(per_seed_means) > 1 else 0.0
107
-
108
-        # Publish per-kind stats. delta_kl is the primary kind; other
109
-        # divergence-based probes (adapter_ablation) share this scale.
110
-        null_stats = {
111
-            "delta_kl": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)},
112
-            "adapter_ablation": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)},
113
-        }
114
-
115
-        return ProbeResult(
116
-            name=spec.name,
117
-            kind=spec.kind,
118
-            verdict=Verdict.PASS,
119
-            score=1.0,
120
-            raw=mean,
121
-            evidence={
122
-                "null_stats": null_stats,
123
-                "per_seed_mean_js": per_seed_means,
124
-                "init_scale": spec.init_scale,
125
-                "runs": spec.runs,
126
-                "num_prompts": len(prompts),
127
-                "weight": spec.weight,
128
-            },
129
-            message=(
130
-                f"null JS divergence μ={mean:.4f} ± {std:.4f} "
131
-                f"(over {spec.runs} seeds × {len(prompts)} prompts) — "
132
-                f"downstream probes will z-score against this baseline"
133
-            ),
134
-        )
135
-
136
-
137
-def get_null_stats(ctx: RunContext, probe_kind: str) -> dict[str, float] | None:
138
-    """Look up null-adapter stats for ``probe_kind``.
139
-
140
-    Returns ``{"mean": …, "std": …, "n": …}`` when calibration ran for
141
-    this kind, else ``None``. Probes treat ``None`` as "fall back to the
142
-    fixed threshold from your spec."
143
-    """
144
-    return ctx.null_stats.get(probe_kind)
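A sketch of the downstream consumption path (the 3.0σ cut and the fallback name are illustrative, not from the deleted source):

    stats = get_null_stats(ctx, "delta_kl")
    if stats is None:
        # Calibration never ran (backend SKIPped): use the spec's fixed threshold.
        passed = raw >= fixed_threshold           # hypothetical fallback value
    else:
        z = (raw - stats["mean"]) / stats["std"]  # std is floored at 1e-6 above
        passed = z >= 3.0                         # illustrative significance cut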
sway/src/dlm_sway/probes/paraphrase_invariance.pydeleted
@@ -1,148 +0,0 @@
1
-"""B2 ParaphraseInvariance — memorization vs generalization, per case.
2
-
3
-For each ``(prompt, gold, paraphrases)`` test case:
4
-
5
-- ``verbatim_lift``:  Δ-per-token = logprob_ft(prompt, gold) - logprob_base(prompt, gold)
6
-- ``paraphrase_lift``: mean Δ-per-token over the paraphrased prompts
7
-
8
-A model that memorized the exact prompt has high ``verbatim_lift`` but
9
-near-zero ``paraphrase_lift``. A model that learned the underlying
10
-*pattern* has both values positive and close to each other.
11
-
12
-We report:
13
-
14
-- ``generalization_ratio = paraphrase_lift / verbatim_lift`` (defined as 0 when ``verbatim_lift`` is ≈0)
15
-- a verbatim-lift sanity check: whether the adapter significantly moved
16
-  the verbatim-prompt logprob
17
-
18
-The pass criterion depends on the stated intent: by default we require
19
-both high verbatim lift and high generalization ratio. If the spec's
20
-``intent`` is ``"memorize"``, the ratio requirement inverts — we *want*
21
-verbatim >> paraphrase.
22
-"""
23
-
24
-from __future__ import annotations
25
-
26
-import statistics
27
-from typing import Literal
28
-
29
-from pydantic import BaseModel, ConfigDict, Field
30
-
31
-from dlm_sway.core.result import ProbeResult, Verdict
32
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
33
-
34
-Intent = Literal["generalize", "memorize", "both"]
35
-
36
-
37
-class ParaphraseCase(BaseModel):
38
-    """One paraphrase-invariance case."""
39
-
40
-    model_config = ConfigDict(extra="forbid", frozen=True)
41
-
42
-    prompt: str
43
-    gold: str
44
-    paraphrases: list[str] = Field(default_factory=list, min_length=1)
45
-
46
-
47
-class ParaphraseInvarianceSpec(ProbeSpec):
48
-    kind: Literal["paraphrase_invariance"] = "paraphrase_invariance"
49
-    cases: list[ParaphraseCase] = Field(default_factory=list)
50
-    intent: Intent = "generalize"
51
-    min_verbatim_lift: float = 0.2
52
-    min_generalization_ratio: float = 0.5
53
-    max_generalization_ratio_if_memorize: float = 0.5
54
-
55
-
56
-class ParaphraseInvarianceProbe(Probe):
57
-    kind = "paraphrase_invariance"
58
-    spec_cls = ParaphraseInvarianceSpec
59
-    category = "attribution"
60
-
61
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
62
-        assert isinstance(spec, ParaphraseInvarianceSpec)
63
-        if not spec.cases:
64
-            return ProbeResult(
65
-                name=spec.name,
66
-                kind=spec.kind,
67
-                verdict=Verdict.ERROR,
68
-                score=None,
69
-                message="no cases provided",
70
-            )
71
-
72
-        verbatim_lifts: list[float] = []
73
-        paraphrase_lifts: list[float] = []
74
-        per_case: list[dict[str, float | str]] = []
75
-
76
-        for case in spec.cases:
77
-            tokens = max(_token_estimate(case.gold), 1)
78
-            with ctx.backend.as_base() as b:
79
-                lp_base_verb = b.logprob_of(case.prompt, case.gold) / tokens
80
-                lp_base_par = [b.logprob_of(p, case.gold) / tokens for p in case.paraphrases]
81
-            with ctx.backend.as_finetuned() as f:
82
-                lp_ft_verb = f.logprob_of(case.prompt, case.gold) / tokens
83
-                lp_ft_par = [f.logprob_of(p, case.gold) / tokens for p in case.paraphrases]
84
-
85
-            verb_lift = lp_ft_verb - lp_base_verb
86
-            par_lift = statistics.fmean(
87
-                (ft - base) for base, ft in zip(lp_base_par, lp_ft_par, strict=True)
88
-            )
89
-            verbatim_lifts.append(verb_lift)
90
-            paraphrase_lifts.append(par_lift)
91
-            per_case.append(
92
-                {
93
-                    "prompt": case.prompt[:80],
94
-                    "verbatim_lift": verb_lift,
95
-                    "paraphrase_lift": par_lift,
96
-                }
97
-            )
98
-
99
-        mean_verb = statistics.fmean(verbatim_lifts)
100
-        mean_par = statistics.fmean(paraphrase_lifts)
101
-        ratio = mean_par / mean_verb if abs(mean_verb) > 1e-9 else 0.0
102
-
103
-        verdict, score, msg = _decide(spec, mean_verb, mean_par, ratio)
104
-
105
-        return ProbeResult(
106
-            name=spec.name,
107
-            kind=spec.kind,
108
-            verdict=verdict,
109
-            score=score,
110
-            raw=ratio,
111
-            base_value=mean_verb,
112
-            ft_value=mean_par,
113
-            evidence={
114
-                "verbatim_lift_mean": mean_verb,
115
-                "paraphrase_lift_mean": mean_par,
116
-                "generalization_ratio": ratio,
117
-                "intent": spec.intent,
118
-                "per_case": per_case[:8],
119
-                "weight": spec.weight,
120
-            },
121
-            message=msg,
122
-        )
123
-
124
-
125
-def _decide(
126
-    spec: ParaphraseInvarianceSpec, verb: float, par: float, ratio: float
127
-) -> tuple[Verdict, float, str]:
128
-    """Apply the intent-aware pass rule and return (verdict, score, message)."""
129
-    base_msg = f"verb={verb:+.3f}, para={par:+.3f}, ratio={ratio:.2f}"
130
-    if spec.intent == "memorize":
131
-        verd = (
132
-            Verdict.PASS
133
-            if verb >= spec.min_verbatim_lift and ratio <= spec.max_generalization_ratio_if_memorize
134
-            else Verdict.FAIL
135
-        )
136
-        score = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6)))
137
-        return verd, score, f"{base_msg} — intent=memorize"
138
-    # Default: generalize (or "both")
139
-    passed = verb >= spec.min_verbatim_lift and ratio >= spec.min_generalization_ratio
140
-    verd = Verdict.PASS if passed else Verdict.FAIL
141
-    gen_component = min(1.0, max(0.0, ratio / max(spec.min_generalization_ratio, 1e-6)))
142
-    verb_component = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6)))
143
-    score = 0.5 * gen_component + 0.5 * verb_component
144
-    return verd, score, f"{base_msg} — intent={spec.intent}"
145
-
146
-
147
-def _token_estimate(s: str) -> int:
148
-    return max(1, len(s) // 4)
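A worked pass of ``_decide`` with invented lifts, assuming a spec left at its defaults:

    spec = ParaphraseInvarianceSpec(name="b2")   # defaults: intent=generalize, 0.2 / 0.5
    # Adapter lifted the verbatim logprob by 0.30 nats/token and the
    # paraphrased prompts by 0.18, so ratio = 0.60.
    verdict, score, msg = _decide(spec, 0.30, 0.18, 0.60)
    # intent="generalize": 0.30 >= 0.2 and 0.60 >= 0.5  -> PASS
    # score = 0.5 * min(1, 0.60/0.5) + 0.5 * min(1, 0.30/0.2) = 1.0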
sway/src/dlm_sway/probes/preference_flip.pydeleted
@@ -1,140 +0,0 @@
1
-"""B3 PreferenceFlip — did DPO/ORPO actually flip the chosen/rejected ranking?
2
-
3
-For each ``(prompt, chosen, rejected)`` triple, compute the margin
4
-
5
-.. math::
6
-    m = \\log p(\\text{chosen} \\mid \\text{prompt}) - \\log p(\\text{rejected} \\mid \\text{prompt})
7
-
8
-under both base and fine-tuned views. Interesting triples are the ones
9
-where base got the sign *wrong* (``m_base < 0``); we fail if the
10
-fine-tune doesn't flip a large enough fraction of them.
11
-
12
-Triples come from either an inline ``triples:`` block in the spec or
13
-from PREFERENCE sections in :attr:`RunContext.sections`. The probe
14
-returns :attr:`Verdict.SKIP` when no triples are present — this is the
15
-"no PREFERENCE sections in your document" case, graceful by design.
16
-"""
17
-
18
-from __future__ import annotations
19
-
20
-import statistics
21
-from typing import Literal
22
-
23
-from pydantic import BaseModel, ConfigDict, Field
24
-
25
-from dlm_sway.core.result import ProbeResult, Verdict
26
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
27
-
28
-
29
-class PreferenceTriple(BaseModel):
30
-    model_config = ConfigDict(extra="forbid", frozen=True)
31
-
32
-    prompt: str
33
-    chosen: str
34
-    rejected: str
35
-
36
-
37
-class PreferenceFlipSpec(ProbeSpec):
38
-    kind: Literal["preference_flip"] = "preference_flip"
39
-    triples: list[PreferenceTriple] = Field(default_factory=list)
40
-    """Inline triples. If empty, the probe pulls from PREFERENCE
41
-    sections in ctx.sections; if neither is available the probe SKIPs."""
42
-    assert_flip_rate_gte: float = 0.7
43
-    """Fraction of *base-wrong* triples that must flip under ft."""
44
-    min_triples_for_decision: int = 3
45
-
46
-
47
-class PreferenceFlipProbe(Probe):
48
-    kind = "preference_flip"
49
-    spec_cls = PreferenceFlipSpec
50
-    category = "attribution"
51
-
52
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
53
-        assert isinstance(spec, PreferenceFlipSpec)
54
-        triples = list(spec.triples) or _triples_from_sections(ctx)
55
-        if not triples:
56
-            return ProbeResult(
57
-                name=spec.name,
58
-                kind=spec.kind,
59
-                verdict=Verdict.SKIP,
60
-                score=None,
61
-                message="no preference triples (inline or from sections)",
62
-            )
63
-
64
-        base_margins: list[float] = []
65
-        ft_margins: list[float] = []
66
-        for t in triples:
67
-            with ctx.backend.as_base() as b:
68
-                base_margins.append(
69
-                    b.logprob_of(t.prompt, t.chosen) - b.logprob_of(t.prompt, t.rejected)
70
-                )
71
-            with ctx.backend.as_finetuned() as f:
72
-                ft_margins.append(
73
-                    f.logprob_of(t.prompt, t.chosen) - f.logprob_of(t.prompt, t.rejected)
74
-                )
75
-
76
-        # Interesting denominator: base got it wrong.
77
-        base_wrong_idx = [i for i, m in enumerate(base_margins) if m < 0]
78
-        flipped_idx = [i for i in base_wrong_idx if ft_margins[i] > 0]
79
-
80
-        if len(base_wrong_idx) < spec.min_triples_for_decision:
81
-            # Not enough base-wrong triples to decide. Fall back to mean margin delta.
82
-            mean_delta = statistics.fmean(
83
-                (ft - base) for base, ft in zip(base_margins, ft_margins, strict=True)
84
-            )
85
-            verdict = Verdict.WARN
86
-            return ProbeResult(
87
-                name=spec.name,
88
-                kind=spec.kind,
89
-                verdict=verdict,
90
-                score=max(0.0, min(1.0, 0.5 + mean_delta / 4.0)),
91
-                raw=mean_delta,
92
-                base_value=statistics.fmean(base_margins),
93
-                ft_value=statistics.fmean(ft_margins),
94
-                evidence={
95
-                    "base_wrong": len(base_wrong_idx),
96
-                    "total": len(triples),
97
-                    "mean_margin_delta": mean_delta,
98
-                    "weight": spec.weight,
99
-                },
100
-                message=(
101
-                    f"only {len(base_wrong_idx)} base-wrong triples < "
102
-                    f"{spec.min_triples_for_decision} required; reporting mean-margin-delta={mean_delta:+.3f}"
103
-                ),
104
-            )
105
-
106
-        flip_rate = len(flipped_idx) / len(base_wrong_idx)
107
-        verdict = Verdict.PASS if flip_rate >= spec.assert_flip_rate_gte else Verdict.FAIL
108
-        score = min(1.0, flip_rate / max(spec.assert_flip_rate_gte, 1e-6))
109
-        return ProbeResult(
110
-            name=spec.name,
111
-            kind=spec.kind,
112
-            verdict=verdict,
113
-            score=score,
114
-            raw=flip_rate,
115
-            base_value=statistics.fmean(base_margins),
116
-            ft_value=statistics.fmean(ft_margins),
117
-            evidence={
118
-                "flip_rate": flip_rate,
119
-                "flipped": len(flipped_idx),
120
-                "base_wrong": len(base_wrong_idx),
121
-                "total": len(triples),
122
-                "weight": spec.weight,
123
-            },
124
-            message=(
125
-                f"flip_rate={flip_rate:.2%} ({len(flipped_idx)}/{len(base_wrong_idx)} "
126
-                f"base-wrong triples flipped by ft)"
127
-            ),
128
-        )
129
-
130
-
131
-def _triples_from_sections(ctx: RunContext) -> list[PreferenceTriple]:
132
-    if ctx.sections is None:
133
-        return []
134
-    out: list[PreferenceTriple] = []
135
-    for s in ctx.sections:
136
-        if s.kind != "preference":
137
-            continue
138
-        for p in s.preferences:
139
-            out.append(PreferenceTriple(prompt=p.prompt, chosen=p.chosen, rejected=p.rejected))
140
-    return out
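To make the denominator concrete, a toy run (margins invented):

    base_margins = [-0.4, -0.1, 0.3, -0.2]   # three base-wrong (negative) triples
    ft_margins   = [ 0.5,  0.2, 0.4, -0.1]   # ft flips two of the three
    # base_wrong_idx = [0, 1, 3]; flipped_idx = [0, 1]  -> flip_rate = 2/3
    # Against the default assert_flip_rate_gte=0.7: FAIL, score = min(1, (2/3)/0.7) ≈ 0.95.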
sway/src/dlm_sway/probes/prompt_collapse.pydeleted
@@ -1,159 +0,0 @@
1
-"""A3 PromptCollapse — does adapter influence decay with context length?
2
-
3
-For each test prompt we prepend irrelevant "stuffing" of varying length
4
-and measure ``divergence(base, ft)`` at the final position. A healthy
5
-adapter shows a modest, slow decay; a degenerate one collapses quickly
6
-— its signal evaporates once the base has a lot of context to lean on.
7
-
8
-We fit an exponential decay ``KL(L) = KL0 * exp(-L / tau)`` in log
9
-space and report the half-life ``tau * ln(2)`` in tokens. Pass if the half-life is at
10
-least :attr:`PromptCollapseSpec.assert_half_life_tokens` — which
11
-defaults to half the default sequence length.
12
-
13
-All math is numpy-only to avoid a scipy dependency on the install path.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-from typing import Literal
19
-
20
-import numpy as np
21
-from pydantic import Field
22
-
23
-from dlm_sway.core.result import ProbeResult, Verdict
24
-from dlm_sway.probes._divergence import Divergence, divergence
25
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
26
-
27
-# A neutral, token-dense piece of text we prepend to stress the base
28
-# model's long-context handling. Deliberately low-information so the
29
-# "answer" at the end is the only thing driving next-token predictions.
30
-_STUFFING = (
31
-    "The following log lines are archived for historical record and have no "
32
-    "bearing on the question that follows. They are retained for audit purposes "
33
-    "only and should be ignored when forming an answer. "
34
-)
35
-
36
-
37
-class PromptCollapseSpec(ProbeSpec):
38
-    kind: Literal["prompt_collapse"] = "prompt_collapse"
39
-    prompts: list[str] = Field(default_factory=list, min_length=0)
40
-    context_lengths: list[int] = Field(
41
-        default_factory=lambda: [0, 256, 512, 1024],
42
-        min_length=2,
43
-    )
44
-    """Approximate token counts of stuffing to prepend. ≥2 required
45
-    because the exponential fit is undefined for a single point."""
46
-    divergence: Divergence = "js"
47
-    top_k: int | None = None
48
-    assert_half_life_tokens: int = 512
49
-    """Minimum half-life to pass. Default is deliberately permissive —
50
-    tune upward for high-stakes deployments."""
51
-
52
-
53
-class PromptCollapseProbe(Probe):
54
-    kind = "prompt_collapse"
55
-    spec_cls = PromptCollapseSpec
56
-    category = "adherence"
57
-
58
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
59
-        assert isinstance(spec, PromptCollapseSpec)
60
-        if not spec.prompts:
61
-            return ProbeResult(
62
-                name=spec.name,
63
-                kind=spec.kind,
64
-                verdict=Verdict.ERROR,
65
-                score=None,
66
-                message="no prompts provided",
67
-            )
68
-
69
-        top_k = spec.top_k if spec.top_k is not None else ctx.top_k
70
-        # Mean divergence at each context length.
71
-        mean_divs: list[float] = []
72
-        for ctx_len in spec.context_lengths:
73
-            prefix = _stuffing(ctx_len)
74
-            divs: list[float] = []
75
-            for prompt in spec.prompts:
76
-                full_prompt = prefix + prompt
77
-                with ctx.backend.as_base() as bv:
78
-                    base_dist = bv.next_token_dist(full_prompt, top_k=top_k)
79
-                with ctx.backend.as_finetuned() as fv:
80
-                    ft_dist = fv.next_token_dist(full_prompt, top_k=top_k)
81
-                divs.append(divergence(base_dist, ft_dist, kind=spec.divergence))
82
-            mean_divs.append(float(np.mean(divs)))
83
-
84
-        half_life = _fit_half_life(
85
-            np.asarray(spec.context_lengths, dtype=np.float64),
86
-            np.asarray(mean_divs, dtype=np.float64),
87
-        )
88
-
89
-        verdict = (
90
-            Verdict.PASS
91
-            if half_life is not None and half_life >= spec.assert_half_life_tokens
92
-            else Verdict.FAIL
93
-        )
94
-        score = _score(half_life, spec.assert_half_life_tokens)
95
-
96
-        msg = (
97
-            f"half-life={half_life:.0f} tokens"
98
-            if half_life is not None
99
-            else "could not fit exponential decay (too flat or non-monotonic)"
100
-        )
101
-        return ProbeResult(
102
-            name=spec.name,
103
-            kind=spec.kind,
104
-            verdict=verdict,
105
-            score=score,
106
-            raw=half_life,
107
-            evidence={
108
-                "context_lengths": spec.context_lengths,
109
-                "mean_divergence_per_length": mean_divs,
110
-                "divergence_kind": spec.divergence,
111
-                "weight": spec.weight,
112
-            },
113
-            message=msg,
114
-        )
115
-
116
-
117
-def _stuffing(target_tokens: int) -> str:
118
-    """Approximate target-length stuffing. 4 chars ≈ 1 token is fine
119
-    for SentencePiece-style tokenizers at the order-of-magnitude level."""
120
-    if target_tokens <= 0:
121
-        return ""
122
-    # Repeat enough copies to hit the target length in characters.
123
-    target_chars = target_tokens * 4
124
-    reps = (target_chars // len(_STUFFING)) + 1
125
-    return (_STUFFING * reps)[:target_chars] + "\n\n"
126
-
127
-
128
-def _fit_half_life(lengths: np.ndarray, divergences: np.ndarray) -> float | None:
129
-    """Fit ``y = a * exp(-x / h)`` via log-space linear regression.
130
-
131
-    Returns ``None`` if the divergences aren't strictly positive or the
132
-    fit is non-decreasing (i.e. the fine-tune got *more* distinct with
133
-    context, which invalidates the half-life concept).
134
-    """
135
-    if (divergences <= 0.0).any():
136
-        # Can't take a log; treat near-zero as too-flat-to-fit.
137
-        return None
138
-    log_y = np.log(divergences)
139
-    # Standard linear regression slope.
140
-    x_mean = float(lengths.mean())
141
-    y_mean = float(log_y.mean())
142
-    denom = float(((lengths - x_mean) ** 2).sum())
143
-    if denom == 0.0:
144
-        return None
145
-    slope = float(((lengths - x_mean) * (log_y - y_mean)).sum()) / denom
146
-    if slope >= 0.0:
147
-        # Signal grew with context — can't express as half-life.
148
-        return None
149
-    # Slope = -1/h → h = -1/slope → half_life = ln(2) * h.
150
-    import math
151
-
152
-    return float(math.log(2.0) * (-1.0 / slope))
153
-
154
-
155
-def _score(half_life: float | None, target: int) -> float:
156
-    if half_life is None:
157
-        return 0.0
158
-    # Linear ramp: half_life / target, clipped so the score saturates at 1.0 at the target.
159
-    return float(min(1.0, half_life / max(target, 1)))
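Sanity check for the fit, with divergences constructed to halve every 512 tokens:

    import numpy as np
    lengths = np.array([0.0, 256.0, 512.0, 1024.0])
    divs = 0.08 * np.exp(-lengths * np.log(2.0) / 512.0)   # 0.08, 0.057, 0.04, 0.02
    _fit_half_life(lengths, divs)   # -> 512.0, exactly the default threshold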
sway/src/dlm_sway/probes/section_internalization.pydeleted
@@ -1,189 +0,0 @@
1
-"""B1 SectionInternalizationScore — the flagship attribution primitive.
2
-
3
-For each typed section of the training document, measure *how much the
4
-fine-tune moved the needle on that section's own content* — and subtract
5
-the same metric measured on *other* sections' content. The difference is
6
-the "effective SIS": signal attributable to *this* section, not to a
7
-broader lift across the whole document.
8
-
9
-Output is a per-section bar chart. In practice users see that sections
10
-2 and 7 actually moved the model, sections 3 and 5 did nothing, and
11
-section 11 moved it but also leaked into unrelated content — actionable
12
-signal for document authoring that no other eval tool provides.
13
-
14
-Math per section ``s`` with measurement function ``m(probe_set)``:
15
-
16
-.. math::
17
-    sis_s^{own}  &= (m_{base}(s) - m_{ft}(s)) / m_{base}(s) \\\\
18
-    sis_s^{leak} &= (m_{base}(\\bar{s}) - m_{ft}(\\bar{s})) / m_{base}(\\bar{s}) \\\\
19
-    effective_s  &= sis_s^{own} - sis_s^{leak}
20
-
21
-For PROSE sections, ``m`` is the average NLL per token over the
22
-section's content. For INSTRUCTION and PREFERENCE sections, ``m`` is the
23
-average NLL per token over the answer/chosen spans given their prompts.
24
-"""
25
-
26
-from __future__ import annotations
27
-
28
-import statistics
29
-from typing import Literal
30
-
31
-from pydantic import Field
32
-
33
-from dlm_sway.core.result import ProbeResult, Verdict
34
-from dlm_sway.core.scoring import ScoringBackend
35
-from dlm_sway.core.sections import Section, SectionKind
36
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
37
-
38
-
39
-def _default_include_kinds() -> list[SectionKind]:
40
-    return ["prose", "instruction", "preference"]
41
-
42
-
43
-class SectionInternalizationSpec(ProbeSpec):
44
-    kind: Literal["section_internalization"] = "section_internalization"
45
-    include_kinds: list[SectionKind] = Field(default_factory=_default_include_kinds)
46
-    per_section_threshold: float = 0.05
47
-    """Minimum ``effective_sis`` for a section to be marked PASS."""
48
-    assert_passing_section_frac: float = 0.5
49
-    """Probe-level pass criterion: fraction of sections that must clear
50
-    the per-section threshold."""
51
-    max_prose_chars: int = 2000
52
-    """Cap the length of PROSE content we score to keep runtime bounded.
53
-    Content beyond the cap is truncated, not scored."""
54
-
55
-
56
-class SectionInternalizationProbe(Probe):
57
-    kind = "section_internalization"
58
-    spec_cls = SectionInternalizationSpec
59
-    category = "attribution"
60
-
61
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
62
-        assert isinstance(spec, SectionInternalizationSpec)
63
-        if ctx.sections is None or len(ctx.sections) == 0:
64
-            return ProbeResult(
65
-                name=spec.name,
66
-                kind=spec.kind,
67
-                verdict=Verdict.SKIP,
68
-                score=None,
69
-                message="no sections in context — provide via the .dlm bridge",
70
-            )
71
-
72
-        kinds_allowed = set(spec.include_kinds)
73
-        eligible = [s for s in ctx.sections if s.kind in kinds_allowed]
74
-        if len(eligible) < 2:
75
-            return ProbeResult(
76
-                name=spec.name,
77
-                kind=spec.kind,
78
-                verdict=Verdict.SKIP,
79
-                score=None,
80
-                message=(
81
-                    f"need ≥2 eligible sections for leak-check; got {len(eligible)} "
82
-                    f"(kinds={spec.include_kinds})"
83
-                ),
84
-            )
85
-
86
-        # Pre-compute per-section base and ft NLL-per-token to avoid
87
-        # re-running the forward pass for leak-checks.
88
-        base_nll: dict[str, float] = {}
89
-        ft_nll: dict[str, float] = {}
90
-        with ctx.backend.as_base() as base_view:
91
-            for s in eligible:
92
-                base_nll[s.id] = _section_nll(s, base_view, spec.max_prose_chars)
93
-        with ctx.backend.as_finetuned() as ft_view:
94
-            for s in eligible:
95
-                ft_nll[s.id] = _section_nll(s, ft_view, spec.max_prose_chars)
96
-
97
-        per_section: list[dict[str, float | str | bool]] = []
98
-        passing = 0
99
-        effective_scores: list[float] = []
100
-        for s in eligible:
101
-            others = [o for o in eligible if o.id != s.id]
102
-            own_lift = _relative_lift(base_nll[s.id], ft_nll[s.id])
103
-            leak_lift = statistics.fmean(
104
-                _relative_lift(base_nll[o.id], ft_nll[o.id]) for o in others
105
-            )
106
-            effective = own_lift - leak_lift
107
-            effective_scores.append(effective)
108
-            did_pass = effective >= spec.per_section_threshold
109
-            passing += int(did_pass)
110
-            per_section.append(
111
-                {
112
-                    "section_id": s.id,
113
-                    "kind": s.kind,
114
-                    "tag": s.tag or "",
115
-                    "base_nll": base_nll[s.id],
116
-                    "ft_nll": ft_nll[s.id],
117
-                    "own_lift": own_lift,
118
-                    "leak_lift": leak_lift,
119
-                    "effective_sis": effective,
120
-                    "passed": did_pass,
121
-                }
122
-            )
123
-
124
-        passing_frac = passing / len(eligible)
125
-        verdict = Verdict.PASS if passing_frac >= spec.assert_passing_section_frac else Verdict.FAIL
126
-        score = passing_frac
127
-        return ProbeResult(
128
-            name=spec.name,
129
-            kind=spec.kind,
130
-            verdict=verdict,
131
-            score=score,
132
-            raw=statistics.fmean(effective_scores),
133
-            evidence={
134
-                "per_section": per_section,
135
-                "num_sections": len(eligible),
136
-                "passing_frac": passing_frac,
137
-                "per_section_threshold": spec.per_section_threshold,
138
-                "weight": spec.weight,
139
-            },
140
-            message=(
141
-                f"{passing}/{len(eligible)} sections cleared "
142
-                f"effective_sis≥{spec.per_section_threshold:.2f} (mean={statistics.fmean(effective_scores):+.3f})"
143
-            ),
144
-        )
145
-
146
-
147
-def _section_nll(s: Section, view: ScoringBackend, max_prose_chars: int) -> float:
148
-    """Average NLL per token for the section's content under ``view``."""
149
-    if s.kind == "prose":
150
-        return _prose_nll(s.content[:max_prose_chars], view)
151
-    if s.kind == "instruction":
152
-        if not s.probes:
153
-            return _prose_nll(s.content[:max_prose_chars], view)
154
-        return statistics.fmean(
155
-            -view.logprob_of(p.prompt, p.gold) / max(_token_estimate(p.gold), 1) for p in s.probes
156
-        )
157
-    if s.kind == "preference":
158
-        if not s.preferences:
159
-            return _prose_nll(s.content[:max_prose_chars], view)
160
-        return statistics.fmean(
161
-            -view.logprob_of(p.prompt, p.chosen) / max(_token_estimate(p.chosen), 1)
162
-            for p in s.preferences
163
-        )
164
-    raise ValueError(f"unknown section kind: {s.kind!r}")
165
-
166
-
167
-def _prose_nll(text: str, view: ScoringBackend) -> float:
168
-    """Negative-mean-logprob over ``text``. Returns 0 for empty input."""
169
-    if not text.strip():
170
-        return 0.0
171
-    r = view.rolling_logprob(text)
172
-    return -r.mean_logprob
173
-
174
-
175
-def _relative_lift(base_nll: float, ft_nll: float) -> float:
176
-    """``(base - ft) / base``. Positive → ft is lower-PPL than base.
177
-
178
-    Falls back to an absolute delta when ``base`` is pathological
179
-    (zero or negative), so the probe doesn't crash on degenerate
180
-    inputs.
181
-    """
182
-    if base_nll <= 0.0:
183
-        return float(base_nll - ft_nll)
184
-    return float((base_nll - ft_nll) / base_nll)
185
-
186
-
187
-def _token_estimate(s: str) -> int:
188
-    """Approximate tokens for normalization. Good enough for SentencePiece-ish vocabs."""
189
-    return max(1, len(s) // 4)
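A two-section worked example (NLLs invented):

    own_a  = _relative_lift(3.0, 2.40)   # 0.20: section A's own content got 20% easier
    leak_a = _relative_lift(3.0, 2.91)   # 0.03: the lift that leaked into section B
    effective_a = own_a - leak_a         # 0.17 >= 0.05 default threshold -> section PASS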
sway/src/dlm_sway/probes/style_fingerprint.pydeleted
@@ -1,179 +0,0 @@
1
-"""C1 StyleFingerprint — does ft prose *read* like the doc?
2
-
3
-Generates base and ft completions from a set of stylistic prompts,
4
-extracts a 6-dimensional fingerprint from each, and measures how far the ft
5
-fingerprint has shifted **toward** the training document's own
6
-fingerprint, relative to the base.
7
-
8
-We compute the fingerprint with numpy-only features so the probe works
9
-out of the box without spaCy/textstat. The optional ``style`` extra
10
-upgrades the fingerprint with passive-voice rate and POS-entropy in a
11
-later milestone; the numeric contract — a non-negative vector per text
12
-— is stable across that upgrade.
13
-
14
-Signal: ``style_shift = cos(ft_fp - base_fp, doc_fp - base_fp)`` in
15
-fingerprint space. Positive values mean ft has moved *toward* the
16
-doc's style; negative values mean it moved *away* (a bad sign);
17
-near-zero means no stylistic shift detectable.
18
-"""
19
-
20
-from __future__ import annotations
21
-
22
-import re
23
-import statistics
24
-from typing import Literal
25
-
26
-import numpy as np
27
-from numpy.typing import NDArray
28
-from pydantic import Field
29
-
30
-from dlm_sway.core.result import ProbeResult, Verdict
31
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
32
-
33
-_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
34
-_PARAGRAPH_SPLIT = re.compile(r"\n\s*\n")
35
-_WORD_RE = re.compile(r"\b[A-Za-z][A-Za-z'-]*\b")
36
-_PUNCTS = set(".,:;!?-—()[]\"'/")
37
-
38
-
39
-def fingerprint(text: str) -> NDArray[np.float64]:
40
-    """Return a 6-dim stylistic fingerprint for ``text``.
41
-
42
-    Dimensions (all numeric, scaled to order-1):
43
-      0. mean sentence length (words)  / 30.0
44
-      1. std sentence length (words)   / 30.0
45
-      2. type-token ratio              (already in [0,1])
46
-      3. avg word length (chars)       / 10.0
47
-      4. punctuation density per char  * 10.0
48
-      5. paragraph density (1 / avg paragraph length in words) * 30.0
49
-    """
50
-    if not text.strip():
51
-        return np.zeros(6, dtype=np.float64)
52
-
53
-    sentences = [s for s in _SENTENCE_SPLIT.split(text) if s.strip()]
54
-    paragraphs = [p for p in _PARAGRAPH_SPLIT.split(text) if p.strip()]
55
-    words = _WORD_RE.findall(text)
56
-    if not words:
57
-        return np.zeros(6, dtype=np.float64)
58
-
59
-    sentence_word_counts = [len(_WORD_RE.findall(s)) for s in sentences]
60
-    sentence_word_counts = [c for c in sentence_word_counts if c > 0]
61
-    if not sentence_word_counts:
62
-        sentence_word_counts = [len(words)]
63
-
64
-    mean_sent = statistics.fmean(sentence_word_counts)
65
-    std_sent = statistics.pstdev(sentence_word_counts) if len(sentence_word_counts) > 1 else 0.0
66
-    ttr = len({w.lower() for w in words}) / len(words)
67
-    avg_word_len = statistics.fmean(len(w) for w in words)
68
-    punct_count = sum(ch in _PUNCTS for ch in text)
69
-    punct_density = punct_count / max(len(text), 1)
70
-    avg_paragraph_len = (
71
-        statistics.fmean(len(_WORD_RE.findall(p)) for p in paragraphs) if paragraphs else len(words)
72
-    )
73
-    paragraph_density = 1.0 / max(avg_paragraph_len, 1.0)
74
-
75
-    return np.asarray(
76
-        [
77
-            mean_sent / 30.0,
78
-            std_sent / 30.0,
79
-            ttr,
80
-            avg_word_len / 10.0,
81
-            punct_density * 10.0,
82
-            paragraph_density * 30.0,
83
-        ],
84
-        dtype=np.float64,
85
-    )
86
-
87
-
88
-class StyleFingerprintSpec(ProbeSpec):
89
-    kind: Literal["style_fingerprint"] = "style_fingerprint"
90
-    prompts: list[str] = Field(default_factory=list)
91
-    """Prompts used to elicit a stylistic sample from each model."""
92
-    doc_reference: str = ""
93
-    """Concatenated reference text representing the adapter's intended
94
-    style. Typically the document itself; the .dlm bridge supplies this
95
-    from ``ctx.doc_text`` when left empty."""
96
-    max_new_tokens: int = 128
97
-    assert_shift_gte: float = 0.25
98
-    """Minimum cosine shift for PASS. ``0.25`` is a deliberately
99
-    permissive default — stylistic shift is a weaker signal than
100
-    perplexity lift."""
101
-
102
-
103
-class StyleFingerprintProbe(Probe):
104
-    kind = "style_fingerprint"
105
-    spec_cls = StyleFingerprintSpec
106
-    category = "calibration"
107
-
108
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
109
-        assert isinstance(spec, StyleFingerprintSpec)
110
-        if not spec.prompts:
111
-            return ProbeResult(
112
-                name=spec.name,
113
-                kind=spec.kind,
114
-                verdict=Verdict.ERROR,
115
-                score=None,
116
-                message="no prompts provided",
117
-            )
118
-        doc_text = spec.doc_reference or (ctx.doc_text or "")
119
-        if not doc_text.strip():
120
-            return ProbeResult(
121
-                name=spec.name,
122
-                kind=spec.kind,
123
-                verdict=Verdict.SKIP,
124
-                score=None,
125
-                message="no doc_reference (inline or from ctx.doc_text)",
126
-            )
127
-
128
-        base_samples: list[str] = []
129
-        ft_samples: list[str] = []
130
-        for prompt in spec.prompts:
131
-            with ctx.backend.as_base() as b:
132
-                base_samples.append(
133
-                    b.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
134
-                )
135
-            with ctx.backend.as_finetuned() as f:
136
-                ft_samples.append(
137
-                    f.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
138
-                )
139
-
140
-        base_fp = fingerprint("\n".join(base_samples))
141
-        ft_fp = fingerprint("\n".join(ft_samples))
142
-        doc_fp = fingerprint(doc_text)
143
-
144
-        shift = _cosine_shift(base_fp, ft_fp, doc_fp)
145
-        verdict = Verdict.PASS if shift >= spec.assert_shift_gte else Verdict.FAIL
146
-        score = float(np.clip((shift + 1.0) / 2.0, 0.0, 1.0))
147
-
148
-        return ProbeResult(
149
-            name=spec.name,
150
-            kind=spec.kind,
151
-            verdict=verdict,
152
-            score=score,
153
-            raw=shift,
154
-            evidence={
155
-                "base_fp": base_fp.tolist(),
156
-                "ft_fp": ft_fp.tolist(),
157
-                "doc_fp": doc_fp.tolist(),
158
-                "style_shift": shift,
159
-                "weight": spec.weight,
160
-            },
161
-            message=(
162
-                f"style_shift={shift:+.2f} "
163
-                f"({'toward' if shift > 0 else 'away from'} doc, "
164
-                f"threshold={spec.assert_shift_gte})"
165
-            ),
166
-        )
167
-
168
-
169
-def _cosine_shift(
170
-    base: NDArray[np.float64], ft: NDArray[np.float64], doc: NDArray[np.float64]
171
-) -> float:
172
-    """Cosine between (ft - base) and (doc - base) in fingerprint space."""
173
-    a = ft - base
174
-    b = doc - base
175
-    na = float(np.linalg.norm(a))
176
-    nb = float(np.linalg.norm(b))
177
-    if na == 0.0 or nb == 0.0:
178
-        return 0.0
179
-    return float(np.dot(a, b) / (na * nb))
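The geometry in a toy fingerprint space (vectors invented):

    import numpy as np
    base = np.array([0.5, 0.2, 0.6, 0.5, 0.3, 0.4])
    doc  = np.array([0.8, 0.3, 0.5, 0.6, 0.5, 0.2])
    ft   = base + 0.4 * (doc - base)      # ft moved partway toward the doc
    _cosine_shift(base, ft, doc)          # -> 1.0: shift perfectly aligned with the doc
    _cosine_shift(base, base, doc)        # -> 0.0: no movement at all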
sway/src/dlm_sway/py.typeddeleted
sway/src/dlm_sway/suite/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""Suite plumbing: spec models, loader, runner, report, composite score."""
sway/src/dlm_sway/suite/loader.pydeleted
@@ -1,48 +0,0 @@
1
-"""Load + validate a ``sway.yaml`` into a :class:`SwaySpec`.
2
-
3
-Separated from :mod:`spec` so the data models stay trivially
4
-importable (no YAML dependency at import time for callers that
5
-construct specs programmatically).
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-from pathlib import Path
11
-from typing import Any
12
-
13
-import yaml
14
-from pydantic import ValidationError
15
-
16
-from dlm_sway.core.errors import SpecValidationError
17
-from dlm_sway.suite.spec import SwaySpec
18
-
19
-
20
-def load_spec(path: Path | str) -> SwaySpec:
21
-    """Parse ``path`` and return a validated :class:`SwaySpec`."""
22
-    resolved = Path(path).expanduser().resolve()
23
-    try:
24
-        raw_text = resolved.read_text(encoding="utf-8")
25
-    except FileNotFoundError as exc:
26
-        raise SpecValidationError(f"spec file not found: {resolved}", source=str(path)) from exc
27
-
28
-    try:
29
-        data = yaml.safe_load(raw_text)
30
-    except yaml.YAMLError as exc:
31
-        raise SpecValidationError(f"invalid YAML: {exc}", source=str(path)) from exc
32
-
33
-    if not isinstance(data, dict):
34
-        raise SpecValidationError("top-level document must be a mapping", source=str(path))
35
-    return from_dict(data, source=str(path))
36
-
37
-
38
-def from_dict(data: dict[str, Any], *, source: str | None = None) -> SwaySpec:
39
-    """Validate a dict (already parsed from YAML or JSON) as a SwaySpec."""
40
-    try:
41
-        spec = SwaySpec.model_validate(data)
42
-    except ValidationError as exc:
43
-        raise SpecValidationError(str(exc), source=source) from exc
44
-    try:
45
-        spec.check_version()
46
-    except ValueError as exc:
47
-        raise SpecValidationError(str(exc), source=source) from exc
48
-    return spec
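Typical call site (the spec filename is illustrative):

    from pathlib import Path
    from dlm_sway.core.errors import SpecValidationError
    from dlm_sway.suite.loader import load_spec

    try:
        spec = load_spec(Path("sway.yaml"))
    except SpecValidationError as exc:
        raise SystemExit(f"bad spec: {exc}")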
sway/src/dlm_sway/suite/report.pydeleted
@@ -1,249 +0,0 @@
1
-"""Report emitters: terminal (rich), JSON, JUnit XML, markdown.
2
-
3
-The terminal renderer is the one a user sees; it's the product surface.
4
-It must communicate the verdict *and* the supporting evidence without
5
-forcing the user to open the JSON.
6
-
7
-JSON is the machine-readable source of truth — same fields as the
8
-:class:`SuiteResult` dataclass but flattened for easy downstream parsing
9
-(dashboards, diff tools, history tracking).
10
-
11
-JUnit XML exists to drop into CI pipelines so ``dlm-sway gate``
12
-integrates with existing test dashboards with no extra glue.
13
-"""
14
-
15
-from __future__ import annotations
16
-
17
-import json
18
-import xml.etree.ElementTree as ET
19
-from io import StringIO
20
-from typing import Any
21
-
22
-from rich.console import Console
23
-from rich.panel import Panel
24
-from rich.table import Table
25
-from rich.text import Text
26
-
27
-from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict
28
-
29
-_VERDICT_STYLE = {
30
-    Verdict.PASS: "bold green",
31
-    Verdict.FAIL: "bold red",
32
-    Verdict.WARN: "bold yellow",
33
-    Verdict.SKIP: "dim",
34
-    Verdict.ERROR: "bold magenta",
35
-}
36
-
37
-
38
-def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None:
39
-    """Render the report to a rich Console (stdout by default)."""
40
-    c = console or Console()
41
-
42
-    header = Text.assemble(
43
-        ("dlm-sway report — ", "bold"),
44
-        (suite.base_model_id, "cyan"),
45
-        ("  vs  ", "dim"),
46
-        (_adapter_label(suite.adapter_id), "cyan"),
47
-    )
48
-    c.print(Panel(header, expand=False, border_style="blue"))
49
-
50
-    c.print()
51
-    c.print(
52
-        Text.assemble(
53
-            ("overall: ", "bold"),
54
-            (f"{score.overall:.2f}", _score_style(score.overall)),
55
-            ("  ", ""),
56
-            (f"[ {score.band} ]", _band_style(score.band)),
57
-        )
58
-    )
59
-
60
-    # Component breakdown
61
-    comp_table = Table.grid(padding=(0, 2))
62
-    comp_table.add_column(justify="left")
63
-    comp_table.add_column(justify="right")
64
-    comp_table.add_column()
65
-    for cat in ("adherence", "attribution", "calibration", "ablation", "baseline"):
66
-        if cat not in score.components:
67
-            continue
68
-        v = score.components[cat]
69
-        comp_table.add_row(cat, f"{v:.2f}", _bar(v))
70
-    c.print(comp_table)
71
-
72
-    c.print()
73
-    # Per-probe detail
74
-    detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1))
75
-    detail.add_column("name", style="cyan")
76
-    detail.add_column("kind", style="dim")
77
-    detail.add_column("verdict")
78
-    detail.add_column("score", justify="right")
79
-    detail.add_column("raw", justify="right")
80
-    detail.add_column("z", justify="right")
81
-    detail.add_column("note", style="dim")
82
-    for r in suite.probes:
83
-        detail.add_row(
84
-            r.name,
85
-            r.kind,
86
-            Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]),
87
-            f"{r.score:.2f}" if r.score is not None else "—",
88
-            f"{r.raw:.3f}" if r.raw is not None else "—",
89
-            f"{r.z_score:+.2f}σ" if r.z_score is not None else "—",
90
-            (r.message[:80] + "…") if len(r.message) > 80 else r.message,
91
-        )
92
-    c.print(detail)
93
-
94
-    if score.findings:
95
-        c.print()
96
-        c.print(Text("top findings:", style="bold"))
97
-        for i, f in enumerate(score.findings, start=1):
98
-            c.print(f"  {i}. {f}")
99
-
100
-    c.print()
101
-    c.print(Text(f"wall: {suite.wall_seconds:.2f}s  |  sway {suite.sway_version}", style="dim"))
102
-
103
-
104
-def to_json(suite: SuiteResult, score: SwayScore) -> str:
105
-    """Serialize the suite + composite score as JSON.
106
-
107
-    Stable schema; downstream tools rely on it. Breaking changes bump a
108
-    ``schema_version`` field (not yet present — this is v0.1).
109
-    """
110
-    return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True)
111
-
112
-
113
-def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]:
114
-    return {
115
-        "schema_version": 1,
116
-        "sway_version": suite.sway_version,
117
-        "spec_path": suite.spec_path,
118
-        "base_model_id": suite.base_model_id,
119
-        "adapter_id": suite.adapter_id,
120
-        "started_at": suite.started_at.isoformat(),
121
-        "finished_at": suite.finished_at.isoformat(),
122
-        "wall_seconds": suite.wall_seconds,
123
-        "score": {
124
-            "overall": score.overall,
125
-            "band": score.band,
126
-            "components": score.components,
127
-            "weights": score.weights,
128
-            "findings": list(score.findings),
129
-        },
130
-        "null_stats": suite.null_stats,
131
-        "probes": [_probe_to_jsonable(p) for p in suite.probes],
132
-    }
133
-
134
-
135
-def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]:
136
-    return {
137
-        "name": r.name,
138
-        "kind": r.kind,
139
-        "verdict": r.verdict.value,
140
-        "score": r.score,
141
-        "raw": r.raw,
142
-        "z_score": r.z_score,
143
-        "base_value": r.base_value,
144
-        "ft_value": r.ft_value,
145
-        "evidence": r.evidence,
146
-        "message": r.message,
147
-        "duration_s": r.duration_s,
148
-    }
149
-
150
-
151
-def to_junit(suite: SuiteResult, score: SwayScore) -> str:
152
-    """Serialize as JUnit XML. One ``<testcase>`` per probe."""
153
-    testsuite = ET.Element(
154
-        "testsuite",
155
-        {
156
-            "name": "dlm-sway",
157
-            "tests": str(len(suite.probes)),
158
-            "failures": str(sum(1 for p in suite.probes if p.verdict == Verdict.FAIL)),
159
-            "errors": str(sum(1 for p in suite.probes if p.verdict == Verdict.ERROR)),
160
-            "skipped": str(sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)),
161
-            "time": f"{suite.wall_seconds:.3f}",
162
-        },
163
-    )
164
-    # Properties — the composite score and category breakdown.
165
-    props = ET.SubElement(testsuite, "properties")
166
-    ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"})
167
-    ET.SubElement(props, "property", {"name": "band", "value": score.band})
168
-    for cat, v in score.components.items():
169
-        ET.SubElement(props, "property", {"name": f"component.{cat}", "value": f"{v:.4f}"})
170
-
171
-    for r in suite.probes:
172
-        tc = ET.SubElement(
173
-            testsuite,
174
-            "testcase",
175
-            {"classname": r.kind, "name": r.name, "time": f"{r.duration_s:.3f}"},
176
-        )
177
-        if r.verdict == Verdict.FAIL:
178
-            ET.SubElement(tc, "failure", {"message": r.message or "failed"})
179
-        elif r.verdict == Verdict.ERROR:
180
-            ET.SubElement(tc, "error", {"message": r.message or "errored"})
181
-        elif r.verdict == Verdict.SKIP:
182
-            ET.SubElement(tc, "skipped", {"message": r.message or "skipped"})
183
-
184
-    return ET.tostring(testsuite, encoding="unicode")
185
-
186
-
187
-def to_markdown(suite: SuiteResult, score: SwayScore) -> str:
188
-    """A portable, CI-friendly markdown report."""
189
-    buf = StringIO()
190
-    buf.write("# dlm-sway report\n\n")
191
-    buf.write(f"**Overall:** {score.overall:.2f} (`{score.band}`)  \n")
192
-    buf.write(f"**Base:** `{suite.base_model_id}`  \n")
193
-    buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}`  \n")
194
-    buf.write(f"**Wall:** {suite.wall_seconds:.2f}s  \n\n")
195
-
196
-    buf.write("## Components\n\n")
197
-    buf.write("| category | score |\n|---|---:|\n")
198
-    for cat, v in score.components.items():
199
-        buf.write(f"| {cat} | {v:.2f} |\n")
200
-    buf.write("\n## Probes\n\n")
201
-    buf.write("| name | kind | verdict | score | note |\n|---|---|---|---:|---|\n")
202
-    for r in suite.probes:
203
-        buf.write(
204
-            f"| {r.name} | `{r.kind}` | {r.verdict.value} | "
205
-            f"{f'{r.score:.2f}' if r.score is not None else '—'} | "
206
-            f"{r.message[:60]} |\n"
207
-        )
208
-    if score.findings:
209
-        buf.write("\n## Top findings\n\n")
210
-        for f in score.findings:
211
-            buf.write(f"- {f}\n")
212
-    return buf.getvalue()
213
-
214
-
215
-# -- helpers -----------------------------------------------------------
216
-
217
-
218
-def _adapter_label(adapter_id: str) -> str:
219
-    if not adapter_id:
220
-        return "(base only)"
221
-    # Only the trailing path chunk is useful in the header.
222
-    parts = adapter_id.rstrip("/").split("/")
223
-    return "/".join(parts[-3:]) if len(parts) > 3 else adapter_id
224
-
225
-
226
-def _score_style(v: float) -> str:
227
-    if v >= 0.6:
228
-        return "bold green"
229
-    if v >= 0.3:
230
-        return "bold yellow"
231
-    return "bold red"
232
-
233
-
234
-def _band_style(band: str) -> str:
235
-    return {
236
-        "noise": "red",
237
-        "partial": "yellow",
238
-        "healthy": "green",
239
-        "suspicious": "magenta",
240
-    }.get(band, "white")
241
-
242
-
243
-def _bar(v: float, *, width: int = 10) -> str:
244
-    clamped = max(0.0, min(1.0, v))
245
-    filled = int(round(clamped * width))
246
-    return "█" * filled + "░" * (width - filled)
247
-
248
-
249
-__all__ = ["to_terminal", "to_json", "to_junit", "to_markdown"]
sway/src/dlm_sway/suite/runner.pydeleted
@@ -1,136 +0,0 @@
1
-"""Suite runner.
2
-
3
-Iterates the probe list, materializes each into a ``(Probe, Spec)`` via
4
-the registry, executes it with a :class:`~dlm_sway.probes.base.RunContext`,
5
-and assembles a :class:`~dlm_sway.core.result.SuiteResult`.
6
-
7
-Runtime contract:
8
-
9
-- Probes are executed in declaration order (not sorted, not parallelized).
10
-  The null-adapter baseline has to run before any probe that needs z-scores,
11
-  so authoring order is load-bearing.
12
-- A probe that raises is recorded as
13
-  :attr:`~dlm_sway.core.result.Verdict.ERROR` and the suite continues —
14
-  one broken probe doesn't torch the whole report.
15
-- The backend is the caller's responsibility: the runner does not build
16
-  or close it, so callers can reuse a backend across multiple suites.
17
-"""
18
-
19
-from __future__ import annotations
20
-
21
-import time
22
-
23
-from dlm_sway import __version__
24
-from dlm_sway.core.errors import ProbeError
25
-from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow
26
-from dlm_sway.core.scoring import DifferentialBackend
27
-from dlm_sway.core.sections import Section
28
-from dlm_sway.probes.base import RunContext, build_probe
29
-from dlm_sway.probes.null_adapter import NullAdapterSpec, get_null_stats
30
-from dlm_sway.suite.spec import SwaySpec
31
-
32
-
33
-def run(
34
-    spec: SwaySpec,
35
-    backend: DifferentialBackend,
36
-    *,
37
-    spec_path: str = "<memory>",
38
-    doc_text: str | None = None,
39
-    sections: tuple[Section, ...] | None = None,
40
-) -> SuiteResult:
41
-    """Execute every probe in ``spec`` against ``backend``."""
42
-    started = utcnow()
43
-    ctx = RunContext(
44
-        backend=backend,
45
-        seed=spec.defaults.seed,
46
-        top_k=spec.defaults.top_k,
47
-        sections=sections,
48
-        doc_text=doc_text,
49
-    )
50
-
51
-    results: list[ProbeResult] = []
52
-    null_stats: dict[str, dict[str, float]] = {}
53
-
54
-    for raw in spec.suite:
55
-        probe, probe_spec = build_probe(raw)
56
-        if not probe_spec.enabled:
57
-            results.append(
58
-                ProbeResult(
59
-                    name=probe_spec.name,
60
-                    kind=probe_spec.kind,
61
-                    verdict=Verdict.SKIP,
62
-                    score=None,
63
-                    message="disabled in spec",
64
-                )
65
-            )
66
-            continue
67
-
68
-        t0 = time.perf_counter()
69
-        try:
70
-            result = probe.run(probe_spec, ctx)
71
-        except ProbeError as exc:
72
-            result = ProbeResult(
73
-                name=probe_spec.name,
74
-                kind=probe_spec.kind,
75
-                verdict=Verdict.ERROR,
76
-                score=None,
77
-                message=str(exc),
78
-            )
79
-        except Exception as exc:  # noqa: BLE001 — probe impls may raise anything
80
-            result = ProbeResult(
81
-                name=probe_spec.name,
82
-                kind=probe_spec.kind,
83
-                verdict=Verdict.ERROR,
84
-                score=None,
85
-                message=f"{type(exc).__name__}: {exc}",
86
-            )
87
-        duration = time.perf_counter() - t0
88
-        # Re-stamp duration (probes don't know their own wall time).
89
-        result = _with_duration(result, duration)
90
-        results.append(result)
91
-
92
-        # Null-adapter result seeds ctx.null_stats for subsequent probes.
93
-        if isinstance(probe_spec, NullAdapterSpec) and result.evidence.get("null_stats"):
94
-            null_stats.update(result.evidence["null_stats"])
95
-            # RunContext is frozen; swap in a fresh one so later probes
96
-            # see the populated stats.
97
-            ctx = RunContext(
98
-                backend=ctx.backend,
99
-                seed=ctx.seed,
100
-                top_k=ctx.top_k,
101
-                sections=ctx.sections,
102
-                doc_text=ctx.doc_text,
103
-                null_stats=null_stats,
104
-            )
105
-
106
-    finished = utcnow()
107
-    return SuiteResult(
108
-        spec_path=spec_path,
109
-        started_at=started,
110
-        finished_at=finished,
111
-        base_model_id=spec.models.base.base,
112
-        adapter_id=str(spec.models.ft.adapter) if spec.models.ft.adapter else "",
113
-        sway_version=__version__,
114
-        probes=tuple(results),
115
-        null_stats=null_stats,
116
-    )
117
-
118
-
119
-def _with_duration(result: ProbeResult, duration: float) -> ProbeResult:
120
-    """Return a copy of ``result`` with :attr:`ProbeResult.duration_s` set."""
121
-    return ProbeResult(
122
-        name=result.name,
123
-        kind=result.kind,
124
-        verdict=result.verdict,
125
-        score=result.score,
126
-        raw=result.raw,
127
-        z_score=result.z_score,
128
-        base_value=result.base_value,
129
-        ft_value=result.ft_value,
130
-        evidence=result.evidence,
131
-        message=result.message,
132
-        duration_s=duration,
133
-    )
134
-
135
-
136
-__all__ = ["get_null_stats", "run"]
sway/src/dlm_sway/suite/score.pydeleted
@@ -1,106 +0,0 @@
1
-"""Composite :class:`~dlm_sway.core.result.SwayScore` from a suite result.
2
-
3
-The score is a weighted mean over four categories
4
-(adherence / attribution / calibration / ablation). Each category's
5
-value is the weighted mean of its pass/score values (with SKIP/ERROR
6
-excluded so a broken probe doesn't silently depress the composite).
7
-
8
-All weighting is explicit, user-overridable, and surfaced in the report
9
-alongside the number — no black-box scoring.
10
-"""
11
-
12
-from __future__ import annotations
13
-
14
-from dlm_sway.core.result import (
15
-    DEFAULT_COMPONENT_WEIGHTS,
16
-    ProbeResult,
17
-    SuiteResult,
18
-    SwayScore,
19
-    Verdict,
20
-)
21
-from dlm_sway.probes.base import registry
22
-
23
-
24
-def compute(
25
-    suite: SuiteResult,
26
-    *,
27
-    weights: dict[str, float] | None = None,
28
-) -> SwayScore:
29
-    """Fold a :class:`SuiteResult` into a :class:`SwayScore`."""
30
-    w = weights if weights is not None else dict(DEFAULT_COMPONENT_WEIGHTS)
31
-    registered = registry()
32
-
33
-    # Bucket probes by their declared category.
34
-    buckets: dict[str, list[ProbeResult]] = {k: [] for k in w}
35
-    for r in suite.probes:
36
-        if r.verdict in {Verdict.SKIP, Verdict.ERROR}:
37
-            continue
38
-        if r.score is None:
39
-            continue
40
-        probe_cls = registered.get(r.kind)
41
-        category = probe_cls.category if probe_cls is not None else "adherence"
42
-        buckets.setdefault(category, []).append(r)
43
-
44
-    component_scores: dict[str, float] = {}
45
-    for cat, probes in buckets.items():
46
-        if not probes:
47
-            component_scores[cat] = 0.0
48
-            continue
49
-        total_w = sum(max(_spec_weight(p), 0.0) for p in probes) or 1.0
50
-        weighted = sum(max(_spec_weight(p), 0.0) * (p.score or 0.0) for p in probes)
51
-        component_scores[cat] = weighted / total_w
52
-
53
-    # Fold to composite, weighted by the user's category weights, but
54
-    # ignoring components that had no contributing probes (so a
55
-    # PREFERENCE-free document doesn't get penalized for missing B3).
56
-    active_weights = {k: v for k, v in w.items() if buckets.get(k)}
57
-    total_w = sum(active_weights.values()) or 1.0
58
-    overall = sum(active_weights[k] * component_scores[k] for k in active_weights) / total_w
59
-
60
-    findings = _findings(suite, component_scores)
61
-
62
-    return SwayScore(
63
-        overall=overall,
64
-        components=component_scores,
65
-        weights=w,
66
-        band=SwayScore.band_for(overall),
67
-        findings=findings,
68
-    )
69
-
70
-
71
-def _spec_weight(result: ProbeResult) -> float:
72
-    """Recover a probe's declared weight from its ``evidence`` payload.
73
-
74
-    The runner stores ``spec.weight`` on evidence so the scorer can read
75
-    it without re-validating specs. Falls back to 1.0 when absent (older
76
-    runs, custom probes, etc.).
77
-    """
78
-    w = result.evidence.get("weight")
79
-    if isinstance(w, int | float):
80
-        return float(w)
81
-    return 1.0
82
-
83
-
84
-def _findings(suite: SuiteResult, components: dict[str, float]) -> tuple[str, ...]:
85
-    """Surface the 2–3 most diagnostic notes for the terminal report."""
86
-    notes: list[str] = []
87
-
88
-    failed = [r for r in suite.probes if r.verdict == Verdict.FAIL]
89
-    if failed:
90
-        top = failed[0]
91
-        notes.append(
92
-            f"{top.name} ({top.kind}) failed" + (f": {top.message}" if top.message else "")
93
-        )
94
-
95
-    for cat, score in components.items():
96
-        if score < 0.3 and components.get(cat, 1.0) != 0.0:
97
-            notes.append(f"{cat} score is {score:.2f} — below the noise threshold")
98
-
99
-    errors = [r for r in suite.probes if r.verdict == Verdict.ERROR]
100
-    if errors:
101
-        notes.append(f"{len(errors)} probe(s) errored — see full report for details")
102
-
103
-    return tuple(notes[:5])
104
-
105
-
106
-__all__ = ["compute"]
sway/src/dlm_sway/suite/spec.pydeleted
@@ -1,72 +0,0 @@
1
-"""Top-level ``sway.yaml`` spec models.
2
-
3
-Per-probe specs live next to their implementations in
4
-:mod:`dlm_sway.probes`. This module owns the *outer* envelope —
5
-``version``, ``models``, ``defaults``, ``suite`` — plus the runtime
6
-bind between raw probe dicts and registered probe classes.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-from typing import Annotated, Any
12
-
13
-from pydantic import BaseModel, ConfigDict, Field
14
-
15
-from dlm_sway.core.model import ModelSpec
16
-
17
-SUPPORTED_VERSION = 1
18
-
19
-
20
-class SuiteModels(BaseModel):
21
-    """Named model handles the suite references — ``base`` + ``ft``."""
22
-
23
-    model_config = ConfigDict(extra="forbid", frozen=True)
24
-
25
-    base: ModelSpec
26
-    ft: ModelSpec
27
-
28
-
29
-class SuiteDefaults(BaseModel):
30
-    """Shared defaults for the whole suite. Probes may override per-entry."""
31
-
32
-    model_config = ConfigDict(extra="forbid", frozen=True)
33
-
34
-    seed: int = 0
35
-    top_k: int = 256
36
-    differential: bool = True
37
-    """If ``False``, the runner loads base + ft as two separate models
38
-    instead of toggling on one. More memory-heavy; only useful when a
39
-    backend can't do in-place toggling."""
40
-    coverage_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6
41
-    """Minimum composite score for ``dlm-sway gate`` to pass."""
42
-
43
-
44
-class SwaySpec(BaseModel):
45
-    """Root of ``sway.yaml``."""
46
-
47
-    model_config = ConfigDict(extra="forbid", frozen=True)
48
-
49
-    version: int = 1
50
-    models: SuiteModels
51
-    defaults: SuiteDefaults = SuiteDefaults()
52
-    suite: list[dict[str, Any]] = Field(default_factory=list)
53
-    """Raw probe entries. Validated one-at-a-time by the probe registry
54
-    via :func:`dlm_sway.probes.base.build_probe` so that the set of
55
-    allowed probe kinds is an open registry rather than a closed
56
-    discriminated union."""
57
-    dlm_source: str | None = None
58
-    """Optional path to a ``.dlm`` file. When present, the runner asks
59
-    :mod:`dlm_sway.integrations.dlm.resolver` for typed sections and
60
-    hands them to probes via :attr:`RunContext.sections`. Auto-populated
61
-    by ``dlm-sway autogen``."""
62
-
63
-    def check_version(self) -> None:
64
-        """Raise ``ValueError`` if the spec version is unsupported.
65
-
66
-        Called explicitly by the loader after validation so the error
67
-        surfaces with a loader-source tag rather than a pydantic stack.
68
-        """
69
-        if self.version != SUPPORTED_VERSION:
70
-            raise ValueError(
71
-                f"unsupported sway spec version: {self.version} (this build supports {SUPPORTED_VERSION})"
72
-            )
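Editorial note: the unit tests later in this commit exercise this envelope by validating plain dicts; a minimal in-repo usage sketch (model ids and paths are placeholders):

```python
from dlm_sway.suite.spec import SwaySpec

spec = SwaySpec.model_validate(
    {
        "version": 1,
        "models": {
            "base": {"base": "some/base-model"},
            "ft": {"base": "some/base-model", "adapter": "/tmp/adapter"},
        },
        "suite": [{"name": "dk", "kind": "delta_kl", "prompts": ["p1"]}],
    }
)
spec.check_version()  # raises ValueError for anything but SUPPORTED_VERSION
assert spec.defaults.top_k == 256  # shared defaults apply when omitted
```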
sway/src/dlm_sway/visualize.pydeleted
@@ -1,137 +0,0 @@
1
-"""Optional matplotlib-based visualizations.
2
-
3
-Behind the ``viz`` extra. Three functions cover the three plots that
4
-make the sway report come alive in a notebook or saved PNG:
5
-
6
-- :func:`plot_section_sis`: per-section bar chart of effective SIS
7
-  (the flagship attribution view).
8
-- :func:`plot_adapter_ablation`: the λ-scaled divergence curve — the
9
-  sway signature plot.
10
-- :func:`plot_kl_histogram`: distribution of per-prompt KL divergences
11
-  (the raw data behind A1 DeltaKL).
12
-
13
-Each function raises :class:`~dlm_sway.core.errors.BackendNotAvailableError`
14
-with a pip hint when matplotlib isn't installed. No function writes to
15
-disk on your behalf — the caller decides (``fig.savefig(...)``).
16
-"""
17
-
18
-from __future__ import annotations
19
-
20
-from typing import Any
21
-
22
-from dlm_sway.core.errors import BackendNotAvailableError
23
-from dlm_sway.core.result import SuiteResult
24
-
25
-
26
-def _require_mpl() -> Any:
27
-    try:
28
-        import matplotlib.pyplot as plt
29
-
30
-        return plt
31
-    except ImportError as exc:
32
-        raise BackendNotAvailableError(
33
-            "visualize",
34
-            extra="viz",
35
-            hint="sway's visualization module needs matplotlib.",
36
-        ) from exc
37
-
38
-
39
-def plot_section_sis(suite: SuiteResult) -> Any:
40
-    """Render a per-section ``effective_sis`` bar chart.
41
-
42
-    Returns the matplotlib ``Figure``; the caller handles display / save.
43
-    """
44
-    plt = _require_mpl()
45
-
46
-    probe = _find_probe(suite, "section_internalization")
47
-    if probe is None or not probe.evidence.get("per_section"):
48
-        raise ValueError("suite has no section_internalization evidence to plot")
49
-
50
-    rows: list[dict[str, Any]] = list(probe.evidence["per_section"])
51
-    labels = [f"{row['tag'] or row['section_id'][:8]}\n({row['kind']})" for row in rows]
52
-    values = [float(row["effective_sis"]) for row in rows]
53
-    colors = ["#2ca02c" if row["passed"] else "#d62728" for row in rows]
54
-
55
-    fig, ax = plt.subplots(figsize=(max(6.0, 0.7 * len(rows)), 4.0))
56
-    ax.bar(range(len(rows)), values, color=colors)
57
-    ax.axhline(
58
-        float(probe.evidence.get("per_section_threshold", 0.0)),
59
-        color="gray",
60
-        linestyle="--",
61
-        linewidth=1,
62
-        label="threshold",
63
-    )
64
-    ax.set_xticks(range(len(rows)))
65
-    ax.set_xticklabels(labels, rotation=30, ha="right")
66
-    ax.set_ylabel("effective SIS")
67
-    ax.set_title("Section Internalization Score")
68
-    ax.legend(loc="best")
69
-    fig.tight_layout()
70
-    return fig
71
-
72
-
73
-def plot_adapter_ablation(suite: SuiteResult) -> Any:
74
-    """Render the signature λ-scaled divergence curve."""
75
-    plt = _require_mpl()
76
-
77
-    probe = _find_probe(suite, "adapter_ablation")
78
-    if probe is None or not probe.evidence.get("lambdas"):
79
-        raise ValueError("suite has no adapter_ablation evidence to plot")
80
-
81
-    lambdas = list(probe.evidence["lambdas"])
82
-    divs = list(probe.evidence["mean_divergence_per_lambda"])
83
-
84
-    fig, ax = plt.subplots(figsize=(7.0, 4.0))
85
-    ax.plot(lambdas, divs, marker="o", linewidth=2, color="#1f77b4")
86
-    ax.axvline(1.0, color="gray", linestyle=":", linewidth=1, label="λ=1 (trained)")
87
-    sat = probe.evidence.get("saturation_lambda")
88
-    if sat is not None:
89
-        ax.axvline(
90
-            float(sat),
91
-            color="#2ca02c",
92
-            linestyle="--",
93
-            linewidth=1,
94
-            label=f"sat λ={float(sat):.2f}",
95
-        )
96
-    ax.set_xlabel("λ (adapter scale)")
97
-    ax.set_ylabel("mean JS divergence vs λ=0")
98
-    ax.set_title(
99
-        f"Adapter Ablation (R²={float(probe.evidence.get('linearity', 0.0)):.2f}, "
100
-        f"overshoot={float(probe.evidence.get('overshoot', 0.0)):.2f})"
101
-    )
102
-    ax.legend(loc="best")
103
-    fig.tight_layout()
104
-    return fig
105
-
106
-
107
-def plot_kl_histogram(suite: SuiteResult) -> Any:
108
-    """Render the per-prompt KL distribution from a DeltaKL probe."""
109
-    plt = _require_mpl()
110
-
111
-    probe = _find_probe(suite, "delta_kl")
112
-    if probe is None or not probe.evidence.get("per_prompt"):
113
-        raise ValueError("suite has no delta_kl evidence to plot")
114
-
115
-    values = list(probe.evidence["per_prompt"])
116
-    fig, ax = plt.subplots(figsize=(7.0, 4.0))
117
-    ax.hist(values, bins=max(5, min(20, len(values) // 2)), color="#ff7f0e", edgecolor="white")
118
-    ax.axvline(
119
-        float(probe.raw or 0.0),
120
-        color="black",
121
-        linestyle="--",
122
-        linewidth=1,
123
-        label=f"mean={float(probe.raw or 0.0):.3f}",
124
-    )
125
-    ax.set_xlabel(probe.evidence.get("divergence_kind", "divergence"))
126
-    ax.set_ylabel("count")
127
-    ax.set_title("DeltaKL — per-prompt distribution")
128
-    ax.legend(loc="best")
129
-    fig.tight_layout()
130
-    return fig
131
-
132
-
133
-def _find_probe(suite: SuiteResult, kind: str) -> Any:
134
-    for p in suite.probes:
135
-        if p.kind == kind:
136
-            return p
137
-    return None
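Editorial note: since no plot function writes to disk itself, a typical caller looks like the following sketch (`suite` is assumed to be a populated `SuiteResult` loaded elsewhere):

```python
from dlm_sway.core.errors import BackendNotAvailableError
from dlm_sway.visualize import plot_adapter_ablation

try:
    fig = plot_adapter_ablation(suite)  # `suite` assumed loaded elsewhere
except BackendNotAvailableError as exc:
    print(exc)  # carries the pip-install hint for the `viz` extra
else:
    fig.savefig("ablation.png", dpi=150)  # the caller decides where output goes
```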
sway/tests/__init__.pydeleted
sway/tests/conftest.pydeleted
@@ -1,29 +0,0 @@
1
-"""Shared test fixtures.
2
-
3
-Keep the default fast-test environment offline and deterministic so unit
4
-tests stay below ~1 s per file. Integration tests override these via
5
-their own ``conftest`` when they need network access.
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-import pytest
11
-
12
-# Import the probes package once so every shipped probe registers itself
13
-# with the central registry. Tests that exercise build_probe("delta_kl",
14
-# …) rely on this.
15
-import dlm_sway.probes  # noqa: F401
16
-
17
-
18
-@pytest.fixture(autouse=True)
19
-def _offline_and_no_telemetry(monkeypatch: pytest.MonkeyPatch) -> None:
20
-    """Unit tests never touch the network.
21
-
22
-    Any backend test that needs HF should be marked ``@pytest.mark.online``
23
-    and clear these vars explicitly.
24
-    """
25
-    monkeypatch.setenv("HF_HUB_OFFLINE", "1")
26
-    monkeypatch.setenv("TRANSFORMERS_OFFLINE", "1")
27
-    monkeypatch.setenv("HF_DATASETS_OFFLINE", "1")
28
-    monkeypatch.setenv("HF_HUB_DISABLE_TELEMETRY", "1")
29
-    monkeypatch.setenv("DO_NOT_TRACK", "1")
sway/tests/fixtures/__init__.pydeleted
sway/tests/fixtures/tiny_model.pydeleted
@@ -1,53 +0,0 @@
1
-"""Tiny-model fixture for integration tests.
2
-
3
-Mirrors ``dlm.tests.fixtures.tiny_model``: session-scoped snapshot of
4
-SmolLM2-135M-Instruct, reused across the whole test run. The model is
5
-small enough (~280 MB on disk, ~600 MB in fp32 VRAM) to make integration
6
-tests feasible in CI.
7
-
8
-Tests using this fixture must carry ``@pytest.mark.slow`` and
9
-``@pytest.mark.online`` — the default test selection excludes both.
10
-"""
11
-
12
-from __future__ import annotations
13
-
14
-import os
15
-from collections.abc import Iterator
16
-from pathlib import Path
17
-
18
-import pytest
19
-
20
-TINY_MODEL_HF_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
21
-TINY_MODEL_REVISION = os.environ.get("DLM_SWAY_TINY_MODEL_REVISION", "main")
22
-
23
-
24
-def _offline_mode() -> bool:
25
-    return os.environ.get("SWAY_OFFLINE", "0") == "1"
26
-
27
-
28
-@pytest.fixture(scope="session")
29
-def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]:
30
-    """Download (or reuse) the tiny model; yield the cached directory.
31
-
32
-    Tests opt in via ``@pytest.mark.online`` — the session-wide offline
33
-    env vars are cleared inside this fixture so ``snapshot_download``
34
-    actually fetches.
35
-    """
36
-    from huggingface_hub import snapshot_download
37
-
38
-    # Clear offline env guards (set by the unit-test autouse fixture).
39
-    prior = {
40
-        k: os.environ.pop(k, None)
41
-        for k in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE")
42
-    }
43
-    try:
44
-        path = snapshot_download(
45
-            repo_id=TINY_MODEL_HF_ID,
46
-            revision=TINY_MODEL_REVISION,
47
-            local_files_only=_offline_mode(),
48
-        )
49
-        yield Path(path)
50
-    finally:
51
-        for k, v in prior.items():
52
-            if v is not None:
53
-                os.environ[k] = v
sway/tests/integration/__init__.pydeleted
sway/tests/integration/conftest.pydeleted
@@ -1,10 +0,0 @@
1
-"""Integration-test configuration.
2
-
3
-Integration tests need network + heavy deps. Re-export the tiny_model
4
-fixture here so test modules can pick it up without a long import
5
-path.
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-from tests.fixtures.tiny_model import tiny_model_dir  # noqa: F401 — re-export
sway/tests/integration/test_hf_adapter_toggle.pydeleted
@@ -1,113 +0,0 @@
1
-"""Integration test: PEFT ``disable_adapter`` actually changes logits.
2
-
3
-This is the load-bearing sanity check for the whole differential design.
4
-If a future ``peft`` release subtly breaks the disable-context semantics,
5
-sway's KL / SIS / ablation probes would all silently report zero signal.
6
-We catch that here, before the rest of the test battery runs.
7
-
8
-The test builds a random-init LoRA adapter on a tiny model so no network
9
-dependency beyond the base model snapshot itself.
10
-"""
11
-
12
-from __future__ import annotations
13
-
14
-from pathlib import Path
15
-
16
-import pytest
17
-
18
-from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
19
-from dlm_sway.core.model import ModelSpec
20
-
21
-pytestmark = [pytest.mark.slow, pytest.mark.online]
22
-
23
-
24
-def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
25
-    """Construct a LoRA adapter with random-init weights on ``base_dir``.
26
-
27
-    The weights are kept small so the toggle-delta is clear but the
28
-    adapter is structurally valid (correct ``adapter_config.json``,
29
-    tokenizer files, safetensors layout).
30
-    """
31
-    import torch
32
-    from peft import LoraConfig, get_peft_model
33
-    from transformers import AutoModelForCausalLM, AutoTokenizer
34
-
35
-    torch.manual_seed(0)
36
-
37
-    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
38
-    if tokenizer.pad_token_id is None:
39
-        tokenizer.pad_token = tokenizer.eos_token
40
-    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
41
-
42
-    cfg = LoraConfig(
43
-        r=8,
44
-        lora_alpha=16,
45
-        target_modules=["q_proj", "v_proj"],
46
-        lora_dropout=0.0,
47
-        bias="none",
48
-        task_type="CAUSAL_LM",
49
-    )
50
-    peft_model = get_peft_model(base, cfg)
51
-
52
-    # Explicitly scale lora_B out of its PEFT-default zero-init so the
53
-    # adapter actually changes outputs. Real training does this via
54
-    # gradients; we do it with a scaled normal.
55
-    with torch.no_grad():
56
-        for name, param in peft_model.named_parameters():
57
-            if "lora_B" in name:
58
-                param.copy_(torch.randn_like(param) * 0.05)
59
-
60
-    peft_model.save_pretrained(str(out_dir))
61
-    tokenizer.save_pretrained(str(out_dir))
62
-
63
-
64
-@pytest.fixture(scope="module")
65
-def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
66
-    adapter_dir = tmp_path_factory.mktemp("random-adapter")
67
-    _build_random_lora_adapter(tiny_model_dir, adapter_dir)
68
-    return adapter_dir
69
-
70
-
71
-def test_disable_adapter_changes_logits(tiny_model_dir: Path, random_adapter: Path) -> None:
72
-    """The keystone invariant: base view ≠ ft view on the same prompt."""
73
-    import numpy as np
74
-
75
-    backend = HuggingFaceDifferentialBackend(
76
-        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
77
-        adapter_path=random_adapter,
78
-    )
79
-    try:
80
-        prompt = "The quick brown fox"
81
-        with backend.as_base() as b:
82
-            base_dist = b.next_token_dist(prompt, top_k=32)
83
-        with backend.as_finetuned() as f:
84
-            ft_dist = f.next_token_dist(prompt, top_k=32)
85
-
86
-        # Top-k indices may shift under the adapter; take a safe shared
87
-        # subset instead of asserting identical ordering.
88
-        assert not np.array_equal(base_dist.token_ids, ft_dist.token_ids) or not np.allclose(
89
-            base_dist.logprobs, ft_dist.logprobs, atol=1e-5
90
-        ), "adapter toggle did not change next-token distribution"
91
-    finally:
92
-        backend.close()
93
-
94
-
95
-def test_roundtrip_toggle_restores_base(tiny_model_dir: Path, random_adapter: Path) -> None:
96
-    """as_base → as_finetuned → as_base yields a stable base view."""
97
-    import numpy as np
98
-
99
-    backend = HuggingFaceDifferentialBackend(
100
-        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
101
-        adapter_path=random_adapter,
102
-    )
103
-    try:
104
-        prompt = "hello"
105
-        with backend.as_base() as b:
106
-            first = b.next_token_dist(prompt, top_k=16).logprobs
107
-        with backend.as_finetuned() as f:
108
-            f.next_token_dist(prompt, top_k=16)  # toggle
109
-        with backend.as_base() as b:
110
-            second = b.next_token_dist(prompt, top_k=16).logprobs
111
-        np.testing.assert_allclose(first, second, rtol=1e-5, atol=1e-6)
112
-    finally:
113
-        backend.close()
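Editorial note: the invariant under test rides on PEFT's `disable_adapter()` context manager. In plain `peft` terms the toggle looks roughly like this sketch, assuming `peft_model` is any `PeftModel` and `inputs` a tokenized batch:

```python
import torch


def views_differ(peft_model, inputs) -> bool:
    """Compare adapter-on vs adapter-off logits on one batch."""
    peft_model.eval()
    with torch.no_grad():
        ft_logits = peft_model(**inputs).logits
        with peft_model.disable_adapter():  # base view: LoRA deltas bypassed
            base_logits = peft_model(**inputs).logits
    return not torch.allclose(base_logits, ft_logits, atol=1e-5)
```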
sway/tests/unit/__init__.pydeleted
sway/tests/unit/test_backend_dummy.pydeleted
@@ -1,102 +0,0 @@
1
-"""Tests for :class:`dlm_sway.backends.dummy.DummyDifferentialBackend`.
2
-
3
-The dummy backend is used by every downstream probe unit test, so it
4
-gets a thorough own-right test here. Also verifies the view-exclusion
5
-invariant that catches stale-view bugs in probes.
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-import numpy as np
11
-import pytest
12
-
13
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
14
-from dlm_sway.core.model import Model
15
-from dlm_sway.core.scoring import DifferentialBackend, ScoringBackend
16
-
17
-
18
-@pytest.fixture
19
-def backend() -> DummyDifferentialBackend:
20
-    base = DummyResponses(
21
-        generations={"hi": "hello"},
22
-        logprobs={("q", "a"): -3.0},
23
-    )
24
-    ft = DummyResponses(
25
-        generations={"hi": "greetings, traveler"},
26
-        logprobs={("q", "a"): -1.2},
27
-    )
28
-    return DummyDifferentialBackend(base=base, ft=ft)
29
-
30
-
31
-class TestViews:
32
-    def test_as_base_and_as_ft_yield_distinct_generations(
33
-        self, backend: DummyDifferentialBackend
34
-    ) -> None:
35
-        with backend.as_base() as b:
36
-            assert b.generate("hi", max_new_tokens=5) == "hello"
37
-        with backend.as_finetuned() as f:
38
-            assert f.generate("hi", max_new_tokens=5) == "greetings, traveler"
39
-
40
-    def test_logprob_differs_between_modes(self, backend: DummyDifferentialBackend) -> None:
41
-        with backend.as_base() as b:
42
-            base_score = b.logprob_of("q", "a")
43
-        with backend.as_finetuned() as f:
44
-            ft_score = f.logprob_of("q", "a")
45
-        assert base_score == -3.0
46
-        assert ft_score == -1.2
47
-
48
-    def test_missing_generation_raises_keyerror(self, backend: DummyDifferentialBackend) -> None:
49
-        with backend.as_base() as b, pytest.raises(KeyError, match="no canned generation"):
50
-            b.generate("unconfigured", max_new_tokens=1)
51
-
52
-    def test_missing_logprob_default(self, backend: DummyDifferentialBackend) -> None:
53
-        with backend.as_base() as b:
54
-            assert b.logprob_of("nonexistent", "target") == -10.0
55
-
56
-
57
-class TestRollingLogprob:
58
-    def test_synthesized_when_not_preseeded(self, backend: DummyDifferentialBackend) -> None:
59
-        with backend.as_base() as b:
60
-            r = b.rolling_logprob("a quick brown fox jumps")
61
-        assert r.num_tokens == 5
62
-        assert r.logprobs.size == 4
63
-        assert np.all(r.logprobs == -2.0)
64
-
65
-    def test_ft_perplexity_lower_than_base(self, backend: DummyDifferentialBackend) -> None:
66
-        text = "a quick brown fox"
67
-        with backend.as_base() as b:
68
-            pb = b.rolling_logprob(text).perplexity
69
-        with backend.as_finetuned() as f:
70
-            pf = f.rolling_logprob(text).perplexity
71
-        assert pf < pb  # synthesized ft is less perplexed → lower PPL
72
-
73
-
74
-class TestTokenDist:
75
-    def test_dists_differ_between_modes(self, backend: DummyDifferentialBackend) -> None:
76
-        with backend.as_base() as b:
77
-            base_dist = b.next_token_dist("any prompt")
78
-        with backend.as_finetuned() as f:
79
-            ft_dist = f.next_token_dist("any prompt")
80
-        assert not np.array_equal(base_dist.logprobs, ft_dist.logprobs)
81
-
82
-
83
-class TestInvariants:
84
-    def test_protocol_satisfaction(self, backend: DummyDifferentialBackend) -> None:
85
-        assert isinstance(backend, DifferentialBackend)
86
-        with backend.as_base() as view:
87
-            assert isinstance(view, Model)
88
-            assert isinstance(view, ScoringBackend)
89
-
90
-    def test_nested_views_rejected(self, backend: DummyDifferentialBackend) -> None:
91
-        with backend.as_base(), pytest.raises(RuntimeError, match="view already active"):
92
-            with backend.as_finetuned():
93
-                pass
94
-
95
-    def test_sequential_views_fine(self, backend: DummyDifferentialBackend) -> None:
96
-        # Must be able to re-enter after exiting — common pattern in probes.
97
-        with backend.as_base() as b:
98
-            b.logprob_of("q", "a")
99
-        with backend.as_finetuned() as f:
100
-            f.logprob_of("q", "a")
101
-        with backend.as_base() as b:
102
-            b.logprob_of("q", "a")
sway/tests/unit/test_backend_registry.pydeleted
@@ -1,133 +0,0 @@
1
-"""Tests for the backend registry in ``dlm_sway.backends``.
2
-
3
-The registry is the single place that maps a ModelSpec to a concrete
4
-backend. These tests check the error paths — actually materializing an
5
-HF backend requires model weights and is covered by the integration
6
-suite.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-from pathlib import Path
12
-
13
-import pytest
14
-
15
-from dlm_sway.backends import build
16
-from dlm_sway.core.errors import BackendNotAvailableError, SpecValidationError
17
-from dlm_sway.core.model import ModelSpec
18
-
19
-
20
-class TestRegistry:
21
-    def test_dummy_rejected_via_build(self) -> None:
22
-        with pytest.raises(SpecValidationError, match="kind='dummy'"):
23
-            build(ModelSpec(base="x", kind="dummy"))
24
-
25
-    def test_hf_requires_adapter(self) -> None:
26
-        with pytest.raises(SpecValidationError, match="adapter"):
27
-            build(ModelSpec(base="x", kind="hf"))
28
-
29
-    def test_mlx_requires_adapter(self) -> None:
30
-        with pytest.raises(SpecValidationError, match="adapter"):
31
-            build(ModelSpec(base="x", kind="mlx"))
32
-
33
-    def test_mlx_dispatch_raises_when_mlx_missing(self) -> None:
34
-        # On non-Apple-Silicon (or Apple without mlx installed), constructing
35
-        # the MLX backend raises BackendNotAvailableError with a pip hint.
36
-        # We skip this assertion if mlx happens to be installed.
37
-        import importlib.util
38
-
39
-        if importlib.util.find_spec("mlx") is not None:
40
-            pytest.skip("mlx is installed; error path not exercised")
41
-        with pytest.raises(BackendNotAvailableError) as exc_info:
42
-            build(ModelSpec(base="x", kind="mlx", adapter=Path("/tmp/a")))
43
-        assert exc_info.value.backend == "mlx"
44
-
45
-    def test_custom_requires_entry_point(self) -> None:
46
-        with pytest.raises(SpecValidationError, match="entry_point"):
47
-            build(ModelSpec(base="x", kind="custom", adapter=Path("/tmp/a")))
48
-
49
-    def test_custom_validates_entry_point_shape(self) -> None:
50
-        with pytest.raises(SpecValidationError, match="pkg.module:ClassName"):
51
-            build(
52
-                ModelSpec(
53
-                    base="x",
54
-                    kind="custom",
55
-                    entry_point="not_a_valid_entry_point",
56
-                    adapter=Path("/tmp/a"),
57
-                )
58
-            )
59
-
60
-    def test_custom_rejects_unimportable_module(self) -> None:
61
-        with pytest.raises(SpecValidationError, match="cannot import"):
62
-            build(
63
-                ModelSpec(
64
-                    base="x",
65
-                    kind="custom",
66
-                    entry_point="nonexistent_pkg_xyz:Backend",
67
-                    adapter=Path("/tmp/a"),
68
-                )
69
-            )
70
-
71
-    def test_custom_rejects_missing_class(self) -> None:
72
-        with pytest.raises(SpecValidationError, match="has no attribute"):
73
-            build(
74
-                ModelSpec(
75
-                    base="x",
76
-                    kind="custom",
77
-                    entry_point="dlm_sway:NoSuchClass",
78
-                    adapter=Path("/tmp/a"),
79
-                )
80
-            )
81
-
82
-    def test_custom_rejects_non_differential_class(self) -> None:
83
-        # A class that accepts the canonical constructor args but doesn't
84
-        # implement the protocol.
85
-        import sys
86
-        import types
87
-
88
-        class _Bad:
89
-            def __init__(self, base_spec, adapter_path):  # type: ignore[no-untyped-def]
90
-                del base_spec, adapter_path
91
-
92
-        mod = types.ModuleType("_sway_bad_mod")
93
-        mod.Bad = _Bad  # type: ignore[attr-defined]
94
-        sys.modules["_sway_bad_mod"] = mod
95
-
96
-        with pytest.raises(SpecValidationError, match="DifferentialBackend"):
97
-            build(
98
-                ModelSpec(
99
-                    base="x",
100
-                    kind="custom",
101
-                    entry_point="_sway_bad_mod:Bad",
102
-                    adapter=Path("/tmp/a"),
103
-                )
104
-            )
105
-
106
-    def test_custom_dispatches_to_valid_backend(self) -> None:
107
-        # Use the dummy backend via a custom entry point. The dummy class's
108
-        # __init__ takes different args, so we write a thin adapter class.
109
-        from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
110
-
111
-        class _AdapterBackend(DummyDifferentialBackend):
112
-            def __init__(self, base_spec, adapter_path):  # type: ignore[no-untyped-def]
113
-                super().__init__(base=DummyResponses(), ft=DummyResponses())
114
-
115
-        # Register on a throwaway module we can find by name.
116
-        import sys
117
-        import types
118
-
119
-        mod = types.ModuleType("_sway_custom_test_mod")
120
-        mod.AdapterBackend = _AdapterBackend  # type: ignore[attr-defined]
121
-        sys.modules["_sway_custom_test_mod"] = mod
122
-
123
-        backend = build(
124
-            ModelSpec(
125
-                base="x",
126
-                kind="custom",
127
-                entry_point="_sway_custom_test_mod:AdapterBackend",
128
-                adapter=Path("/tmp/a"),
129
-            )
130
-        )
131
-        from dlm_sway.core.scoring import DifferentialBackend
132
-
133
-        assert isinstance(backend, DifferentialBackend)
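Editorial note: the `pkg.module:ClassName` error paths tested above correspond to a small resolution routine; a sketch of the usual shape, with the error type simplified to `ValueError` rather than the real `SpecValidationError`:

```python
import importlib


def load_entry_point(entry_point: str) -> type:
    """Resolve 'pkg.module:ClassName' to a class, failing with clear messages."""
    module_name, sep, class_name = entry_point.partition(":")
    if not sep or not module_name or not class_name:
        raise ValueError(f"expected 'pkg.module:ClassName', got {entry_point!r}")
    try:
        module = importlib.import_module(module_name)
    except ImportError as exc:
        raise ValueError(f"cannot import {module_name!r}") from exc
    try:
        return getattr(module, class_name)
    except AttributeError as exc:
        raise ValueError(f"{module_name!r} has no attribute {class_name!r}") from exc
```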
sway/tests/unit/test_cli.pydeleted
@@ -1,92 +0,0 @@
1
-"""Smoke tests for the dlm-sway CLI.
2
-
3
-We avoid exercising backends (they need real models) and instead test
4
-arg parsing, error paths, and the read-only commands (``doctor``,
5
-``report``, and the help surface).
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-import json
11
-from pathlib import Path
12
-
13
-from typer.testing import CliRunner
14
-
15
-from dlm_sway.cli.app import app
16
-
17
-
18
-def test_version_exits_zero() -> None:
19
-    result = CliRunner().invoke(app, ["--version"])
20
-    assert result.exit_code == 0
21
-    assert "dlm-sway" in result.stdout
22
-
23
-
24
-def test_help_lists_all_commands() -> None:
25
-    result = CliRunner().invoke(app, ["--help"])
26
-    assert result.exit_code == 0
27
-    for cmd in ("run", "gate", "check", "diff", "autogen", "doctor", "report"):
28
-        assert cmd in result.stdout
29
-
30
-
31
-def test_doctor_runs() -> None:
32
-    result = CliRunner().invoke(app, ["doctor"])
33
-    assert result.exit_code == 0
34
-    # Rich applies color codes by default; assert the bare product name appears.
35
-    assert "dlm-sway" in result.stdout
36
-    assert "backends" in result.stdout
37
-
38
-
39
-def test_run_without_file_errors(tmp_path: Path) -> None:
40
-    missing = tmp_path / "nope.yaml"
41
-    result = CliRunner().invoke(app, ["run", str(missing)])
42
-    # Exit code 2 = SwayError bubble-up; 1 = typer missing-arg; accept either.
43
-    assert result.exit_code != 0
44
-
45
-
46
-def test_report_from_json(tmp_path: Path) -> None:
47
-    sample = {
48
-        "schema_version": 1,
49
-        "sway_version": "0.1.0.dev0",
50
-        "base_model_id": "base",
51
-        "adapter_id": "adp",
52
-        "score": {"overall": 0.7, "band": "healthy", "components": {}, "findings": []},
53
-        "probes": [
54
-            {
55
-                "name": "p1",
56
-                "kind": "delta_kl",
57
-                "verdict": "pass",
58
-                "score": 0.7,
59
-                "message": "ok",
60
-            },
61
-        ],
62
-    }
63
-    path = tmp_path / "result.json"
64
-    path.write_text(json.dumps(sample), encoding="utf-8")
65
-
66
-    terminal = CliRunner().invoke(app, ["report", str(path)])
67
-    assert terminal.exit_code == 0
68
-    assert "p1" in terminal.stdout
69
-
70
-    md = CliRunner().invoke(app, ["report", str(path), "--format", "md"])
71
-    assert md.exit_code == 0
72
-    assert "dlm-sway report" in md.stdout
73
-
74
-    junit = CliRunner().invoke(app, ["report", str(path), "--format", "junit"])
75
-    assert junit.exit_code == 0
76
-    assert "<testsuite" in junit.stdout
77
-
78
-
79
-def test_autogen_without_dlm_extra_exits_nonzero(tmp_path: Path, monkeypatch) -> None:  # type: ignore[no-untyped-def]
80
-    # Force the import path to fail so the CLI prints the extra hint.
81
-    import builtins
82
-
83
-    real_import = builtins.__import__
84
-
85
-    def fake_import(name: str, *args: object, **kwargs: object):  # type: ignore[no-untyped-def]
86
-        if name.startswith("dlm_sway.integrations.dlm"):
87
-            raise ImportError("simulated missing extra")
88
-        return real_import(name, *args, **kwargs)  # type: ignore[no-untyped-call]
89
-
90
-    monkeypatch.setattr(builtins, "__import__", fake_import)
91
-    result = CliRunner().invoke(app, ["autogen", "any.dlm"])
92
-    assert result.exit_code != 0
sway/tests/unit/test_determinism.pydeleted
@@ -1,47 +0,0 @@
1
-"""Tests for :mod:`dlm_sway.core.determinism`."""
2
-
3
-from __future__ import annotations
4
-
5
-import os
6
-import random
7
-
8
-import numpy as np
9
-
10
-from dlm_sway.core.determinism import DeterminismSummary, seed_everything
11
-
12
-
13
-class TestSeedEverything:
14
-    def test_returns_summary(self) -> None:
15
-        summary = seed_everything(0)
16
-        assert isinstance(summary, DeterminismSummary)
17
-        assert summary.seed == 0
18
-        assert summary.class_ in {"strict", "best_effort", "loose"}
19
-
20
-    def test_idempotent_for_stdlib_random(self) -> None:
21
-        seed_everything(42)
22
-        a = [random.random() for _ in range(5)]
23
-        seed_everything(42)
24
-        b = [random.random() for _ in range(5)]
25
-        assert a == b
26
-
27
-    def test_idempotent_for_numpy(self) -> None:
28
-        seed_everything(17)
29
-        a = np.random.rand(5)
30
-        seed_everything(17)
31
-        b = np.random.rand(5)
32
-        np.testing.assert_array_equal(a, b)
33
-
34
-    def test_cublas_workspace_set_under_strict(self) -> None:
35
-        os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
36
-        seed_everything(0, strict=True)
37
-        assert os.environ.get("CUBLAS_WORKSPACE_CONFIG") == ":4096:8"
38
-
39
-    def test_non_strict_does_not_set_cublas(self) -> None:
40
-        os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
41
-        seed_everything(0, strict=False)
42
-        # Non-strict mode must not leak the env var in either direction;
43
-        # the host environment's prior value wins.
44
-        assert (
45
-            "CUBLAS_WORKSPACE_CONFIG" not in os.environ
46
-            or os.environ["CUBLAS_WORKSPACE_CONFIG"] != ":4096:8"
47
-        )
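Editorial note: these tests pin down a conventional seeding routine; the recipe they imply looks like the following sketch (the real module additionally classifies the achieved determinism level):

```python
import os
import random

import numpy as np


def seed_everything_sketch(seed: int, *, strict: bool = False) -> None:
    if strict:
        # cuBLAS requires this before any CUDA context for determinism.
        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch

        torch.manual_seed(seed)
        if strict:
            torch.use_deterministic_algorithms(True)
    except ImportError:
        pass  # torch is optional; stdlib + numpy seeding still applies
```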
sway/tests/unit/test_divergence.pydeleted
@@ -1,73 +0,0 @@
1
-"""Tests for :mod:`dlm_sway.probes._divergence`."""
2
-
3
-from __future__ import annotations
4
-
5
-import math
6
-
7
-import numpy as np
8
-
9
-from dlm_sway.core.scoring import TokenDist
10
-from dlm_sway.probes._divergence import aligned_probs, divergence, js, kl
11
-
12
-
13
-def _dist(ids: list[int], probs: list[float], vocab: int = 100) -> TokenDist:
14
-    return TokenDist(
15
-        token_ids=np.asarray(ids, dtype=np.int64),
16
-        logprobs=np.log(np.asarray(probs, dtype=np.float32)),
17
-        vocab_size=vocab,
18
-    )
19
-
20
-
21
-class TestAligned:
22
-    def test_identical_distributions(self) -> None:
23
-        d = _dist([1, 2, 3], [0.5, 0.3, 0.2])
24
-        p, q = aligned_probs(d, d)
25
-        np.testing.assert_allclose(p, q)
26
-
27
-    def test_union_support_fills_missing(self) -> None:
28
-        base = _dist([1, 2, 3], [0.5, 0.3, 0.2])
29
-        ft = _dist([2, 3, 4], [0.4, 0.4, 0.2])
30
-        p, q = aligned_probs(base, ft)
31
-        assert p.shape == (4,)
32
-        assert abs(p.sum() - 1.0) < 1e-9
33
-        assert abs(q.sum() - 1.0) < 1e-9
34
-
35
-
36
-class TestKL:
37
-    def test_zero_when_equal(self) -> None:
38
-        p = np.array([0.5, 0.3, 0.2])
39
-        assert kl(p, p) == 0.0
40
-
41
-    def test_positive_when_different(self) -> None:
42
-        p = np.array([0.7, 0.2, 0.1])
43
-        q = np.array([0.2, 0.3, 0.5])
44
-        assert kl(p, q) > 0.0
45
-
46
-
47
-class TestJS:
48
-    def test_zero_when_equal(self) -> None:
49
-        p = np.array([0.5, 0.3, 0.2])
50
-        assert js(p, p) == 0.0
51
-
52
-    def test_symmetric(self) -> None:
53
-        p = np.array([0.7, 0.2, 0.1])
54
-        q = np.array([0.2, 0.3, 0.5])
55
-        assert math.isclose(js(p, q), js(q, p), rel_tol=1e-9)
56
-
57
-    def test_bounded_by_ln2(self) -> None:
58
-        p = np.array([1.0, 0.0])
59
-        q = np.array([0.0, 1.0])
60
-        # With zeros handled as 0·log0 = 0 this approaches ln(2).
61
-        assert js(p, q) <= math.log(2.0) + 1e-9
62
-
63
-
64
-class TestDivergenceDispatch:
65
-    def test_default_is_js(self) -> None:
66
-        d1 = _dist([1, 2], [0.6, 0.4])
67
-        d2 = _dist([1, 2], [0.3, 0.7])
68
-        assert divergence(d1, d2) == divergence(d1, d2, kind="js")
69
-
70
-    def test_kl_available(self) -> None:
71
-        d1 = _dist([1, 2], [0.6, 0.4])
72
-        d2 = _dist([1, 2], [0.3, 0.7])
73
-        assert divergence(d1, d2, kind="kl") >= 0.0
sway/tests/unit/test_dlm_bridge.pydeleted
@@ -1,252 +0,0 @@
1
-"""Tests for :mod:`dlm_sway.integrations.dlm`.
2
-
3
-The bridge imports ``dlm.*`` modules lazily. We mock those via
4
-``sys.modules`` injection so the tests run without the ``dlm-sway[dlm]``
5
-extra installed. A full end-to-end integration test against a real
6
-``.dlm`` lives under ``tests/integration/``.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-import sys
12
-import types
13
-from dataclasses import dataclass
14
-from pathlib import Path
15
-
16
-import pytest
17
-import yaml
18
-
19
-
20
-@pytest.fixture
21
-def fake_dlm(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path:
22
-    """Install a fake ``dlm`` package so the resolver can import."""
23
-
24
-    # Build synthetic parsed .dlm structure.
25
-    @dataclass
26
-    class _Frontmatter:
27
-        dlm_id: str = "01TESTULID"
28
-        base_model: str = "smollm2-135m"
29
-
30
-    @dataclass
31
-    class _Section:
32
-        section_id: str
33
-        type: str
34
-        content: str
35
-        tag: str | None = None
36
-
37
-    @dataclass
38
-    class _Parsed:
39
-        frontmatter: _Frontmatter
40
-        sections: tuple[_Section, ...]
41
-
42
-    def _parse_file(_path: Path):  # type: ignore[no-untyped-def]
43
-        return _Parsed(
44
-            frontmatter=_Frontmatter(),
45
-            sections=(
46
-                _Section(
47
-                    section_id="prose-1",
48
-                    type="PROSE",
49
-                    content="This is a prose section with some information. Further detail follows.",
50
-                ),
51
-                _Section(
52
-                    section_id="instr-1",
53
-                    type="INSTRUCTION",
54
-                    content="### Q\nWhat is X?\n\n### A\nX is a concept\n",
55
-                ),
56
-                _Section(
57
-                    section_id="pref-1",
58
-                    type="PREFERENCE",
59
-                    content="chosen/rejected triple",
60
-                ),
61
-            ),
62
-        )
63
-
64
-    # Fake ``dlm.doc.parser`` module.
65
-    dlm_pkg = types.ModuleType("dlm")
66
-    dlm_doc = types.ModuleType("dlm.doc")
67
-    dlm_doc_parser = types.ModuleType("dlm.doc.parser")
68
-    dlm_doc_parser.parse_file = _parse_file  # type: ignore[attr-defined]
69
-
70
-    # Fake ``dlm.store.paths`` that returns a resolvable path.
71
-    dlm_store = types.ModuleType("dlm.store")
72
-    dlm_store_paths = types.ModuleType("dlm.store.paths")
73
-
74
-    adapter_dir = tmp_path / "adapter_v1"
75
-    adapter_dir.mkdir()
76
-    (adapter_dir / "adapter_config.json").write_text("{}", encoding="utf-8")
77
-
78
-    class _StorePath:
79
-        def __init__(self, path: Path) -> None:
80
-            self._p = path
81
-
82
-        def resolve_current_adapter(self) -> Path:
83
-            return self._p
84
-
85
-    def _for_dlm(_dlm_id: str) -> _StorePath:
86
-        return _StorePath(adapter_dir)
87
-
88
-    dlm_store_paths.StorePath = _StorePath  # type: ignore[attr-defined]
89
-    dlm_store_paths.for_dlm = _for_dlm  # type: ignore[attr-defined]
90
-
91
-    # Fake base-model resolver — returns a stub with an ``hf_id`` attribute.
92
-    dlm_base = types.ModuleType("dlm.base_models")
93
-
94
-    @dataclass
95
-    class _BaseSpec:
96
-        hf_id: str
97
-        key: str
98
-
99
-    def _resolve(key: str) -> _BaseSpec:
100
-        return _BaseSpec(hf_id="HuggingFaceTB/SmolLM2-135M-Instruct", key=key)
101
-
102
-    dlm_base.resolve = _resolve  # type: ignore[attr-defined]
103
-
104
-    # Fake instruction / preference parsers.
105
-    dlm_data = types.ModuleType("dlm.data")
106
-    dlm_data_instr = types.ModuleType("dlm.data.instruction_parser")
107
-    dlm_data_pref = types.ModuleType("dlm.data.preference_parser")
108
-
109
-    @dataclass
110
-    class _QAPair:
111
-        question: str
112
-        answer: str
113
-
114
-    @dataclass
115
-    class _Triple:
116
-        prompt: str
117
-        chosen: str
118
-        rejected: str
119
-
120
-    def _parse_instr(body: str, *, section_id: str) -> list[_QAPair]:
121
-        del section_id
122
-        out: list[_QAPair] = []
123
-        parts = body.split("### Q")
124
-        for part in parts[1:]:
125
-            q_block, _, a_block = part.partition("### A")
126
-            q = q_block.strip()
127
-            a = a_block.strip()
128
-            if q and a:
129
-                out.append(_QAPair(question=q, answer=a))
130
-        return out
131
-
132
-    def _parse_pref(body: str, *, section_id: str) -> list[_Triple]:
133
-        del body, section_id
134
-        return [_Triple(prompt="Which?", chosen="good answer", rejected="bad answer")]
135
-
136
-    dlm_data_instr.parse_instruction_body = _parse_instr  # type: ignore[attr-defined]
137
-    dlm_data_pref.parse_preference_body = _parse_pref  # type: ignore[attr-defined]
138
-
139
-    monkeypatch.setitem(sys.modules, "dlm", dlm_pkg)
140
-    monkeypatch.setitem(sys.modules, "dlm.doc", dlm_doc)
141
-    monkeypatch.setitem(sys.modules, "dlm.doc.parser", dlm_doc_parser)
142
-    monkeypatch.setitem(sys.modules, "dlm.store", dlm_store)
143
-    monkeypatch.setitem(sys.modules, "dlm.store.paths", dlm_store_paths)
144
-    monkeypatch.setitem(sys.modules, "dlm.base_models", dlm_base)
145
-    monkeypatch.setitem(sys.modules, "dlm.data", dlm_data)
146
-    monkeypatch.setitem(sys.modules, "dlm.data.instruction_parser", dlm_data_instr)
147
-    monkeypatch.setitem(sys.modules, "dlm.data.preference_parser", dlm_data_pref)
148
-
149
-    # Return a path to a fake .dlm file (the parser won't actually read it).
150
-    dlm_file = tmp_path / "doc.dlm"
151
-    dlm_file.write_text("---\ndlm_id: 01TEST\n---\n\nbody\n", encoding="utf-8")
152
-    return dlm_file
153
-
154
-
155
-def test_resolve_dlm_maps_sections(fake_dlm: Path) -> None:
156
-    from dlm_sway.integrations.dlm.resolver import resolve_dlm
157
-
158
-    handle = resolve_dlm(fake_dlm)
159
-    assert handle.dlm_id == "01TESTULID"
160
-    assert handle.base_model == "HuggingFaceTB/SmolLM2-135M-Instruct"
161
-    assert handle.adapter_path is not None
162
-    assert handle.adapter_path.exists()
163
-    assert len(handle.sections) == 3
164
-    # Kinds normalized from uppercase dlm enum values.
165
-    assert {s.kind for s in handle.sections} == {"prose", "instruction", "preference"}
166
-    # Instruction Q/A pair survived the translation.
167
-    instr = next(s for s in handle.sections if s.kind == "instruction")
168
-    assert instr.probes
169
-    assert instr.probes[0].prompt == "What is X?"
170
-    # Preference triple too.
171
-    pref = next(s for s in handle.sections if s.kind == "preference")
172
-    assert pref.preferences
173
-    assert pref.preferences[0].chosen == "good answer"
174
-
175
-
176
-def test_resolve_without_dlm_installed(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
177
-    """resolve_dlm surfaces a SwayError when the dlm package is missing."""
178
-    # Wipe any cached dlm modules so the lazy import fails.
179
-    for mod in list(sys.modules):
180
-        if mod == "dlm" or mod.startswith("dlm."):
181
-            monkeypatch.delitem(sys.modules, mod, raising=False)
182
-
183
-    import builtins
184
-
185
-    real_import = builtins.__import__
186
-
187
-    def fake_import(name: str, *args, **kwargs):  # type: ignore[no-untyped-def]
188
-        if name.startswith("dlm."):
189
-            raise ImportError("missing extra")
190
-        return real_import(name, *args, **kwargs)
191
-
192
-    monkeypatch.setattr(builtins, "__import__", fake_import)
193
-
194
-    from dlm_sway.core.errors import SwayError
195
-    from dlm_sway.integrations.dlm.resolver import resolve_dlm
196
-
197
-    with pytest.raises(SwayError, match="dlm package not installed"):
198
-        resolve_dlm(tmp_path / "doc.dlm")
199
-
200
-
201
-def test_autogen_writes_complete_suite(fake_dlm: Path, tmp_path: Path) -> None:
202
-    from dlm_sway.integrations.dlm.autogen import write_sway_yaml
203
-
204
-    out = tmp_path / "sway.yaml"
205
-    write_sway_yaml(fake_dlm, out)
206
-    data = yaml.safe_load(out.read_text(encoding="utf-8"))
207
-
208
-    assert data["version"] == 1
209
-    assert data["models"]["base"]["base"] == "HuggingFaceTB/SmolLM2-135M-Instruct"
210
-    assert data["models"]["ft"]["adapter"] is not None
211
-    assert data["dlm_source"] == str(fake_dlm.resolve())
212
-
213
-    kinds = {entry["kind"] for entry in data["suite"]}
214
-    # The full 11-primitive battery is present (some probes may be
215
-    # skipped when data is absent, but here we have one of every
216
-    # section type).
217
-    expected = {
218
-        "null_adapter",
219
-        "delta_kl",
220
-        "adapter_revert",
221
-        "prompt_collapse",
222
-        "section_internalization",
223
-        "paraphrase_invariance",
224
-        "preference_flip",
225
-        "style_fingerprint",
226
-        "calibration_drift",
227
-        "leakage",
228
-        "adapter_ablation",
229
-    }
230
-    assert expected <= kinds, f"missing: {expected - kinds}"
231
-
232
-
233
-def test_build_spec_dict_skips_preference_when_absent() -> None:
234
-    from dlm_sway.core.sections import Section
235
-    from dlm_sway.integrations.dlm.autogen import build_spec_dict
236
-    from dlm_sway.integrations.dlm.resolver import DlmHandle
237
-
238
-    sections = (
239
-        Section(id="a", kind="prose", content="A prose section. Second sentence."),
240
-        Section(id="b", kind="prose", content="Another prose section."),
241
-    )
242
-    handle = DlmHandle(
243
-        dlm_id="x",
244
-        base_model="base",
245
-        adapter_path=Path("/tmp/adapter"),
246
-        sections=sections,
247
-        doc_text="whole document",
248
-    )
249
-    spec = build_spec_dict(handle)
250
-    kinds = {entry["kind"] for entry in spec["suite"]}
251
-    assert "preference_flip" not in kinds
252
-    assert "section_internalization" in kinds
sway/tests/unit/test_errors.pydeleted
@@ -1,55 +0,0 @@
1
-"""Tests for the exception hierarchy."""
2
-
3
-from __future__ import annotations
4
-
5
-import pytest
6
-
7
-from dlm_sway.core.errors import (
8
-    BackendNotAvailableError,
9
-    ProbeError,
10
-    SpecValidationError,
11
-    SwayError,
12
-)
13
-
14
-
15
-class TestSwayError:
16
-    def test_is_root_exception(self) -> None:
17
-        assert issubclass(SpecValidationError, SwayError)
18
-        assert issubclass(BackendNotAvailableError, SwayError)
19
-        assert issubclass(ProbeError, SwayError)
20
-
21
-    def test_raised_and_caught_as_sway_error(self) -> None:
22
-        with pytest.raises(SwayError):
23
-            raise ProbeError("delta_kl", "shape mismatch")
24
-
25
-
26
-class TestSpecValidationError:
27
-    def test_format_without_source(self) -> None:
28
-        err = SpecValidationError("unknown key 'topp'")
29
-        assert str(err) == "unknown key 'topp'"
30
-        assert err.source is None
31
-
32
-    def test_format_with_source(self) -> None:
33
-        err = SpecValidationError("unknown key 'topp'", source="sway.yaml")
34
-        assert str(err) == "sway.yaml: unknown key 'topp'"
35
-        assert err.source == "sway.yaml"
36
-
37
-
38
-class TestBackendNotAvailableError:
39
-    def test_hint_rendered_in_message(self) -> None:
40
-        err = BackendNotAvailableError("hf", extra="hf")
41
-        assert "pip install 'dlm-sway[hf]'" in str(err)
42
-        assert err.backend == "hf"
43
-        assert err.extra == "hf"
44
-
45
-    def test_appends_optional_hint(self) -> None:
46
-        err = BackendNotAvailableError("mlx", extra="mlx", hint="Apple Silicon only.")
47
-        assert "Apple Silicon only." in str(err)
48
-
49
-
50
-class TestProbeError:
51
-    def test_includes_probe_name(self) -> None:
52
-        err = ProbeError("delta_kl", "NaN logits")
53
-        assert "delta_kl" in str(err)
54
-        assert "NaN logits" in str(err)
55
-        assert err.probe == "delta_kl"
sway/tests/unit/test_model.pydeleted
@@ -1,78 +0,0 @@
1
-"""Tests for :mod:`dlm_sway.core.model`."""
2
-
3
-from __future__ import annotations
4
-
5
-from pathlib import Path
6
-
7
-import pytest
8
-from pydantic import ValidationError
9
-
10
-from dlm_sway.core.model import LoadedModel, Model, ModelSpec
11
-
12
-
13
-class TestModelSpec:
14
-    def test_defaults(self) -> None:
15
-        spec = ModelSpec(base="HuggingFaceTB/SmolLM2-135M-Instruct")
16
-        assert spec.kind == "hf"
17
-        assert spec.adapter is None
18
-        assert spec.dtype == "auto"
19
-        assert spec.device == "auto"
20
-        assert spec.trust_remote_code is False
21
-        assert spec.entry_point is None
22
-
23
-    def test_frozen(self) -> None:
24
-        spec = ModelSpec(base="x")
25
-        with pytest.raises(ValidationError):
26
-            spec.base = "y"  # type: ignore[misc]
27
-
28
-    def test_extra_fields_forbidden(self) -> None:
29
-        with pytest.raises(ValidationError) as exc_info:
30
-            ModelSpec(base="x", bogus="y")  # type: ignore[call-arg]
31
-        assert "bogus" in str(exc_info.value).lower()
32
-
33
-    def test_kind_enum(self) -> None:
34
-        ModelSpec(base="x", kind="hf")
35
-        ModelSpec(base="x", kind="mlx")
36
-        ModelSpec(base="x", kind="dummy")
37
-        ModelSpec(base="x", kind="custom", entry_point="pkg.mod:Backend")
38
-        with pytest.raises(ValidationError):
39
-            ModelSpec(base="x", kind="ollama")  # type: ignore[arg-type]
40
-
41
-    def test_adapter_coerced_to_path(self) -> None:
42
-        spec = ModelSpec(base="x", adapter="/tmp/adapter")  # type: ignore[arg-type]
43
-        assert isinstance(spec.adapter, Path)
44
-
45
-
46
-class TestLoadedModel:
47
-    def test_frozen_dataclass(self) -> None:
48
-        loaded = LoadedModel(
49
-            id="base",
50
-            spec=ModelSpec(base="x"),
51
-            model=object(),
52
-            tokenizer=object(),
53
-            meta={"device": "cpu"},
54
-        )
55
-        assert loaded.id == "base"
56
-        assert loaded.meta["device"] == "cpu"
57
-
58
-
59
-class TestModelProtocol:
60
-    def test_runtime_checkable(self) -> None:
61
-        class FakeModel:
62
-            id = "x"
63
-
64
-            def generate(
65
-                self,
66
-                prompt: str,
67
-                *,
68
-                max_new_tokens: int,
69
-                temperature: float = 0.0,
70
-                top_p: float = 1.0,
71
-                seed: int = 0,
72
-            ) -> str:
73
-                return f"{prompt}|{max_new_tokens}"
74
-
75
-            def close(self) -> None:
76
-                return None
77
-
78
-        assert isinstance(FakeModel(), Model)
sway/tests/unit/test_null_calibration.py deleted
@@ -1,123 +0,0 @@
-"""Tests for null-adapter calibration.
-
-Covers: dummy backend ``as_null_adapter`` yields a plausibly noisy
-view; ``NullAdapterProbe`` populates ``ctx.null_stats`` in a way
-that downstream probes pick up end-to-end; missing-capability SKIP path.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import NullCalibratedBackend
-from dlm_sway.probes.base import RunContext, build_probe
-from dlm_sway.suite.runner import run as run_suite
-from dlm_sway.suite.spec import SwaySpec
-
-
-def _diverging_backend() -> DummyDifferentialBackend:
-    base = DummyResponses()
-    ft = DummyResponses()
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-class TestProtocolConformance:
-    def test_dummy_is_null_calibrated(self) -> None:
-        assert isinstance(_diverging_backend(), NullCalibratedBackend)
-
-
-class TestAsNullAdapter:
-    def test_yields_perturbed_view(self) -> None:
-        backend = _diverging_backend()
-        with backend.as_base() as base:
-            base_dist = base.next_token_dist("hello")
-        with backend.as_null_adapter(seed=0) as null:
-            null_dist = null.next_token_dist("hello")
-        # Some perturbation, but bounded.
-        assert not np.allclose(base_dist.logprobs, null_dist.logprobs)
-
-    def test_different_seeds_yield_different_views(self) -> None:
-        backend = _diverging_backend()
-        with backend.as_null_adapter(seed=1) as v1:
-            d1 = v1.next_token_dist("hello")
-        with backend.as_null_adapter(seed=2) as v2:
-            d2 = v2.next_token_dist("hello")
-        assert not np.allclose(d1.logprobs, d2.logprobs)
-
-    def test_view_exclusion_enforced(self) -> None:
-        import pytest
-
-        backend = _diverging_backend()
-        with backend.as_null_adapter(seed=0), pytest.raises(RuntimeError):
-            with backend.as_base():
-                pass
-
-
-class TestProbe:
-    def test_populates_null_stats(self) -> None:
-        backend = _diverging_backend()
-        probe, spec = build_probe(
-            {
-                "name": "null",
-                "kind": "null_adapter",
-                "runs": 3,
-                "prompts": ["q1", "q2"],
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        stats = result.evidence["null_stats"]
-        assert "delta_kl" in stats
-        assert stats["delta_kl"]["n"] == 3.0
-        assert stats["delta_kl"]["std"] > 0.0  # seeded perturbations produce variance
-
-    def test_runner_threads_null_stats_to_subsequent_probes(self) -> None:
-        """End-to-end: null_adapter first → delta_kl picks up z-score path."""
-        backend = _diverging_backend()
-        raw_spec = SwaySpec.model_validate(
-            {
-                "version": 1,
-                "models": {"base": {"base": "b"}, "ft": {"base": "b", "adapter": "/tmp/a"}},
-                "suite": [
-                    {
-                        "name": "null",
-                        "kind": "null_adapter",
-                        "runs": 3,
-                        "prompts": ["p1", "p2"],
-                    },
-                    {
-                        "name": "dk",
-                        "kind": "delta_kl",
-                        "prompts": ["p1", "p2"],
-                        "assert_z_gte": -10.0,  # permissive so we pass regardless
-                    },
-                ],
-            }
-        )
-        result = run_suite(raw_spec, backend)
-        assert len(result.probes) == 2
-        null_result = result.probes[0]
-        dk_result = result.probes[1]
-        assert null_result.verdict == Verdict.PASS
-        # The delta_kl probe should have computed a z_score because null_stats was present.
-        assert dk_result.z_score is not None, (
-            "delta_kl should have z-scored against null baseline, got "
-            f"evidence={dk_result.evidence}, message={dk_result.message}"
-        )
-
-    def test_skip_when_backend_not_null_calibrated(self) -> None:
-        class _Bare:
-            def as_base(self):  # noqa: ANN202
-                raise NotImplementedError
-
-            def as_finetuned(self):  # noqa: ANN202
-                raise NotImplementedError
-
-        probe, spec = build_probe({"name": "null", "kind": "null_adapter"})
-        ctx = RunContext(backend=_Bare())  # type: ignore[arg-type]
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-        assert "NullCalibratedBackend" in result.message
sway/tests/unit/test_probe_adapter_ablation.py deleted
@@ -1,135 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.adapter_ablation`.
-
-Uses the dummy backend's lam-interpolation implementation to exercise
-the full probe path without loading a real model.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import ScalableDifferentialBackend, TokenDist
-from dlm_sway.probes.adapter_ablation import (
-    _overshoot,
-    _r_squared,
-    _saturation_lambda,
-)
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-class TestShapeMetrics:
-    def test_r_squared_perfect_linear(self) -> None:
-        x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64)
-        y = 2 * x + 0.1
-        assert _r_squared(x, y) > 0.99
-
-    def test_r_squared_zero_slope_defined(self) -> None:
-        x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64)
-        y = np.zeros_like(x)
-        # Flat y → ss_tot = 0 → defined as 1.0 (perfect fit).
-        assert _r_squared(x, y) == 1.0
-
-    def test_saturation_lambda_expected(self) -> None:
-        lambdas = np.asarray([0.0, 0.25, 0.5, 0.75, 1.0], dtype=np.float64)
-        divs = np.asarray([0.0, 0.5, 0.8, 0.95, 1.0], dtype=np.float64)
-        sat = _saturation_lambda(lambdas, divs)
-        assert sat == 0.75  # 0.95 / 1.0 = 0.95 ≥ 0.9
-
-    def test_overshoot_recovered(self) -> None:
-        lambdas = np.asarray([0.0, 0.5, 1.0, 1.25], dtype=np.float64)
-        divs = np.asarray([0.0, 0.5, 1.0, 1.15], dtype=np.float64)
-        assert _overshoot(lambdas, divs) == 1.15
-
-
-def _diverging_backend() -> DummyDifferentialBackend:
-    """Backend where base ≠ ft at a few prompts; distributions interpolate
-    smoothly under lam-blending in DummyDifferentialBackend.as_scaled_adapter."""
-    base = DummyResponses(
-        token_dists={
-            "q1": TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)),
-                vocab_size=100,
-            ),
-            "q2": TokenDist(
-                token_ids=np.array([5, 6], dtype=np.int64),
-                logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)),
-                vocab_size=100,
-            ),
-        }
-    )
-    ft = DummyResponses(
-        token_dists={
-            "q1": TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(np.array([0.2, 0.4, 0.4], dtype=np.float32)),
-                vocab_size=100,
-            ),
-            "q2": TokenDist(
-                token_ids=np.array([5, 6], dtype=np.int64),
-                logprobs=np.log(np.array([0.3, 0.7], dtype=np.float32)),
-                vocab_size=100,
-            ),
-        }
-    )
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-class TestProbe:
-    def test_backend_implements_scalable_protocol(self) -> None:
-        backend = _diverging_backend()
-        assert isinstance(backend, ScalableDifferentialBackend)
-
-    def test_probe_runs_and_emits_shape_metrics(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "abl",
-                "kind": "adapter_ablation",
-                "prompts": ["q1", "q2"],
-                "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
-                # Very permissive to tolerate the log-space blend of a
-                # tiny synthetic fixture.
-                "assert_linearity_gte": 0.3,
-                "assert_overshoot_gte": 1.0,
-            }
-        )
-        ctx = RunContext(backend=_diverging_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict in (Verdict.PASS, Verdict.FAIL)
-        assert "lambdas" in result.evidence
-        assert "mean_divergence_per_lambda" in result.evidence
-        assert len(result.evidence["mean_divergence_per_lambda"]) == 6
-        # Divergence should increase as λ grows from 0 toward ft.
-        divs = result.evidence["mean_divergence_per_lambda"]
-        # λ=0 → 0 divergence from itself. λ>0 should be non-decreasing
-        # for the bulk of the curve.
-        assert divs[-2] >= divs[0]
-
-    def test_skip_when_backend_not_scalable(self) -> None:
-        class _NonScalable:
-            def as_base(self):  # noqa: ANN202
-                raise NotImplementedError
-
-            def as_finetuned(self):  # noqa: ANN202
-                raise NotImplementedError
-
-        probe, spec = build_probe(
-            {
-                "name": "abl",
-                "kind": "adapter_ablation",
-                "prompts": ["q1"],
-            }
-        )
-        ctx = RunContext(backend=_NonScalable())  # type: ignore[arg-type]
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-        assert "ScalableDifferentialBackend" in result.message
-
-    def test_error_on_empty_prompts(self) -> None:
-        backend = _diverging_backend()
-        probe, spec = build_probe({"name": "abl", "kind": "adapter_ablation", "prompts": []})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
sway/tests/unit/test_probe_adapter_revert.py deleted
@@ -1,170 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.adapter_revert`.
-
-We stub out the embedder so these tests don't need sentence-transformers
-installed. The probe's SKIP path for the missing-extra case is
-covered separately by monkeypatching the importer.
-"""
-
-from __future__ import annotations
-
-from typing import Any
-
-import numpy as np
-import pytest
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.probes.adapter_revert import AdapterRevertProbe
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _backend(*, ft_like_base: bool = False) -> DummyDifferentialBackend:
-    base = DummyResponses(
-        generations={
-            "pp1": "cats are mammals",
-            "pp2": "cats have fur",
-        }
-    )
-    if ft_like_base:
-        ft_gens = dict(base.generations)
-    else:
-        ft_gens = {
-            "pp1": "dolphins are mammals",
-            "pp2": "dolphins are smart",
-        }
-    ft = DummyResponses(generations=ft_gens)
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-def _stub_embedder(text_to_vec: dict[str, np.ndarray]):  # type: ignore[no-untyped-def]
-    def _encode(texts: list[str]):  # type: ignore[no-untyped-def]
-        return np.stack([text_to_vec[t] for t in texts])
-
-    return _encode
-
-
-@pytest.fixture
-def monkeyed_embed(monkeypatch: pytest.MonkeyPatch) -> dict[str, np.ndarray]:
-    """Install a stub embedder with a controllable text→vec mapping.
-
-    Tests populate the dict before calling ``probe.run()``.
-    """
-    table: dict[str, np.ndarray] = {}
-    monkeypatch.setattr(
-        "dlm_sway.probes.adapter_revert._load_embedder",
-        lambda _model_id: _stub_embedder(table),  # type: ignore[arg-type]
-    )
-    return table
-
-
-class TestAdapterRevert:
-    def test_healthy_adapter_passes(self, monkeyed_embed: dict[str, np.ndarray]) -> None:
-        # gold and ft-outputs cluster together, base outputs cluster elsewhere.
-        monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0])
-        monkeyed_embed["cats have fur"] = np.array([1.0, 0.0])
-        monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0])
-        monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0])
-        monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0])  # gold
-
-        probe, spec = build_probe(
-            {
-                "name": "rev",
-                "kind": "adapter_revert",
-                "cases": [
-                    {
-                        "prompt": "anything",
-                        "gold": "the answer is dolphins",
-                        "paraphrases": ["pp1", "pp2"],
-                    }
-                ],
-                "assert_revert_rate_lt": 0.25,
-            }
-        )
-        ctx = RunContext(backend=_backend(ft_like_base=False))
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        assert result.raw == 0.0
-
-    def test_reverting_adapter_fails(self, monkeyed_embed: dict[str, np.ndarray]) -> None:
-        # ft matches base (reverted), diverges from gold.
-        monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0])
-        monkeyed_embed["cats have fur"] = np.array([1.0, 0.0])
-        monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0])  # gold
-
-        probe, spec = build_probe(
-            {
-                "name": "rev",
-                "kind": "adapter_revert",
-                "cases": [
-                    {
-                        "prompt": "anything",
-                        "gold": "the answer is dolphins",
-                        "paraphrases": ["pp1", "pp2"],
-                    }
-                ],
-            }
-        )
-        ctx = RunContext(backend=_backend(ft_like_base=True))
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.FAIL
-        assert result.raw == 1.0  # 100% revert
-
-    def test_trivially_similar_cases_dropped(self, monkeyed_embed: dict[str, np.ndarray]) -> None:
-        # base and gold are identical → drop.
-        v = np.array([1.0, 0.0])
-        monkeyed_embed["cats are mammals"] = v
-        monkeyed_embed["cats have fur"] = v
-        monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0])
-        monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0])
-        monkeyed_embed["cats are mammals too"] = v  # gold — matches base
-
-        probe, spec = build_probe(
-            {
-                "name": "rev",
-                "kind": "adapter_revert",
-                "cases": [
-                    {
-                        "prompt": "anything",
-                        "gold": "cats are mammals too",
-                        "paraphrases": ["pp1", "pp2"],
-                    }
-                ],
-            }
-        )
-        ctx = RunContext(backend=_backend(ft_like_base=False))
-        result = probe.run(spec, ctx)
-        # Both paraphrase pairs trivially similar → WARN (no separable signal).
-        assert result.verdict == Verdict.WARN
-        assert result.evidence["dropped_trivial"] == 2
-
-    def test_no_cases_errors(self, monkeyed_embed: dict[str, np.ndarray]) -> None:
-        probe, spec = build_probe({"name": "rev", "kind": "adapter_revert", "cases": []})
-        ctx = RunContext(backend=_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
-
-
-class TestMissingSemsim:
-    def test_skip_when_sentence_transformers_missing(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from dlm_sway.core.errors import BackendNotAvailableError
-
-        def raiser(_model_id: Any) -> Any:  # type: ignore[no-untyped-def]
-            raise BackendNotAvailableError(
-                "adapter_revert",
-                extra="semsim",
-                hint="adapter_revert relies on sentence embeddings.",
-            )
-
-        monkeypatch.setattr(
-            "dlm_sway.probes.adapter_revert._load_embedder",
-            raiser,  # type: ignore[arg-type]
-        )
-        probe = AdapterRevertProbe()
-        spec = probe.spec_cls(
-            name="rev",
-            cases=[{"prompt": "x", "gold": "y", "paraphrases": ["pp1"]}],  # type: ignore[list-item]
-        )
-        ctx = RunContext(backend=_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-        assert "semsim" in result.message
sway/tests/unit/test_probe_base.py deleted
@@ -1,69 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.base`."""
-
-from __future__ import annotations
-
-from typing import Literal
-
-import pytest
-
-from dlm_sway.core.errors import SpecValidationError
-from dlm_sway.core.result import ProbeResult, Verdict
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, build_probe, registry
-
-
-class _DummySpec(ProbeSpec):
-    kind: Literal["__test_dummy"] = "__test_dummy"
-    payload: str = "x"
-
-
-class _DummyProbe(Probe):
-    kind = "__test_dummy"
-    spec_cls = _DummySpec
-    category = "adherence"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        assert isinstance(spec, _DummySpec)
-        return ProbeResult(
-            name=spec.name,
-            kind=spec.kind,
-            verdict=Verdict.PASS,
-            score=1.0,
-            message=spec.payload,
-        )
-
-
-class TestRegistry:
-    def test_autoregister(self) -> None:
-        assert "__test_dummy" in registry()
-        assert registry()["__test_dummy"] is _DummyProbe
-
-    def test_duplicate_kind_rejected(self) -> None:
-        with pytest.raises(ValueError, match="duplicate probe kind"):
-
-            class _Clash(Probe):
-                kind = "__test_dummy"
-                spec_cls = _DummySpec
-
-                def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-                    raise NotImplementedError
-
-
-class TestBuildProbe:
-    def test_valid_entry(self) -> None:
-        probe, spec = build_probe({"name": "t", "kind": "__test_dummy", "payload": "hi"})
-        assert isinstance(probe, _DummyProbe)
-        assert isinstance(spec, _DummySpec)
-        assert spec.payload == "hi"
-
-    def test_unknown_kind(self) -> None:
-        with pytest.raises(SpecValidationError, match="unknown probe kind"):
-            build_probe({"name": "t", "kind": "no_such_kind"})
-
-    def test_missing_kind(self) -> None:
-        with pytest.raises(SpecValidationError, match="missing string 'kind'"):
-            build_probe({"name": "t"})
-
-    def test_extra_field_forbidden(self) -> None:
-        with pytest.raises(SpecValidationError) as exc_info:
-            build_probe({"name": "t", "kind": "__test_dummy", "bogus": "y"})
-        assert "bogus" in str(exc_info.value).lower()
sway/tests/unit/test_probe_calibration_drift.py deleted
@@ -1,57 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.calibration_drift`."""
-
-from __future__ import annotations
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.probes._calibration_pack import BUILT_IN_PACK
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _backend(delta_per_token: float) -> DummyDifferentialBackend:
-    """Apply a uniform per-token logprob delta across every item."""
-    base_lp: dict[tuple[str, str], float] = {}
-    ft_lp: dict[tuple[str, str], float] = {}
-    for prompt, gold in BUILT_IN_PACK:
-        base_lp[(prompt, gold)] = -5.0 * max(len(gold) // 4, 1)
-        ft_lp[(prompt, gold)] = base_lp[(prompt, gold)] + delta_per_token * max(len(gold) // 4, 1)
-    return DummyDifferentialBackend(
-        base=DummyResponses(logprobs=base_lp),
-        ft=DummyResponses(logprobs=ft_lp),
-    )
-
-
-class TestCalibrationDrift:
-    def test_healthy_when_no_regression(self) -> None:
-        backend = _backend(delta_per_token=0.0)  # no drift
-        probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        assert result.raw == 0.0  # zero fraction regressed
-
-    def test_fail_on_uniform_large_regression(self) -> None:
-        backend = _backend(delta_per_token=-2.0)  # every item regresses
-        probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.FAIL
-        assert result.raw == 1.0
-
-    def test_respects_items_limit(self) -> None:
-        backend = _backend(delta_per_token=0.0)
-        probe, spec = build_probe({"name": "c2", "kind": "calibration_drift", "items_limit": 5})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.evidence["total_items"] == 5
-
-    def test_worst_offenders_reported(self) -> None:
-        backend = _backend(delta_per_token=-2.0)
-        probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        worst = result.evidence["worst_offenders"]
-        assert len(worst) <= 5
-        # Each worst-offender record carries prompt/gold/delta fields.
-        if worst:
-            assert {"prompt", "gold", "delta"} <= set(worst[0].keys())
sway/tests/unit/test_probe_delta_kl.py deleted
@@ -1,124 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.delta_kl`."""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import TokenDist
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _diverging_backend() -> DummyDifferentialBackend:
-    """Base peaks tightly on token 1; ft is broad uniform. Real divergence."""
-    base = DummyResponses(
-        token_dists={
-            "q1": TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)),
-                vocab_size=100,
-            ),
-            "q2": TokenDist(
-                token_ids=np.array([5, 6], dtype=np.int64),
-                logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)),
-                vocab_size=100,
-            ),
-        }
-    )
-    ft = DummyResponses(
-        token_dists={
-            "q1": TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(np.array([0.3, 0.35, 0.35], dtype=np.float32)),
-                vocab_size=100,
-            ),
-            "q2": TokenDist(
-                token_ids=np.array([5, 6], dtype=np.int64),
-                logprobs=np.log(np.array([0.4, 0.6], dtype=np.float32)),
-                vocab_size=100,
-            ),
-        }
-    )
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-def _identical_backend() -> DummyDifferentialBackend:
-    dist = TokenDist(
-        token_ids=np.array([1, 2, 3], dtype=np.int64),
-        logprobs=np.log(np.array([0.5, 0.3, 0.2], dtype=np.float32)),
-        vocab_size=100,
-    )
-    base = DummyResponses(token_dists={"q1": dist})
-    ft = DummyResponses(token_dists={"q1": dist})
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-class TestDeltaKL:
-    def test_passes_when_distributions_diverge(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "dk",
-                "kind": "delta_kl",
-                "prompts": ["q1", "q2"],
-                "assert_mean_gte": 0.01,
-            }
-        )
-        ctx = RunContext(backend=_diverging_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        assert result.raw is not None
-        assert result.raw > 0.01
-        assert result.evidence["num_prompts"] == 2
-        assert len(result.evidence["per_prompt"]) == 2
-
-    def test_fails_when_distributions_identical(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "dk",
-                "kind": "delta_kl",
-                "prompts": ["q1"],
-                "assert_mean_gte": 0.01,
-            }
-        )
-        ctx = RunContext(backend=_identical_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.FAIL
-        assert result.raw == 0.0
-
-    def test_z_score_path_when_null_stats_present(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "dk",
-                "kind": "delta_kl",
-                "prompts": ["q1"],
-                "assert_z_gte": 2.0,
-            }
-        )
-        null_stats = {"delta_kl": {"mean": 0.01, "std": 0.01, "n": 3.0}}
-        ctx = RunContext(backend=_diverging_backend(), null_stats=null_stats)
-        result = probe.run(spec, ctx)
-        assert result.z_score is not None
-        # Our synthetic ft diverges ~0.1+, far above μ=0.01, σ=0.01 → huge z.
-        assert result.z_score > 2.0
-        assert result.verdict == Verdict.PASS
-
-    def test_error_on_empty_prompts(self) -> None:
-        probe, spec = build_probe({"name": "dk", "kind": "delta_kl", "prompts": []})
-        ctx = RunContext(backend=_identical_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
-
-    def test_kl_kind_available(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "dk",
-                "kind": "delta_kl",
-                "prompts": ["q1"],
-                "divergence": "kl",
-                "assert_mean_gte": 0.0,
-            }
-        )
-        ctx = RunContext(backend=_diverging_backend())
-        result = probe.run(spec, ctx)
-        assert result.evidence["divergence_kind"] == "kl"
sway/tests/unit/test_probe_leakage.py deleted
@@ -1,109 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.leakage`."""
-
-from __future__ import annotations
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.sections import Section
-from dlm_sway.probes.base import RunContext, build_probe
-from dlm_sway.probes.leakage import _fragility, _lcs_ratio, _perturb
-
-
-class TestLCS:
-    def test_identical_returns_one(self) -> None:
-        assert _lcs_ratio("abcdef", "abcdef") == 1.0
-
-    def test_disjoint_returns_low(self) -> None:
-        assert _lcs_ratio("abc", "xyz") < 0.3
-
-    def test_empty_returns_zero(self) -> None:
-        assert _lcs_ratio("", "abc") == 0.0
-
-
-class TestPerturb:
-    def test_typo_swaps_first_two(self) -> None:
-        assert _perturb("hello", "typo") == "ehllo"
-
-    def test_case_flip_inverts_first_alpha(self) -> None:
-        assert _perturb("abc", "case_flip") == "Abc"
-        assert _perturb("ABC", "case_flip") == "aBC"
-
-    def test_drop_punct_removes_punct(self) -> None:
-        assert _perturb("a, b. c!", "drop_punct") == "a b c"
-
-
-class TestFragility:
-    def test_zero_when_clean_zero(self) -> None:
-        assert _fragility(0.0, 0.0) == 0.0
-
-    def test_expected_when_perturbed_dropped(self) -> None:
-        import pytest
-
-        assert _fragility(0.8, 0.2) == pytest.approx(0.75)
-
-
-def _prose_section(sid: str, content: str) -> Section:
-    return Section(id=sid, kind="prose", content=content)
-
-
-def _backend(*, ft_recall: float, ft_perturbed_recall: float) -> tuple[DummyDifferentialBackend, str]:
-    """Build a backend whose ft generate() returns a controlled prefix of ``target``.
-
-    The target is a fixed 256-char slice of repeated prose, so we can
-    measure the LCS ratio against it deterministically.
-    """
-    content = ("The capital of France is Paris. " * 30).strip()
-    # Generate a fraction of the target to hit the desired recall.
-    target = content[128 : 128 + 256]
-    ft_full = target[: int(ft_recall * len(target))]
-    ft_pert = target[: int(ft_perturbed_recall * len(target))]
-
-    base = DummyResponses()
-    ft = DummyResponses(
-        generations={
-            content[:128]: ft_full,
-            # perturbations of the first 128 chars hit these three:
-            **{_perturb(content[:128], p): ft_pert for p in ("typo", "case_flip", "drop_punct")},
-        }
-    )
-    return DummyDifferentialBackend(base=base, ft=ft), content
-
-
-class TestProbe:
-    def test_skip_without_sections(self) -> None:
-        backend, _ = _backend(ft_recall=0.0, ft_perturbed_recall=0.0)
-        probe, spec = build_probe({"name": "c3", "kind": "leakage"})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-
-    def test_pass_when_no_leak(self) -> None:
-        backend, content = _backend(ft_recall=0.0, ft_perturbed_recall=0.0)
-        probe, spec = build_probe(
-            {
-                "name": "c3",
-                "kind": "leakage",
-                "prefix_chars": 128,
-                "continuation_chars": 256,
-            }
-        )
-        ctx = RunContext(backend=backend, sections=(_prose_section("a", content),))
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-
-    def test_fail_when_strong_low_fragility_leak(self) -> None:
-        backend, content = _backend(ft_recall=0.95, ft_perturbed_recall=0.9)
-        probe, spec = build_probe(
-            {
-                "name": "c3",
-                "kind": "leakage",
-                "prefix_chars": 128,
-                "continuation_chars": 256,
-                "assert_recall_lt": 0.5,
-                "min_fragility": 0.3,
-            }
-        )
-        ctx = RunContext(backend=backend, sections=(_prose_section("a", content),))
-        result = probe.run(spec, ctx)
-        # High recall + low fragility → fail.
-        assert result.verdict == Verdict.FAIL
sway/tests/unit/test_probe_paraphrase_invariance.py deleted
@@ -1,91 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.paraphrase_invariance`."""
-
-from __future__ import annotations
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _backend(*, par_lift_fraction: float, verb_lift: float = 10.0) -> DummyDifferentialBackend:
-    """Return a backend with tunable verbatim/paraphrase lifts.
-
-    The ft view adds ``verb_lift`` nats to the verbatim (Q,A) logprob
-    and ``par_lift_fraction * verb_lift`` to paraphrase logprobs.
-    """
-    base = DummyResponses(
-        logprobs={
-            ("Q", "A"): -20.0,
-            ("Q_par1", "A"): -20.0,
-            ("Q_par2", "A"): -20.0,
-        }
-    )
-    ft = DummyResponses(
-        logprobs={
-            ("Q", "A"): -20.0 + verb_lift,
-            ("Q_par1", "A"): -20.0 + par_lift_fraction * verb_lift,
-            ("Q_par2", "A"): -20.0 + par_lift_fraction * verb_lift,
-        }
-    )
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-def test_pass_when_generalizing() -> None:
-    # High paraphrase lift + high verbatim → healthy generalization.
-    backend = _backend(par_lift_fraction=0.9)
-    probe, spec = build_probe(
-        {
-            "name": "pi",
-            "kind": "paraphrase_invariance",
-            "intent": "generalize",
-            "min_verbatim_lift": 0.05,
-            "min_generalization_ratio": 0.5,
-            "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1", "Q_par2"]}],
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.PASS
-    assert result.raw is not None
-    assert result.raw >= 0.5
-
-
-def test_fails_when_only_memorized_but_intent_generalize() -> None:
-    backend = _backend(par_lift_fraction=0.0)
-    probe, spec = build_probe(
-        {
-            "name": "pi",
-            "kind": "paraphrase_invariance",
-            "intent": "generalize",
-            "min_verbatim_lift": 0.05,
-            "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}],
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.FAIL
-
-
-def test_passes_memorize_intent_when_only_memorized() -> None:
-    backend = _backend(par_lift_fraction=0.0)
-    probe, spec = build_probe(
-        {
-            "name": "pi",
-            "kind": "paraphrase_invariance",
-            "intent": "memorize",
-            "min_verbatim_lift": 0.05,
-            "max_generalization_ratio_if_memorize": 0.3,
-            "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}],
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.PASS
-
-
-def test_error_on_empty_cases() -> None:
-    probe, spec = build_probe({"name": "pi", "kind": "paraphrase_invariance", "cases": []})
-    backend = _backend(par_lift_fraction=0.9)
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.ERROR
sway/tests/unit/test_probe_preference_flip.py deleted
@@ -1,161 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.preference_flip`."""
-
-from __future__ import annotations
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.sections import Section, SectionPreference
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _backend(pairs: list[tuple[str, str, str, float, float]]) -> DummyDifferentialBackend:
-    """``pairs`` = list of (prompt, chosen, rejected, base_margin, ft_margin).
-
-    We distribute the margin half to the chosen and half (negative) to
-    the rejected, which is enough to make logprob_of(chosen)-logprob_of(rejected)
-    equal the requested margin.
-    """
-    base_lp: dict[tuple[str, str], float] = {}
-    ft_lp: dict[tuple[str, str], float] = {}
-    for prompt, chosen, rejected, base_m, ft_m in pairs:
-        base_lp[(prompt, chosen)] = base_m / 2
-        base_lp[(prompt, rejected)] = -base_m / 2
-        ft_lp[(prompt, chosen)] = ft_m / 2
-        ft_lp[(prompt, rejected)] = -ft_m / 2
-    return DummyDifferentialBackend(
-        base=DummyResponses(logprobs=base_lp),
-        ft=DummyResponses(logprobs=ft_lp),
-    )
-
-
-def test_pass_when_base_wrong_flipped() -> None:
-    backend = _backend(
-        [
-            ("p1", "good1", "bad1", -2.0, 2.0),  # base wrong, ft flips
-            ("p2", "good2", "bad2", -1.5, 1.0),  # base wrong, ft flips
-            ("p3", "good3", "bad3", -0.5, 0.8),  # base wrong, ft flips
-            ("p4", "good4", "bad4", 1.0, 2.0),  # base already right (no contribution)
-        ]
-    )
-    triples = [
-        {"prompt": p, "chosen": c, "rejected": r}
-        for p, c, r in [
-            ("p1", "good1", "bad1"),
-            ("p2", "good2", "bad2"),
-            ("p3", "good3", "bad3"),
-            ("p4", "good4", "bad4"),
-        ]
-    ]
-    probe, spec = build_probe(
-        {
-            "name": "pf",
-            "kind": "preference_flip",
-            "triples": triples,
-            "assert_flip_rate_gte": 0.7,
-            "min_triples_for_decision": 3,
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.PASS
-    assert result.raw == 1.0  # 3/3 flipped
-
-
-def test_fail_when_base_wrong_not_flipped() -> None:
-    backend = _backend(
-        [
-            ("p1", "good1", "bad1", -2.0, -1.5),  # base wrong, ft still wrong
-            ("p2", "good2", "bad2", -1.5, -1.0),  # base wrong, ft still wrong
-            ("p3", "good3", "bad3", -0.5, 0.8),  # base wrong, ft flips
-        ]
-    )
-    triples = [
-        {"prompt": p, "chosen": c, "rejected": r}
-        for p, c, r in [
-            ("p1", "good1", "bad1"),
-            ("p2", "good2", "bad2"),
-            ("p3", "good3", "bad3"),
-        ]
-    ]
-    probe, spec = build_probe(
-        {
-            "name": "pf",
-            "kind": "preference_flip",
-            "triples": triples,
-            "assert_flip_rate_gte": 0.7,
-            "min_triples_for_decision": 3,
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.FAIL
-    assert result.raw is not None
-    assert result.raw < 0.7
-
-
-def test_skip_when_no_triples_anywhere() -> None:
-    probe, spec = build_probe({"name": "pf", "kind": "preference_flip"})
-    backend = _backend([])
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.SKIP
-
-
-def test_warn_when_too_few_base_wrong() -> None:
-    backend = _backend(
-        [
-            ("p1", "good1", "bad1", 1.0, 2.0),  # base right
-            ("p2", "good2", "bad2", 0.5, 1.0),  # base right
-            ("p3", "good3", "bad3", -0.5, 0.5),  # base wrong
-        ]
-    )
-    triples = [
-        {"prompt": p, "chosen": c, "rejected": r}
-        for p, c, r in [
-            ("p1", "good1", "bad1"),
-            ("p2", "good2", "bad2"),
-            ("p3", "good3", "bad3"),
-        ]
-    ]
-    probe, spec = build_probe(
-        {
-            "name": "pf",
-            "kind": "preference_flip",
-            "triples": triples,
-            "min_triples_for_decision": 3,
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.WARN
-
-
-def test_triples_pulled_from_sections() -> None:
-    pref_section = Section(
-        id="p1",
-        kind="preference",
-        content="...",
-        preferences=(
-            SectionPreference(prompt="q1", chosen="good", rejected="bad"),
-            SectionPreference(prompt="q2", chosen="good2", rejected="bad2"),
-            SectionPreference(prompt="q3", chosen="good3", rejected="bad3"),
-        ),
-    )
-    backend = _backend(
-        [
-            ("q1", "good", "bad", -1.0, 1.0),
-            ("q2", "good2", "bad2", -1.0, 1.0),
-            ("q3", "good3", "bad3", -1.0, 1.0),
-        ]
-    )
-    probe, spec = build_probe(
-        {
-            "name": "pf",
-            "kind": "preference_flip",
-            "assert_flip_rate_gte": 0.7,
-            "min_triples_for_decision": 3,
-        }
-    )
-    ctx = RunContext(backend=backend, sections=(pref_section,))
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.PASS
sway/tests/unit/test_probe_prompt_collapse.py deleted
@@ -1,137 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.prompt_collapse`.
-
-Uses a programmable dummy backend that serves different token dists
-depending on whether the prompt contains the stuffing prefix. That's the
-cleanest way to simulate "divergence decays with context length" without
-a real model.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import TokenDist
-from dlm_sway.probes.base import RunContext, build_probe
-from dlm_sway.probes.prompt_collapse import _fit_half_life
-
-
-class TestFitHalfLife:
-    def test_exponential_recovered(self) -> None:
-        lengths = np.array([0.0, 100.0, 200.0, 300.0])
-        # y = 1.0 * exp(-x / 100)
-        y = np.exp(-lengths / 100.0)
-        h = _fit_half_life(lengths, y)
-        assert h is not None
-        import math
-
-        # True half-life = ln(2) * 100 ≈ 69.3
-        assert abs(h - math.log(2.0) * 100.0) < 1e-6
-
-    def test_returns_none_for_flat(self) -> None:
-        lengths = np.array([0.0, 100.0, 200.0])
-        y = np.array([1e-10, 1e-10, 1e-10])
-        h = _fit_half_life(lengths, y)
-        assert h is None or h > 0.0  # either None or a huge half-life — both acceptable for flat input
-
-    def test_returns_none_for_increasing(self) -> None:
-        lengths = np.array([0.0, 100.0, 200.0])
-        y = np.array([0.1, 0.3, 0.5])
-        assert _fit_half_life(lengths, y) is None
-
-
-def _programmed_backend(stuffing_sensitivity: float) -> DummyDifferentialBackend:
-    """Return a backend whose divergence decays with prompt length.
-
-    ``stuffing_sensitivity`` controls how quickly the ft distribution
-    snaps back to base as prompt length grows; lower = healthier adapter.
-    """
-    import numpy as np
-
-    base_probs = np.array([0.5, 0.3, 0.2], dtype=np.float32)
-
-    class _StuffedResponses(DummyResponses):
-        def __init__(self, is_ft: bool):
-            super().__init__()
-            self._is_ft = is_ft
-
-        # Override retrieval by subclassing the view's lookup path.
-
-    # Simpler: use explicit prompts at each expected length to seed the dict.
-    # The probe prefixes stuffing so the dummy sees the exact final prompt.
-    # We pre-build dists for each prompt we expect to see.
-    base = DummyResponses()
-    ft = DummyResponses()
-
-    # Pre-generate prompts the probe will query. The probe uses default
-    # context_lengths=[0,256,512,1024] times _STUFFING ~4 chars/tok.
-    from dlm_sway.probes.prompt_collapse import _stuffing
-
-    for ctx_len in (0, 256, 512, 1024):
-        prefix = _stuffing(ctx_len)
-        for prompt in ("q1",):
-            key = prefix + prompt
-            # Base: always tight on token 1.
-            base.token_dists[key] = TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(base_probs),
-                vocab_size=100,
-            )
-            # FT: diverges at ctx=0, decays toward base with length.
-            decay = np.exp(-ctx_len * stuffing_sensitivity)
-            ft_probs = base_probs * (1.0 - decay) + np.array([0.1, 0.45, 0.45]) * decay
-            ft_probs = ft_probs / ft_probs.sum()
-            ft.token_dists[key] = TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(ft_probs.astype(np.float32)),
-                vocab_size=100,
-            )
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-class TestPromptCollapse:
-    def test_healthy_adapter_passes(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "pc",
-                "kind": "prompt_collapse",
-                "prompts": ["q1"],
-                "context_lengths": [0, 256, 512, 1024],
-                "assert_half_life_tokens": 100,
-            }
-        )
-        ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.001))
-        result = probe.run(spec, ctx)
-        # Half-life should be well above 100 with slow decay.
-        assert result.verdict == Verdict.PASS
-        assert result.raw is not None
-        assert result.raw > 100
-
-    def test_collapsing_adapter_fails(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "pc",
-                "kind": "prompt_collapse",
-                "prompts": ["q1"],
-                "context_lengths": [0, 256, 512, 1024],
-                "assert_half_life_tokens": 500,
-            }
-        )
-        ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.02))
-        result = probe.run(spec, ctx)
-        # Fast decay → short half-life → fail against 500-token threshold.
-        assert result.verdict == Verdict.FAIL
-
-    def test_error_on_empty_prompts(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "pc",
-                "kind": "prompt_collapse",
-                "prompts": [],
-                "context_lengths": [0, 256],
-            }
-        )
-        ctx = RunContext(backend=_programmed_backend(0.001))
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
sway/tests/unit/test_probe_section_internalization.py deleted
@@ -1,94 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.section_internalization` (the flagship B1)."""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import RollingLogprob
-from dlm_sway.core.sections import Section, SectionProbe
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _rolling(mean_lp: float, n: int = 10) -> RollingLogprob:
-    lp = np.full(n - 1, mean_lp, dtype=np.float32)
-    return RollingLogprob(
-        token_ids=np.arange(n, dtype=np.int64),
-        logprobs=lp,
-        num_tokens=n,
-        total_logprob=float(lp.sum()),
-    )
-
-
-def _section(sid: str, kind: str = "prose", content: str = "content", probes=()) -> Section:
-    return Section(id=sid, kind=kind, content=content, probes=tuple(probes))  # type: ignore[arg-type]
-
-
-def test_skip_without_sections() -> None:
-    probe, spec = build_probe({"name": "sis", "kind": "section_internalization"})
-    backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.SKIP
-
-
-def test_skip_with_single_section() -> None:
-    probe, spec = build_probe({"name": "sis", "kind": "section_internalization"})
-    backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
-    ctx = RunContext(backend=backend, sections=(_section("a"),))
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.SKIP
-
-
-def test_pass_when_each_section_gets_distinct_lift() -> None:
-    # Build a dummy backend where the ft is much lower-PPL than base on
-    # every section's content — uniform lift, but leak-check math
-    # yields ~zero differential leak so all sections pass.
-    content_a = "aaa " * 10
-    content_b = "bbb " * 10
-
-    base = DummyResponses(rolling={content_a: _rolling(-3.0), content_b: _rolling(-3.0)})
-    ft = DummyResponses(rolling={content_a: _rolling(-1.0), content_b: _rolling(-2.5)})
-    backend = DummyDifferentialBackend(base=base, ft=ft)
-
-    sections = (
-        _section("a", content=content_a),
-        _section("b", content=content_b),
-    )
-    probe, spec = build_probe(
-        {
-            "name": "sis",
-            "kind": "section_internalization",
-            "per_section_threshold": 0.05,
-        }
-    )
-    ctx = RunContext(backend=backend, sections=sections)
-    result = probe.run(spec, ctx)
-    assert result.verdict in (Verdict.PASS, Verdict.FAIL)
-    assert "per_section" in result.evidence
-    assert len(result.evidence["per_section"]) == 2
-
-
-def test_instruction_uses_logprob_of() -> None:
-    # Instruction sections contribute their probe Q/A pairs; feed
-    # logprobs so the ft view comes out cheaper than base.
-    probes_a = (SectionProbe(prompt="Qa", gold="Aa"),)
-    probes_b = (SectionProbe(prompt="Qb", gold="Ab"),)
-    base = DummyResponses(logprobs={("Qa", "Aa"): -10.0, ("Qb", "Ab"): -10.0})
-    ft = DummyResponses(logprobs={("Qa", "Aa"): -3.0, ("Qb", "Ab"): -8.0})
-    backend = DummyDifferentialBackend(base=base, ft=ft)
-
-    sections = (
-        _section("a", kind="instruction", content="...", probes=probes_a),
-        _section("b", kind="instruction", content="...", probes=probes_b),
-    )
-    probe, spec = build_probe(
-        {"name": "sis", "kind": "section_internalization", "per_section_threshold": 0.05}
-    )
-    ctx = RunContext(backend=backend, sections=sections)
-    result = probe.run(spec, ctx)
-    per = result.evidence["per_section"]
-    # Section A got much more lift than B, so effective_sis(a) > effective_sis(b).
-    sis_by_id = {row["section_id"]: row["effective_sis"] for row in per}
-    assert sis_by_id["a"] > sis_by_id["b"]
sway/tests/unit/test_probe_style_fingerprint.py deleted
@@ -1,115 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.style_fingerprint`."""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.probes.base import RunContext, build_probe
-from dlm_sway.probes.style_fingerprint import fingerprint
-
-
-class TestFingerprint:
-    def test_zero_vector_for_empty(self) -> None:
-        fp = fingerprint("")
-        assert fp.shape == (6,)
-        assert np.allclose(fp, 0.0)
-
-    def test_non_zero_for_normal_text(self) -> None:
-        fp = fingerprint("This is a sentence. This is another one. A third.")
-        assert fp.shape == (6,)
-        assert fp[0] > 0  # mean sentence length
-        assert fp[2] > 0  # TTR
-        assert fp[3] > 0  # avg word length
-
-    def test_distinct_styles_distinct_fingerprints(self) -> None:
-        terse = "Go. Now. Quick."
-        verbose = (
-            "We must, with all deliberate speed and measured consideration, "
-            "proceed expeditiously towards the elaborated and carefully "
-            "constructed resolution of the foregoing matter."
-        )
-        assert not np.allclose(fingerprint(terse), fingerprint(verbose))
-
-
-def _backend_with_samples(base: list[str], ft: list[str]) -> DummyDifferentialBackend:
-    return DummyDifferentialBackend(
-        base=DummyResponses(generations={f"p{i}": s for i, s in enumerate(base)}),
-        ft=DummyResponses(generations={f"p{i}": s for i, s in enumerate(ft)}),
-    )
-
-
-class TestProbe:
-    def test_pass_when_ft_drifts_toward_doc(self) -> None:
-        base_samples = ["Short. Plain. Words."] * 2
-        ft_samples = [
-            "Wherein many clauses conjoin themselves, through extended "
-            "ruminations, unto a meandering whole of considerable length."
-        ] * 2
-        doc = (
-            "Wherein many clauses conjoin themselves, through extended "
-            "ruminations, unto a meandering whole of considerable length. "
-            "Further elaboration, no less copious, follows apace."
-        )
-        backend = _backend_with_samples(base_samples, ft_samples)
-        probe, spec = build_probe(
-            {
-                "name": "c1",
-                "kind": "style_fingerprint",
-                "prompts": ["p0", "p1"],
-                "doc_reference": doc,
-                "max_new_tokens": 32,
-                "assert_shift_gte": 0.2,
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        assert result.raw is not None
-        assert result.raw > 0.2
-
-    def test_fail_when_no_stylistic_shift(self) -> None:
-        base_samples = ["Short. Plain. Words."] * 2
-        ft_samples = ["Short. Plain. Words."] * 2
-        doc = "Wherein clauses conjoin into meandering wholes of length."
-        backend = _backend_with_samples(base_samples, ft_samples)
-        probe, spec = build_probe(
-            {
-                "name": "c1",
-                "kind": "style_fingerprint",
-                "prompts": ["p0", "p1"],
-                "doc_reference": doc,
-                "assert_shift_gte": 0.25,
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.FAIL
-
-    def test_skip_without_doc_reference(self) -> None:
-        backend = _backend_with_samples(["x"], ["y"])
-        probe, spec = build_probe(
-            {
-                "name": "c1",
-                "kind": "style_fingerprint",
-                "prompts": ["p0"],
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-
-    def test_error_on_empty_prompts(self) -> None:
-        backend = _backend_with_samples([], [])
-        probe, spec = build_probe(
-            {
-                "name": "c1",
-                "kind": "style_fingerprint",
-                "prompts": [],
-                "doc_reference": "doc",
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
sway/tests/unit/test_result.py deleted
@@ -1,82 +0,0 @@
-"""Tests for :mod:`dlm_sway.core.result`."""
-
-from __future__ import annotations
-
-from dataclasses import FrozenInstanceError
-
-import pytest
-
-from dlm_sway.core.result import (
-    DEFAULT_COMPONENT_WEIGHTS,
-    ProbeResult,
-    SuiteResult,
-    SwayScore,
-    Verdict,
-    utcnow,
-)
-
-
-class TestVerdict:
-    def test_is_str_enum(self) -> None:
-        assert Verdict.PASS.value == "pass"
-        assert str(Verdict.WARN.value) == "warn"
-
-    def test_all_expected_members(self) -> None:
-        assert {v.value for v in Verdict} == {
-            "pass",
-            "fail",
-            "warn",
-            "skip",
-            "error",
-        }
-
-
-class TestProbeResult:
-    def test_minimum_construction(self) -> None:
-        r = ProbeResult(name="t", kind="delta_kl", verdict=Verdict.PASS, score=0.82)
-        assert r.raw is None
-        assert r.evidence == {}
-        assert r.message == ""
-        assert r.duration_s == 0.0
-
-    def test_frozen(self) -> None:
-        r = ProbeResult(name="t", kind="t", verdict=Verdict.PASS, score=0.5)
-        with pytest.raises(FrozenInstanceError):
-            r.score = 0.6  # type: ignore[misc]
-
-
-class TestSuiteResult:
-    def test_wall_seconds(self) -> None:
-        from datetime import timedelta
-
-        started = utcnow()
-        finished = started + timedelta(seconds=2, milliseconds=500)
-        result = SuiteResult(
-            spec_path="sway.yaml",
-            started_at=started,
-            finished_at=finished,
-            base_model_id="b",
-            adapter_id="a",
-            sway_version="0.1.0.dev0",
-        )
-        assert result.wall_seconds == pytest.approx(2.5, abs=1e-6)
-
-
-class TestSwayScore:
-    def test_default_weights_sum_to_one(self) -> None:
-        assert abs(sum(DEFAULT_COMPONENT_WEIGHTS.values()) - 1.0) < 1e-9
-
-    def test_band_boundaries(self) -> None:
-        assert SwayScore.band_for(0.0) == "noise"
-        assert SwayScore.band_for(0.29) == "noise"
-        assert SwayScore.band_for(0.30) == "partial"
-        assert SwayScore.band_for(0.59) == "partial"
-        assert SwayScore.band_for(0.60) == "healthy"
-        assert SwayScore.band_for(0.85) == "healthy"
-        assert SwayScore.band_for(0.851) == "suspicious"
-        assert SwayScore.band_for(0.99) == "suspicious"
-
-
-def test_utcnow_is_tz_aware() -> None:
-    now = utcnow()
-    assert now.tzinfo is not None
sway/tests/unit/test_scoring.py (deleted)
@@ -1,84 +0,0 @@
-"""Tests for :mod:`dlm_sway.core.scoring`."""
-
-from __future__ import annotations
-
-import math
-
-import numpy as np
-
-from dlm_sway.core.scoring import (
-    DifferentialBackend,
-    RollingLogprob,
-    ScoringBackend,
-    TokenDist,
-)
-
-
-class TestRollingLogprob:
-    def test_empty_sequence(self) -> None:
-        r = RollingLogprob(
-            token_ids=np.array([42], dtype=np.int64),
-            logprobs=np.array([], dtype=np.float32),
-            num_tokens=1,
-            total_logprob=0.0,
-        )
-        assert r.mean_logprob == 0.0
-        assert r.perplexity == 1.0
-
-    def test_mean_and_perplexity(self) -> None:
-        # Three tokens, two transition logprobs summing to -4.0 → mean -2.0.
-        r = RollingLogprob(
-            token_ids=np.array([1, 2, 3], dtype=np.int64),
-            logprobs=np.array([-1.5, -2.5], dtype=np.float32),
-            num_tokens=3,
-            total_logprob=-4.0,
-        )
-        assert math.isclose(r.mean_logprob, -2.0, rel_tol=1e-6)
-        assert math.isclose(r.perplexity, math.exp(2.0), rel_tol=1e-6)
-
-
-class TestTokenDist:
-    def test_construction_and_defaults(self) -> None:
-        dist = TokenDist(
-            token_ids=np.array([1, 2, 3], dtype=np.int64),
-            logprobs=np.array([-0.1, -1.0, -3.0], dtype=np.float32),
-            vocab_size=50_257,
-        )
-        assert dist.tail_logprob == 0.0
-        assert dist.token_ids.shape == (3,)
-
-
-class TestProtocols:
-    def test_scoring_backend_runtime_checkable(self) -> None:
-        class FakeScoring:
-            def logprob_of(self, prompt: str, completion: str) -> float:
-                return 0.0
-
-            def rolling_logprob(self, text: str) -> RollingLogprob:
-                return RollingLogprob(
-                    token_ids=np.array([0], dtype=np.int64),
-                    logprobs=np.array([], dtype=np.float32),
-                    num_tokens=1,
-                    total_logprob=0.0,
-                )
-
-            def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
-                return TokenDist(
-                    token_ids=np.array([0], dtype=np.int64),
-                    logprobs=np.array([0.0], dtype=np.float32),
-                    vocab_size=1,
-                )
-
-        assert isinstance(FakeScoring(), ScoringBackend)
-
-    def test_differential_backend_runtime_checkable(self) -> None:
-        from contextlib import nullcontext
-
-        class FakeDiff:
-            def as_base(self):  # type: ignore[no-untyped-def]
-                return nullcontext(object())
-
-            def as_finetuned(self):  # type: ignore[no-untyped-def]
-                return nullcontext(object())
-
-        assert isinstance(FakeDiff(), DifferentialBackend)
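The two RollingLogprob tests above imply the arithmetic: the mean is taken over the transition logprobs (one fewer than the token count), and perplexity is exp(-mean), so an empty transition array yields mean 0.0 and perplexity 1.0. A sketch consistent with that contract; the field handling is an assumption and the real dlm_sway.core.scoring class may differ:

import math
from dataclasses import dataclass

import numpy as np

# Sketch inferred from the deleted tests; mirrors RollingLogprob's observed
# behaviour but the implementation details are assumptions.
@dataclass(frozen=True)
class RollingLogprobSketch:
    token_ids: np.ndarray
    logprobs: np.ndarray      # one logprob per token transition
    num_tokens: int
    total_logprob: float

    @property
    def mean_logprob(self) -> float:
        n = len(self.logprobs)
        return self.total_logprob / n if n else 0.0  # no transitions -> 0.0

    @property
    def perplexity(self) -> float:
        return math.exp(-self.mean_logprob)  # exp(0) == 1.0 for empty input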
sway/tests/unit/test_sections.py (deleted)
@@ -1,35 +0,0 @@
-"""Tests for :mod:`dlm_sway.core.sections`."""
-
-from __future__ import annotations
-
-from dlm_sway.core.sections import (
-    Section,
-    SectionPreference,
-    SectionProbe,
-    filter_kinds,
-)
-
-
-def test_default_field_types() -> None:
-    s = Section(id="abc", kind="prose", content="hello world")
-    assert s.probes == ()
-    assert s.preferences == ()
-    assert s.tag is None
-
-
-def test_filter_kinds() -> None:
-    sections = (
-        Section(id="a", kind="prose", content="x"),
-        Section(id="b", kind="instruction", content="y"),
-        Section(id="c", kind="preference", content="z"),
-    )
-    only_prose = filter_kinds(sections, ("prose",))
-    assert len(only_prose) == 1
-    assert only_prose[0].id == "a"
-
-
-def test_section_probe_and_preference() -> None:
-    p = SectionProbe(prompt="Q", gold="A")
-    assert p.prompt == "Q"
-    pref = SectionPreference(prompt="P", chosen="good", rejected="bad")
-    assert pref.chosen == "good"
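test_filter_kinds pins down only the filtering behaviour, and a one-liner satisfies it. A hypothetical sketch; the deleted dlm_sway.core.sections version may have validated kinds or preserved other invariants:

# Hypothetical filter_kinds consistent with the test above: keep only
# sections whose kind appears in the given tuple, preserving order.
def filter_kinds(sections, kinds):
    return tuple(s for s in sections if s.kind in kinds)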
sway/tests/unit/test_suite_runner.py (deleted)
@@ -1,134 +0,0 @@
-"""Tests for :mod:`dlm_sway.suite.runner`.
-
-Uses the dummy backend + ad-hoc probe classes so nothing real is loaded.
-"""
-
-from __future__ import annotations
-
-from typing import Literal
-
-import pytest
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.errors import ProbeError
-from dlm_sway.core.result import ProbeResult, Verdict
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
-from dlm_sway.suite.runner import run
-from dlm_sway.suite.spec import SwaySpec
-
-
-class _PassSpec(ProbeSpec):
-    kind: Literal["__runner_pass"] = "__runner_pass"
-
-
-class _PassProbe(Probe):
-    kind = "__runner_pass"
-    spec_cls = _PassSpec
-    category = "adherence"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.PASS, score=0.9)
-
-
-class _FailSpec(ProbeSpec):
-    kind: Literal["__runner_fail"] = "__runner_fail"
-
-
-class _FailProbe(Probe):
-    kind = "__runner_fail"
-    spec_cls = _FailSpec
-    category = "attribution"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.FAIL, score=0.1)
-
-
-class _RaiseSpec(ProbeSpec):
-    kind: Literal["__runner_raise"] = "__runner_raise"
-
-
-class _RaiseProbe(Probe):
-    kind = "__runner_raise"
-    spec_cls = _RaiseSpec
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        raise ProbeError(spec.kind, "kaboom")
-
-
-class _UnexpectedSpec(ProbeSpec):
-    kind: Literal["__runner_unexpected"] = "__runner_unexpected"
-
-
-class _UnexpectedProbe(Probe):
-    kind = "__runner_unexpected"
-    spec_cls = _UnexpectedSpec
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        raise ValueError("surprise")
-
-
-@pytest.fixture
-def backend() -> DummyDifferentialBackend:
-    return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
-
-
-def _spec(*entries: dict) -> SwaySpec:
-    return SwaySpec.model_validate(
-        {
-            "version": 1,
-            "models": {
-                "base": {"base": "b"},
-                "ft": {"base": "b", "adapter": "/tmp/a"},
-            },
-            "suite": list(entries),
-        }
-    )
-
-
-class TestRunner:
-    def test_runs_each_probe_in_order(self, backend: DummyDifferentialBackend) -> None:
-        spec = _spec(
-            {"name": "p1", "kind": "__runner_pass"},
-            {"name": "p2", "kind": "__runner_fail"},
-        )
-        result = run(spec, backend)
-        assert [r.name for r in result.probes] == ["p1", "p2"]
-        assert result.probes[0].verdict == Verdict.PASS
-        assert result.probes[1].verdict == Verdict.FAIL
-
-    def test_disabled_probe_records_skip(self, backend: DummyDifferentialBackend) -> None:
-        spec = _spec({"name": "p1", "kind": "__runner_pass", "enabled": False})
-        result = run(spec, backend)
-        assert result.probes[0].verdict == Verdict.SKIP
-        assert "disabled" in result.probes[0].message
-
-    def test_probeerror_becomes_error_verdict(self, backend: DummyDifferentialBackend) -> None:
-        spec = _spec({"name": "oops", "kind": "__runner_raise"})
-        result = run(spec, backend)
-        assert result.probes[0].verdict == Verdict.ERROR
-        assert "kaboom" in result.probes[0].message
-
-    def test_unexpected_exception_becomes_error_verdict(
-        self, backend: DummyDifferentialBackend
-    ) -> None:
-        spec = _spec({"name": "oops", "kind": "__runner_unexpected"})
-        result = run(spec, backend)
-        assert result.probes[0].verdict == Verdict.ERROR
-        assert "ValueError" in result.probes[0].message
-
-    def test_wall_seconds_populated(self, backend: DummyDifferentialBackend) -> None:
-        spec = _spec({"name": "p1", "kind": "__runner_pass"})
-        result = run(spec, backend)
-        assert result.wall_seconds >= 0
-        assert result.probes[0].duration_s >= 0
-
-    def test_null_adapter_passes_on_null_calibrated_backend(
-        self, backend: DummyDifferentialBackend
-    ) -> None:
-        # Dummy backend implements NullCalibratedBackend, so calibration runs.
-        spec = _spec({"name": "null", "kind": "null_adapter", "runs": 2, "prompts": ["q1"]})
-        result = run(spec, backend)
-        assert result.probes[0].kind == "null_adapter"
-        assert result.probes[0].verdict == Verdict.PASS
-        # And the suite's null_stats bubbles up onto the result.
-        assert "delta_kl" in result.null_stats
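These runner tests pin down a dispatch contract: disabled probes record SKIP with "disabled" in the message, a ProbeError becomes an ERROR result carrying the probe's own message, and an unexpected exception becomes an ERROR result carrying the exception type name, all without aborting the suite. A sketch of a per-probe wrapper satisfying those assertions, using the package names imported in the test; the deleted dlm_sway.suite.runner may have been structured differently:

from dlm_sway.core.errors import ProbeError
from dlm_sway.core.result import ProbeResult, Verdict

# Hypothetical per-probe wrapper; the function name is illustrative.
def run_one(probe, spec, ctx) -> ProbeResult:
    if not getattr(spec, "enabled", True):
        return ProbeResult(name=spec.name, kind=spec.kind,
                           verdict=Verdict.SKIP, score=None,
                           message="probe disabled in spec")
    try:
        return probe.run(spec, ctx)
    except ProbeError as exc:
        # Expected failure mode: keep the probe's message ("kaboom").
        return ProbeResult(name=spec.name, kind=spec.kind,
                           verdict=Verdict.ERROR, score=None, message=str(exc))
    except Exception as exc:
        # Unexpected bugs surface with their type name ("ValueError: surprise").
        return ProbeResult(name=spec.name, kind=spec.kind,
                           verdict=Verdict.ERROR, score=None,
                           message=f"{type(exc).__name__}: {exc}")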
sway/tests/unit/test_suite_score_report.py (deleted)
@@ -1,217 +0,0 @@
-"""Tests for :mod:`dlm_sway.suite.score` + :mod:`dlm_sway.suite.report`."""
-
-from __future__ import annotations
-
-import json
-from datetime import timedelta
-from typing import Literal
-
-import pytest
-
-from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
-from dlm_sway.suite import report, score
-from dlm_sway.suite.spec import SwaySpec
-
-
-class _AdherenceSpec(ProbeSpec):
-    kind: Literal["__score_adherence"] = "__score_adherence"
-
-
-class _AdherenceProbe(Probe):
-    kind = "__score_adherence"
-    spec_cls = _AdherenceSpec
-    category = "adherence"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        raise NotImplementedError  # never executed; registered for category lookup
-
-
-class _AttributionSpec(ProbeSpec):
-    kind: Literal["__score_attribution"] = "__score_attribution"
-
-
-class _AttributionProbe(Probe):
-    kind = "__score_attribution"
-    spec_cls = _AttributionSpec
-    category = "attribution"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        raise NotImplementedError
-
-
-def _synth_suite(*probes: ProbeResult) -> SuiteResult:
-    started = utcnow()
-    return SuiteResult(
-        spec_path="sway.yaml",
-        started_at=started,
-        finished_at=started + timedelta(seconds=1),
-        base_model_id="base",
-        adapter_id="adapter",
-        sway_version="0.1.0.dev0",
-        probes=probes,
-    )
-
-
-class TestCompute:
-    def test_single_passing_probe(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
-        )
-        s = score.compute(suite)
-        assert s.overall == pytest.approx(0.8)
-        assert s.components["adherence"] == pytest.approx(0.8)
-        assert s.band == "healthy"
-
-    def test_mixed_categories_weighted(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
-            ProbeResult(name="b", kind="__score_attribution", verdict=Verdict.PASS, score=0.3),
-        )
-        s = score.compute(suite)
-        # Active categories: adherence (0.30) + attribution (0.35). Normalized.
-        expected = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35)
-        assert s.overall == pytest.approx(expected)
-
-    def test_errors_and_skips_excluded(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
-            ProbeResult(name="b", kind="__score_adherence", verdict=Verdict.SKIP, score=None),
-            ProbeResult(name="c", kind="__score_adherence", verdict=Verdict.ERROR, score=None),
-        )
-        s = score.compute(suite)
-        assert s.components["adherence"] == pytest.approx(0.9)
-
-    def test_per_probe_weights_override_uniform(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(
-                name="a",
-                kind="__score_adherence",
-                verdict=Verdict.PASS,
-                score=1.0,
-                evidence={"weight": 3.0},
-            ),
-            ProbeResult(
-                name="b",
-                kind="__score_adherence",
-                verdict=Verdict.PASS,
-                score=0.0,
-                evidence={"weight": 1.0},
-            ),
-        )
-        s = score.compute(suite)
-        # Weighted mean: (3·1 + 1·0) / 4 = 0.75
-        assert s.components["adherence"] == pytest.approx(0.75)
-
-    def test_failed_probe_surfaces_in_findings(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(
-                name="bad",
-                kind="__score_adherence",
-                verdict=Verdict.FAIL,
-                score=0.1,
-                message="nope",
-            )
-        )
-        s = score.compute(suite)
-        assert any("bad" in f for f in s.findings)
-
-
-class TestJsonReport:
-    def test_schema_fields(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(
-                name="p1",
-                kind="__score_adherence",
-                verdict=Verdict.PASS,
-                score=0.75,
-                raw=0.12,
-                z_score=3.1,
-            )
-        )
-        s = score.compute(suite)
-        out = json.loads(report.to_json(suite, s))
-        assert out["schema_version"] == 1
-        assert out["score"]["overall"] == pytest.approx(0.75)
-        assert out["probes"][0]["verdict"] == "pass"
-        assert out["probes"][0]["z_score"] == pytest.approx(3.1)
-
-
-class TestJunit:
-    def test_counts_populated(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=1.0),
-            ProbeResult(name="p2", kind="__score_adherence", verdict=Verdict.FAIL, score=0.0),
-            ProbeResult(
-                name="p3",
-                kind="__score_adherence",
-                verdict=Verdict.ERROR,
-                score=None,
-            ),
-        )
-        s = score.compute(suite)
-        xml = report.to_junit(suite, s)
-        assert 'tests="3"' in xml
-        assert 'failures="1"' in xml
-        assert 'errors="1"' in xml
-        assert "<failure" in xml
-        assert "<error" in xml
-
-
-class TestMarkdown:
-    def test_contains_probe_table(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
-        )
-        s = score.compute(suite)
-        md = report.to_markdown(suite, s)
-        assert "dlm-sway report" in md
-        assert "| p1 | `__score_adherence`" in md
-
-
-class TestTerminal:
-    def test_renders_without_error(self) -> None:
-        import io
-
-        from rich.console import Console
-
-        suite = _synth_suite(
-            ProbeResult(
-                name="p1",
-                kind="__score_adherence",
-                verdict=Verdict.PASS,
-                score=0.8,
-                raw=0.12,
-                z_score=3.1,
-                message="looks fine",
-            ),
-            ProbeResult(
-                name="p2",
-                kind="__score_attribution",
-                verdict=Verdict.FAIL,
-                score=0.1,
-                message="a very long message that will be truncated — " * 5,
-            ),
-            ProbeResult(
-                name="p3",
-                kind="__score_adherence",
-                verdict=Verdict.SKIP,
-                score=None,
-            ),
-        )
-        s = score.compute(suite)
-        buf = io.StringIO()
-        console = Console(file=buf, force_terminal=False, width=120)
-        report.to_terminal(suite, s, console=console)
-        out = buf.getvalue()
-        assert "dlm-sway report" in out
-        assert "overall:" in out
-        assert "p1" in out
-        assert "p2" in out
-        # Top findings section kicks in because p2 failed.
-        assert "top findings" in out
-
-
-# Force the SwaySpec model to stay reachable from tests (keeps mypy happy
-# on the eventual CLI path that calls into both).
-assert SwaySpec is not None
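The comments in TestCompute give the whole scoring rule: within a category, probe scores are averaged with per-probe evidence weights (uniform by default, with errored and skipped probes excluded); across categories, components are combined with the category weights and renormalised over the categories that actually ran. Worked numbers for the two weighted tests, assuming the 0.30/0.35 category weights stated in the test comment:

# Mixed categories: adherence 0.9 (weight 0.30), attribution 0.3 (weight 0.35),
# renormalised over the two active categories only.
overall = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35)
print(round(overall, 6))  # 0.576923 — lands in the "partial" band (< 0.60)

# Per-probe weights inside one category: plain weighted mean.
component = (3.0 * 1.0 + 1.0 * 0.0) / (3.0 + 1.0)
print(component)  # 0.75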
sway/tests/unit/test_suite_spec.py (deleted)
@@ -1,85 +0,0 @@
-"""Tests for :mod:`dlm_sway.suite.spec` + :mod:`dlm_sway.suite.loader`."""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-import pytest
-
-from dlm_sway.core.errors import SpecValidationError
-from dlm_sway.suite.loader import from_dict, load_spec
-from dlm_sway.suite.spec import SwaySpec
-
-
-def _minimum_valid() -> dict:
-    return {
-        "version": 1,
-        "models": {
-            "base": {"kind": "hf", "base": "HuggingFaceTB/SmolLM2-135M-Instruct"},
-            "ft": {
-                "kind": "hf",
-                "base": "HuggingFaceTB/SmolLM2-135M-Instruct",
-                "adapter": "/tmp/adapter",
-            },
-        },
-        "suite": [],
-    }
-
-
-class TestSwaySpec:
-    def test_minimum_valid(self) -> None:
-        spec = from_dict(_minimum_valid())
-        assert isinstance(spec, SwaySpec)
-        assert spec.version == 1
-        assert spec.defaults.seed == 0
-        assert spec.defaults.differential is True
-        assert spec.suite == []
-
-    def test_rejects_unknown_top_level_keys(self) -> None:
-        data = _minimum_valid()
-        data["bogus"] = True
-        with pytest.raises(SpecValidationError) as exc_info:
-            from_dict(data)
-        assert "bogus" in str(exc_info.value).lower()
-
-    def test_rejects_future_version(self) -> None:
-        data = _minimum_valid()
-        data["version"] = 9
-        with pytest.raises(SpecValidationError, match="unsupported sway spec version"):
-            from_dict(data)
-
-    def test_defaults_frozen(self) -> None:
-        spec = from_dict(_minimum_valid())
-        from pydantic import ValidationError
-
-        with pytest.raises(ValidationError):
-            spec.defaults.seed = 99  # type: ignore[misc]
-
-
-class TestLoader:
-    def test_missing_file(self, tmp_path: Path) -> None:
-        missing = tmp_path / "nope.yaml"
-        with pytest.raises(SpecValidationError, match="not found"):
-            load_spec(missing)
-
-    def test_invalid_yaml(self, tmp_path: Path) -> None:
-        bad = tmp_path / "bad.yaml"
-        # An unmatched { triggers yaml.scanner; a structurally ambiguous
-        # indent parses as a string value, which isn't a YAML error.
-        bad.write_text("{ unmatched: [", encoding="utf-8")
-        with pytest.raises(SpecValidationError, match="invalid YAML"):
-            load_spec(bad)
-
-    def test_non_mapping_top_level(self, tmp_path: Path) -> None:
-        bad = tmp_path / "list.yaml"
-        bad.write_text("- 1\n- 2\n", encoding="utf-8")
-        with pytest.raises(SpecValidationError, match="must be a mapping"):
-            load_spec(bad)
-
-    def test_roundtrip_via_yaml(self, tmp_path: Path) -> None:
-        import yaml
-
-        path = tmp_path / "sway.yaml"
-        path.write_text(yaml.safe_dump(_minimum_valid()), encoding="utf-8")
-        spec = load_spec(path)
-        assert spec.models.ft.adapter == Path("/tmp/adapter")
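TestLoader pins down three failure modes, each normalised to SpecValidationError with a matchable message ("not found", "invalid YAML", "must be a mapping"). A sketch of a loader that satisfies them, assuming SpecValidationError accepts a plain message and that from_dict performs the pydantic validation; the deleted dlm_sway.suite.loader may have worded things differently:

from pathlib import Path

import yaml

from dlm_sway.core.errors import SpecValidationError
from dlm_sway.suite.loader import from_dict

# Hypothetical loader sketch; error wording chosen to satisfy the tests above.
def load_spec_sketch(path: Path):
    if not path.is_file():
        raise SpecValidationError(f"spec not found: {path}")
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
    except yaml.YAMLError as exc:
        raise SpecValidationError(f"invalid YAML in {path}: {exc}") from exc
    if not isinstance(data, dict):
        raise SpecValidationError("sway spec top level must be a mapping")
    return from_dict(data)  # delegates schema validation to the spec model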
sway/tests/unit/test_visualize.py (deleted)
@@ -1,202 +0,0 @@
-"""Tests for :mod:`dlm_sway.visualize`.
-
-Exercises the error path (matplotlib missing) and the happy path when
-the module is present by stubbing ``matplotlib.pyplot`` via sys.modules.
-"""
-
-from __future__ import annotations
-
-import sys
-import types
-from datetime import timedelta
-
-import pytest
-
-from dlm_sway.core.errors import BackendNotAvailableError
-from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow
-
-
-def _suite_with(*probes: ProbeResult) -> SuiteResult:
-    started = utcnow()
-    return SuiteResult(
-        spec_path="sway.yaml",
-        started_at=started,
-        finished_at=started + timedelta(seconds=1),
-        base_model_id="b",
-        adapter_id="a",
-        sway_version="0.1.0.dev0",
-        probes=probes,
-    )
-
-
-class _FakeFig:
-    def tight_layout(self) -> None:  # pragma: no cover — trivial
-        return None
-
-
-class _FakeAx:
-    def __init__(self) -> None:
-        self.calls: list[str] = []
-
-    def bar(self, *a, **k):  # type: ignore[no-untyped-def]
-        self.calls.append("bar")
-
-    def plot(self, *a, **k):  # type: ignore[no-untyped-def]
-        self.calls.append("plot")
-
-    def hist(self, *a, **k):  # type: ignore[no-untyped-def]
-        self.calls.append("hist")
-
-    def axhline(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def axvline(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_xticks(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_xticklabels(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_xlabel(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_ylabel(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_title(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def legend(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-
-@pytest.fixture
-def fake_mpl(monkeypatch: pytest.MonkeyPatch) -> _FakeAx:
-    ax = _FakeAx()
-
-    def _subplots(*a, **k):  # type: ignore[no-untyped-def]
-        return _FakeFig(), ax
-
-    plt = types.ModuleType("matplotlib.pyplot")
-    plt.subplots = _subplots  # type: ignore[attr-defined]
-    mpl_pkg = types.ModuleType("matplotlib")
-    monkeypatch.setitem(sys.modules, "matplotlib", mpl_pkg)
-    monkeypatch.setitem(sys.modules, "matplotlib.pyplot", plt)
-    return ax
-
-
-def test_section_sis_plot_uses_per_section_evidence(fake_mpl: _FakeAx) -> None:
-    from dlm_sway.visualize import plot_section_sis
-
-    suite = _suite_with(
-        ProbeResult(
-            name="sis",
-            kind="section_internalization",
-            verdict=Verdict.PASS,
-            score=0.75,
-            raw=0.1,
-            evidence={
-                "per_section": [
-                    {
-                        "section_id": "a",
-                        "kind": "prose",
-                        "tag": None,
-                        "base_nll": 3.0,
-                        "ft_nll": 2.5,
-                        "own_lift": 0.17,
-                        "leak_lift": 0.02,
-                        "effective_sis": 0.15,
-                        "passed": True,
-                    },
-                    {
-                        "section_id": "b",
-                        "kind": "instruction",
-                        "tag": "intro",
-                        "base_nll": 4.0,
-                        "ft_nll": 3.9,
-                        "own_lift": 0.025,
-                        "leak_lift": 0.03,
-                        "effective_sis": -0.005,
-                        "passed": False,
-                    },
-                ],
-                "per_section_threshold": 0.05,
-            },
-        )
-    )
-    plot_section_sis(suite)
-    assert "bar" in fake_mpl.calls
-
-
-def test_adapter_ablation_plot(fake_mpl: _FakeAx) -> None:
-    from dlm_sway.visualize import plot_adapter_ablation
-
-    suite = _suite_with(
-        ProbeResult(
-            name="abl",
-            kind="adapter_ablation",
-            verdict=Verdict.PASS,
-            score=0.8,
-            raw=0.9,
-            evidence={
-                "lambdas": [0.0, 0.5, 1.0, 1.25],
-                "mean_divergence_per_lambda": [0.0, 0.5, 1.0, 1.1],
-                "linearity": 0.91,
-                "saturation_lambda": 0.75,
-                "overshoot": 1.1,
-            },
-        )
-    )
-    plot_adapter_ablation(suite)
-    assert "plot" in fake_mpl.calls
-
-
-def test_kl_histogram_plot(fake_mpl: _FakeAx) -> None:
-    from dlm_sway.visualize import plot_kl_histogram
-
-    suite = _suite_with(
-        ProbeResult(
-            name="dk",
-            kind="delta_kl",
-            verdict=Verdict.PASS,
-            score=0.7,
-            raw=0.1,
-            evidence={"per_prompt": [0.05, 0.1, 0.12, 0.09, 0.15], "divergence_kind": "js"},
-        )
-    )
-    plot_kl_histogram(suite)
-    assert "hist" in fake_mpl.calls
-
-
-def test_raises_when_matplotlib_missing(monkeypatch: pytest.MonkeyPatch) -> None:
-    # Purge matplotlib modules and block imports.
-    for mod in list(sys.modules):
-        if mod == "matplotlib" or mod.startswith("matplotlib."):
-            monkeypatch.delitem(sys.modules, mod, raising=False)
-
-    import builtins
-
-    real_import = builtins.__import__
-
-    def fake_import(name: str, *a, **k):  # type: ignore[no-untyped-def]
-        if name == "matplotlib" or name.startswith("matplotlib."):
-            raise ImportError("matplotlib missing in this venv")
-        return real_import(name, *a, **k)
-
-    monkeypatch.setattr(builtins, "__import__", fake_import)
-
-    from dlm_sway.visualize import plot_section_sis
-
-    suite = _suite_with()
-    with pytest.raises(BackendNotAvailableError):
-        plot_section_sis(suite)
-
-
-def test_raises_when_no_matching_probe(fake_mpl: _FakeAx) -> None:
-    from dlm_sway.visualize import plot_section_sis
-
-    suite = _suite_with()  # empty — no section_internalization probe
-    with pytest.raises(ValueError, match="section_internalization"):
-        plot_section_sis(suite)
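The error-path test above implies that dlm_sway.visualize imports matplotlib lazily, inside the plot functions, and converts ImportError into BackendNotAvailableError (which is why the sys.modules stubbing in fake_mpl works at all). A sketch of that guard, assuming the exception accepts a plain message; the real module, now in the submodule, may phrase it differently:

from dlm_sway.core.errors import BackendNotAvailableError

# Hypothetical lazy-import guard matching the behaviour the test asserts:
# matplotlib is only touched when a plot function runs, so the dependency
# stays optional.
def _require_pyplot():
    try:
        import matplotlib.pyplot as plt
    except ImportError as exc:
        raise BackendNotAvailableError(
            "matplotlib is required for dlm_sway.visualize plots"
        ) from exc
    return plt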