tenseleyflow/documentlanguagemodel / 72bb003


sway: convert in-tree subproject to git submodule pointing at tenseleyFlow/sway

Authored by espadonne
SHA: 72bb0030b72321dea3c66a2e6d7ce26e52c74550
Parents: 9da4019
Tree: e628ba5

83 changed files

Status  File  +  -
M .gitmodules 3 0
A sway 1 0
D sway/CHANGELOG.md 0 41
D sway/LICENSE 0 21
D sway/README.md 0 101
D sway/pyproject.toml 0 210
D sway/src/dlm_sway/__init__.py 0 42
D sway/src/dlm_sway/backends/__init__.py 0 118
D sway/src/dlm_sway/backends/dummy.py 0 257
D sway/src/dlm_sway/backends/hf.py 0 375
D sway/src/dlm_sway/backends/mlx.py 0 205
D sway/src/dlm_sway/cli/__init__.py 0 1
D sway/src/dlm_sway/cli/app.py 0 59
D sway/src/dlm_sway/cli/commands.py 0 396
D sway/src/dlm_sway/core/__init__.py 0 1
D sway/src/dlm_sway/core/determinism.py 0 97
D sway/src/dlm_sway/core/errors.py 0 65
D sway/src/dlm_sway/core/model.py 0 112
D sway/src/dlm_sway/core/result.py 0 139
D sway/src/dlm_sway/core/scoring.py 0 203
D sway/src/dlm_sway/core/sections.py 0 76
D sway/src/dlm_sway/integrations/__init__.py 0 1
D sway/src/dlm_sway/integrations/dlm/__init__.py 0 1
D sway/src/dlm_sway/integrations/dlm/autogen.py 0 191
D sway/src/dlm_sway/integrations/dlm/resolver.py 0 243
D sway/src/dlm_sway/probes/__init__.py 0 27
D sway/src/dlm_sway/probes/_calibration_pack.py 0 63
D sway/src/dlm_sway/probes/_divergence.py 0 102
D sway/src/dlm_sway/probes/adapter_ablation.py 0 193
D sway/src/dlm_sway/probes/adapter_revert.py 0 178
D sway/src/dlm_sway/probes/base.py 0 131
D sway/src/dlm_sway/probes/calibration_drift.py 0 135
D sway/src/dlm_sway/probes/delta_kl.py 0 121
D sway/src/dlm_sway/probes/leakage.py 0 194
D sway/src/dlm_sway/probes/null_adapter.py 0 144
D sway/src/dlm_sway/probes/paraphrase_invariance.py 0 148
D sway/src/dlm_sway/probes/preference_flip.py 0 140
D sway/src/dlm_sway/probes/prompt_collapse.py 0 159
D sway/src/dlm_sway/probes/section_internalization.py 0 189
D sway/src/dlm_sway/probes/style_fingerprint.py 0 179
D sway/src/dlm_sway/py.typed 0 0
D sway/src/dlm_sway/suite/__init__.py 0 1
D sway/src/dlm_sway/suite/loader.py 0 48
D sway/src/dlm_sway/suite/report.py 0 249
D sway/src/dlm_sway/suite/runner.py 0 136
D sway/src/dlm_sway/suite/score.py 0 106
D sway/src/dlm_sway/suite/spec.py 0 72
D sway/src/dlm_sway/visualize.py 0 137
D sway/tests/__init__.py 0 0
D sway/tests/conftest.py 0 29
D sway/tests/fixtures/__init__.py 0 0
D sway/tests/fixtures/tiny_model.py 0 53
D sway/tests/integration/__init__.py 0 0
D sway/tests/integration/conftest.py 0 10
D sway/tests/integration/test_hf_adapter_toggle.py 0 113
D sway/tests/unit/__init__.py 0 0
D sway/tests/unit/test_backend_dummy.py 0 102
D sway/tests/unit/test_backend_registry.py 0 133
D sway/tests/unit/test_cli.py 0 92
D sway/tests/unit/test_determinism.py 0 47
D sway/tests/unit/test_divergence.py 0 73
D sway/tests/unit/test_dlm_bridge.py 0 252
D sway/tests/unit/test_errors.py 0 55
D sway/tests/unit/test_model.py 0 78
D sway/tests/unit/test_null_calibration.py 0 123
D sway/tests/unit/test_probe_adapter_ablation.py 0 135
D sway/tests/unit/test_probe_adapter_revert.py 0 170
D sway/tests/unit/test_probe_base.py 0 69
D sway/tests/unit/test_probe_calibration_drift.py 0 57
D sway/tests/unit/test_probe_delta_kl.py 0 124
D sway/tests/unit/test_probe_leakage.py 0 109
D sway/tests/unit/test_probe_paraphrase_invariance.py 0 91
D sway/tests/unit/test_probe_preference_flip.py 0 161
D sway/tests/unit/test_probe_prompt_collapse.py 0 137
D sway/tests/unit/test_probe_section_internalization.py 0 94
D sway/tests/unit/test_probe_style_fingerprint.py 0 115
D sway/tests/unit/test_result.py 0 82
D sway/tests/unit/test_scoring.py 0 84
D sway/tests/unit/test_sections.py 0 35
D sway/tests/unit/test_suite_runner.py 0 134
D sway/tests/unit/test_suite_score_report.py 0 217
D sway/tests/unit/test_suite_spec.py 0 85
D sway/tests/unit/test_visualize.py 0 202
.gitmodules (modified)
@@ -5,3 +5,6 @@
 	# `scripts/bump-llama-cpp.sh build` writes under vendor/llama.cpp/build/
 	# which the submodule's own .gitignore covers.
 	ignore = untracked
+[submodule "sway"]
+	path = sway
+	url = https://github.com/tenseleyFlow/sway.git
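The commit doesn't record the commands that produced it, but a conversion of this shape typically looks like the following sketch (assuming the in-tree `sway/` tree already matches `tenseleyFlow/sway` at the pinned commit):

```bash
# Drop the in-tree copy from the index and working tree, then re-add
# the same path as a submodule pinned to a single upstream commit.
git rm -r sway
git submodule add https://github.com/tenseleyFlow/sway.git sway
git -C sway checkout 98ad9417c94e1bbeb97cf5e553878d7953513f69
git add .gitmodules sway
git commit -m "sway: convert in-tree subproject to git submodule pointing at tenseleyFlow/sway"
```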
sway (added)
@@ -0,0 +1,1 @@
+Subproject commit 98ad9417c94e1bbeb97cf5e553878d7953513f69
sway/CHANGELOG.md (deleted)
@@ -1,41 +0,0 @@
-# Changelog
-
-## 0.1.0.dev0 — 2026-04-20
-
-Initial pre-alpha. Full 11-primitive battery shipped.
-
-### Primitives
-
-- **Adherence**
-  - `delta_kl` — mean JS/KL divergence between base and fine-tuned next-token distributions
-  - `adapter_revert` — reversion under adversarial paraphrase (needs `dlm-sway[semsim]`)
-  - `prompt_collapse` — exponential-decay fit of divergence over context length
-- **Attribution**
-  - `section_internalization` *(flagship)* — per-section `effective_sis` with leak check
-  - `paraphrase_invariance` — memorization vs. generalization, intent-aware
-  - `preference_flip` — DPO/ORPO chosen/rejected margin inversion
-- **Calibration**
-  - `style_fingerprint` — 6-dim numpy-only stylistic shift vs. document
-  - `calibration_drift` — general-knowledge regression on a packaged 30-item pack
-  - `leakage` — greedy LCS recall + perturbation fragility
-- **Ablation**
-  - `adapter_ablation` *(signature primitive)* — λ-scaled divergence curve with linearity, saturation, overshoot metrics
-- **Baseline**
-  - `null_adapter` — stats scaffolding for z-score calibration (implementation pending)
-
-### Infrastructure
-
-- `DifferentialBackend` + `ScalableDifferentialBackend` protocols
-- HuggingFace + PEFT backend with `disable_adapter` / `set_adapter` toggling and LoRA-scale mutation
-- Dummy backend for unit tests (canned responses + linear-blend scalable mode)
-- YAML spec loader, composite score (four-category weighted), rich terminal + JSON + JUnit + Markdown reports
-- Typer CLI: `run`, `gate`, `check`, `diff`, `autogen`, `doctor`, `report`
-- `.dlm` bridge (`dlm-sway[dlm]`): resolver + full-battery autogen
-- Matplotlib visualizations (`dlm-sway[viz]`): SIS bar chart, ablation curve, KL histogram
-
-### Known gaps
-
-- Null-adapter baseline is scaffolded but its HF-level materialization (building random-init LoRAs at matched rank) is not yet wired — probes fall back to fixed thresholds until the next milestone.
-- Custom backend entry-point dispatch (`kind: custom`) is stubbed but not implemented.
-- MLX backend is registered as a future-milestone target; all MLX paths raise `BackendNotAvailableError`.
-- PyPI publication of the `dlm-sway` wheel is pending a clean CI release workflow.
sway/LICENSE (deleted)
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2026 Matt Wolffe
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
sway/README.md (deleted)
@@ -1,101 +0,0 @@
-# dlm-sway
-
-Differential testing for fine-tuned causal language models.
-
-**One question:** *did LoRA/QLoRA training actually change model behavior
-in a meaningful way, or is the model just defaulting to the pretrained
-base?*
-
-`dlm-sway` gives you a trustworthy, reproducible answer with eleven
-purpose-built primitives, each z-scored against a null-adapter baseline.
-No LLM judges. No external APIs. Deterministic on CPU where possible.
-
-## Install
-
-```bash
-pip install "dlm-sway[hf]"                # HuggingFace + PEFT backend
-pip install "dlm-sway[hf,style,semsim]"   # full primitive battery
-pip install "dlm-sway[all]"               # everything including optional viz
-pip install "dlm-sway[dlm]"               # auto-generate tests from a .dlm file
-```
-
-## 90-second smoke test
-
-```bash
-dlm-sway check path/to/adapter --base HuggingFaceTB/SmolLM2-135M-Instruct
-```
-
-Outputs a verdict in under a minute on CPU for small models: *your
-adapter is 4.2σ above noise* ✅ or *indistinguishable from a null
-adapter* ❌.
-
-## Full suite
-
-```yaml
-# sway.yaml
-version: 1
-models:
-  base: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct"}
-  ft:   {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct",
-         adapter: "./runs/adapter/v0003"}
-suite:
-  - {name: knows_concept, kind: dir,
-     prompt: "The Dunning-Kruger effect describes",
-     target: " a cognitive bias where",
-     distractor: " a programming language"}
-  - {name: no_reversion, kind: adapter_revert, paraphrases: 4}
-  - {name: section_attribution, kind: section_internalization}
-```
-
-```bash
-dlm-sway run sway.yaml              # full report to terminal + JSON
-dlm-sway gate sway.yaml --junit     # CI-friendly; non-zero on fail
-```
-
-## Why it exists
-
-Standard benchmarks (MMLU, HellaSwag) ask *"how good is this model?"*
-That's the wrong question after a targeted LoRA fine-tune on a small
-user-authored document. The right question is *"did the adapter actually
-move the model toward what I wrote?"* — and existing tools answer this
-poorly.
-
-`dlm-sway` answers it directly via eleven primitives across four
-categories:
-
-| Category      | Primitives                                            |
-|---------------|-------------------------------------------------------|
-| Adherence     | `delta_kl`, `adapter_revert`, `prompt_collapse`       |
-| Attribution   | `section_internalization`, `paraphrase_invariance`, `preference_flip` |
-| Calibration   | `style_fingerprint`, `calibration_drift`, `leakage`   |
-| Ablation      | `adapter_ablation` ← the signature primitive          |
-
-**The signature primitive.** `adapter_ablation` scales the LoRA additive
-term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} and measures the divergence
-curve. A healthy fine-tune shows a smooth, monotonic, non-saturated
-response. A degenerate one shows a step function or an overshoot-then-
-crash. Nobody else does this because nobody else gets this close to the
-adapter math.
-
-## The `.dlm` integration
-
-If you trained your adapter via the [DocumentLanguageModel
-project](https://github.com/tenseleyFlow/DocumentLanguageModel), sway
-can auto-generate a test suite from your document's sections:
-
-```bash
-pip install "dlm-sway[hf,dlm]"
-dlm-sway autogen path/to/doc.dlm -o sway.yaml
-dlm-sway run sway.yaml
-```
-
-Per-section attribution tells you *which* parts of your document
-actually moved the model — a kind of signal no other tool provides.
-
-## Status
-
-Pre-alpha. API will break. Version `0.1.0` is the first tag.
-
-## License
-
-MIT
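The README's λ-sweep is easy to picture in code. A minimal sketch using the dummy backend deleted later in this diff (which synthesizes a sharp base and a broad ft distribution on a shared support); `js_divergence` and the loop are illustrative names, not the shipped probe:

```python
import numpy as np

from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses


def js_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """Jensen-Shannon divergence between two aligned probability vectors."""
    m = 0.5 * (p + q)

    def kl(a: np.ndarray, b: np.ndarray) -> float:
        mask = a > 0
        return float(np.sum(a[mask] * np.log(a[mask] / b[mask])))

    return 0.5 * kl(p, m) + 0.5 * kl(q, m)


# Empty canned responses: the dummy backend synthesizes distributions.
backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
prompt = "The Dunning-Kruger effect describes"

with backend.as_base() as view:
    p = np.exp(view.next_token_dist(prompt).logprobs)

curve = []
for lam in (0.0, 0.25, 0.5, 0.75, 1.0, 1.25):
    with backend.as_scaled_adapter(lam) as view:
        q = np.exp(view.next_token_dist(prompt).logprobs)
    curve.append(js_divergence(p, q))  # top-k slices only; tail mass ignored
# Healthy adapters trace a smooth, monotone curve in lam; a step jump
# or overshoot-then-crash flags a degenerate fine-tune.
```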
sway/pyproject.toml (deleted)
@@ -1,210 +0,0 @@
-[project]
-name = "dlm-sway"
-version = "0.1.0.dev0"
-description = "Differential testing for fine-tuned causal LMs: did LoRA/QLoRA training actually change behavior, or is the model defaulting to the pretrained base?"
-readme = "README.md"
-requires-python = ">=3.11"
-license = { text = "MIT" }
-authors = [{ name = "Matt Wolffe", email = "mfwolffe@outlook.com" }]
-keywords = [
-    "lora",
-    "qlora",
-    "peft",
-    "fine-tuning",
-    "evaluation",
-    "llm",
-    "differential-testing",
-]
-classifiers = [
-    "Development Status :: 3 - Alpha",
-    "Intended Audience :: Developers",
-    "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: MIT License",
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-]
-
-# Core deps: spec loading, orchestration, reporting. No torch — a user
-# who only defines specs or writes a custom backend shouldn't pull 3 GB
-# of CUDA wheels.
-dependencies = [
-    "pydantic>=2.9",
-    "pyyaml>=6.0",
-    "typer>=0.12",
-    "rich>=13.7",
-    "numpy>=1.26",
-    "packaging>=24.0",
-]
-
-[project.optional-dependencies]
-# HuggingFace + PEFT scoring backend. The canonical path.
-hf = [
-    "torch>=2.4",
-    "transformers>=4.45",
-    "peft>=0.13",
-    "safetensors>=0.4",
-]
-# Apple Silicon inference. Env markers keep `uv sync --extra mlx` a no-op
-# on non-Apple hosts so Linux/CUDA contributors' wheel resolution stays
-# sane.
-mlx = [
-    "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'",
-    "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'",
-]
-# Stylistic fingerprinting (C1). spaCy models pull at runtime via
-# `python -m spacy download`.
-style = [
-    "spacy>=3.7",
-    "textstat>=0.7",
-    "nlpaug>=1.1",
-]
-# Semantic similarity (A2). MiniLM ~80 MB, CPU-friendly.
-semsim = [
-    "sentence-transformers>=3.0",
-]
-# Optional .dlm integration. Only imported inside dlm_sway.integrations.dlm.
-dlm = [
-    "dlm>=0.9",
-]
-# Visualization (P9).
-viz = [
-    "matplotlib>=3.8",
-]
-all = [
-    "torch>=2.4",
-    "transformers>=4.45",
-    "peft>=0.13",
-    "safetensors>=0.4",
-    "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'",
-    "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'",
-    "spacy>=3.7",
-    "textstat>=0.7",
-    "nlpaug>=1.1",
-    "sentence-transformers>=3.0",
-    "matplotlib>=3.8",
-]
-
-[project.scripts]
-dlm-sway = "dlm_sway.cli.app:main"
-
-[project.urls]
-Homepage = "https://github.com/tenseleyFlow/DocumentLanguageModel"
-Issues = "https://github.com/tenseleyFlow/DocumentLanguageModel/issues"
-
-[dependency-groups]
-dev = [
-    "pytest>=8.0",
-    "pytest-cov>=5.0",
-    "mypy>=1.11",
-    "ruff>=0.6",
-    "types-pyyaml>=6.0",
-    "hypothesis>=6.152.1",
-]
-
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
-
-[tool.hatch.build.targets.wheel]
-packages = ["src/dlm_sway"]
-
-# -------- ruff --------
-[tool.ruff]
-line-length = 100
-target-version = "py311"
-src = ["src", "tests"]
-
-[tool.ruff.lint]
-select = [
-    "E",    # pycodestyle errors
-    "F",    # pyflakes
-    "W",    # pycodestyle warnings
-    "I",    # isort
-    "UP",   # pyupgrade
-    "B",    # bugbear
-    "N",    # pep8-naming
-    "C4",   # comprehensions
-    "SIM",  # simplify
-    "PT",   # pytest
-    "RET",  # return
-    "ARG",  # unused args
-    "PTH",  # use pathlib
-    "TID",  # tidy imports
-]
-ignore = [
-    "E501",  # handled by formatter
-]
-
-[tool.ruff.lint.per-file-ignores]
-"tests/**/*.py" = ["ARG", "PT011", "SIM117"]
-# PyTorch's canonical `import torch.nn.functional as F` is universally
-# read, so we allow the naming exception in the HF backend only.
-"src/dlm_sway/backends/hf.py" = ["N812"]
-# The .dlm bridge is the one place allowed to import the ``dlm`` package.
-"src/dlm_sway/integrations/dlm/*.py" = ["TID251"]
-
-[tool.ruff.lint.flake8-tidy-imports.banned-api]
-# Hard architectural boundary: the `dlm` package is only importable
-# from inside the optional integration shim. This keeps dlm-sway
-# usable for anyone with just a HuggingFace base + PEFT adapter.
-"dlm".msg = "Import `dlm` only from dlm_sway.integrations.dlm (the optional extra)."
-
-[tool.ruff.format]
-quote-style = "double"
-indent-style = "space"
-
-# -------- mypy --------
-[tool.mypy]
-strict = true
-python_version = "3.11"
-packages = ["dlm_sway"]
-mypy_path = "src"
-warn_return_any = true
-warn_unused_ignores = true
-warn_redundant_casts = true
-no_implicit_optional = true
-disallow_untyped_decorators = true
-plugins = ["pydantic.mypy"]
-
-[tool.pydantic-mypy]
-init_forbid_extra = true
-init_typed = true
-warn_required_dynamic_aliases = true
-
-# Stubless ML ecosystem packages. Narrow boundaries in backends/* import
-# them explicitly; the rest of the codebase stays strict.
-[[tool.mypy.overrides]]
-module = [
-    "torch",
-    "torch.*",
-    "transformers.*",
-    "peft.*",
-    "safetensors.*",
-    "mlx.*",
-    "mlx_lm.*",
-    "sentence_transformers.*",
-    "spacy.*",
-    "textstat.*",
-    "nlpaug.*",
-    "matplotlib",
-    "matplotlib.*",
-    "huggingface_hub.*",
-    "dlm.*",
-]
-ignore_missing_imports = true
-disable_error_code = ["no-untyped-call"]
-
-# -------- pytest --------
-[tool.pytest.ini_options]
-testpaths = ["tests"]
-addopts = [
-    "-ra",
-    "-m", "not slow and not gpu and not online",
-]
-markers = [
-    "slow: expensive; deselected by default",
-    "gpu: requires CUDA; skipped on CPU/MPS runners",
-    "online: touches the network; skipped in offline CI",
-]
sway/src/dlm_sway/__init__.py (deleted)
@@ -1,42 +0,0 @@
-"""dlm-sway — differential testing for fine-tuned causal language models."""
-
-from __future__ import annotations
-
-from dlm_sway.core.errors import (
-    BackendNotAvailableError,
-    ProbeError,
-    SpecValidationError,
-    SwayError,
-)
-from dlm_sway.core.model import LoadedModel, Model, ModelSpec
-from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict
-from dlm_sway.core.scoring import (
-    DifferentialBackend,
-    NullCalibratedBackend,
-    RollingLogprob,
-    ScalableDifferentialBackend,
-    ScoringBackend,
-    TokenDist,
-)
-
-__all__ = [
-    "BackendNotAvailableError",
-    "DifferentialBackend",
-    "LoadedModel",
-    "Model",
-    "ModelSpec",
-    "NullCalibratedBackend",
-    "ProbeError",
-    "ProbeResult",
-    "RollingLogprob",
-    "ScalableDifferentialBackend",
-    "ScoringBackend",
-    "SpecValidationError",
-    "SuiteResult",
-    "SwayError",
-    "SwayScore",
-    "TokenDist",
-    "Verdict",
-]
-
-__version__ = "0.1.0.dev0"
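Consistent with the "no torch in core deps" note in the pyproject above, the root package re-exports only `core` symbols, so importing it stays framework-free. A quick sanity check (a sketch):

```python
import sys

import dlm_sway

print(dlm_sway.__version__)         # "0.1.0.dev0"
assert "torch" not in sys.modules   # heavy backends load only on demand
```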
sway/src/dlm_sway/backends/__init__.py (deleted)
@@ -1,118 +0,0 @@
-"""Scoring backends: HuggingFace (``hf``), MLX (``mlx``), dummy, custom.
-
-Backends are constructed from a :class:`~dlm_sway.core.model.ModelSpec`
-via :func:`build`. Heavy backends (HF, MLX) import their framework only
-on construction so ``import dlm_sway`` stays cheap for users who only
-touch the dummy backend or the spec loader.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-from dlm_sway.core.errors import SpecValidationError
-from dlm_sway.core.model import ModelSpec
-
-if TYPE_CHECKING:
-    from dlm_sway.core.scoring import DifferentialBackend
-
-
-def build(base_spec: ModelSpec, *, adapter_path: Path | None = None) -> DifferentialBackend:
-    """Materialize a differential backend from a model spec.
-
-    The adapter path typically comes from ``ft.adapter`` in the spec —
-    it's lifted to a keyword here so the same function can be used for
-    "differential" (base + adapter on one loaded model) or future
-    split-load paths.
-    """
-    effective_adapter = adapter_path if adapter_path is not None else base_spec.adapter
-
-    if base_spec.kind == "dummy":
-        # Dummy backend isn't really about the spec — it's for tests
-        # that pre-populate responses. Surface a loud error if someone
-        # tries to build it through the normal path.
-        raise SpecValidationError(
-            "kind='dummy' backends must be constructed directly via "
-            "DummyDifferentialBackend(base=..., ft=...); they cannot be "
-            "materialized from a ModelSpec."
-        )
-
-    if base_spec.kind == "hf":
-        if effective_adapter is None:
-            raise SpecValidationError(
-                "hf backend requires an adapter path (set `adapter:` on the ft model)"
-            )
-        from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
-
-        return HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter)
-
-    if base_spec.kind == "mlx":
-        if effective_adapter is None:
-            raise SpecValidationError(
-                "mlx backend requires an adapter path (set `adapter:` on the ft model; "
-                "must be an MLX .npz adapter — use dlm's peft→mlx converter if needed)"
-            )
-        from dlm_sway.backends.mlx import MLXDifferentialBackend
-
-        return MLXDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter)
-
-    if base_spec.kind == "custom":
-        return _load_custom(base_spec, effective_adapter)
-
-    raise SpecValidationError(f"unknown backend kind: {base_spec.kind!r}")
-
-
-def _load_custom(base_spec: ModelSpec, adapter: Path | None) -> DifferentialBackend:
-    """Dispatch to a user-supplied backend via ``entry_point='pkg.mod:Name'``.
-
-    The imported class is instantiated as ``Cls(base_spec=..., adapter_path=...)``
-    — the same signature as :class:`dlm_sway.backends.hf.HuggingFaceDifferentialBackend`
-    so authors can model their implementation on the built-in. The
-    result is runtime-checked against :class:`DifferentialBackend` so
-    protocol violations fail at construction, not deep inside a probe.
-    """
-    from dlm_sway.core.scoring import DifferentialBackend as DiffBackend
-
-    entry = base_spec.entry_point
-    if not entry:
-        raise SpecValidationError(
-            "kind='custom' requires an entry_point of the form 'pkg.module:ClassName'"
-        )
-    if ":" not in entry:
-        raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}")
-    module_path, _, class_name = entry.partition(":")
-    if not module_path or not class_name:
-        raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}")
-
-    import importlib
-
-    try:
-        module = importlib.import_module(module_path)
-    except ImportError as exc:
-        raise SpecValidationError(
-            f"custom backend: cannot import module {module_path!r}: {exc}"
-        ) from exc
-    cls = getattr(module, class_name, None)
-    if cls is None:
-        raise SpecValidationError(
-            f"custom backend: module {module_path!r} has no attribute {class_name!r}"
-        )
-
-    try:
-        instance = cls(base_spec=base_spec, adapter_path=adapter)
-    except TypeError as exc:
-        raise SpecValidationError(
-            f"custom backend {entry!r} constructor signature mismatch: {exc}. "
-            "Expected Cls(base_spec: ModelSpec, adapter_path: Path | None)"
-        ) from exc
-
-    if not isinstance(instance, DiffBackend):
-        raise SpecValidationError(
-            f"custom backend {entry!r} does not satisfy DifferentialBackend "
-            "(needs as_base() and as_finetuned() context managers)"
-        )
-    return instance
-
-
-__all__ = ["build"]
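`_load_custom` pins down the contract a `kind: custom` backend must meet (note the changelog marks this dispatch as stubbed). A sketch of a conforming skeleton — the module, class, and bodies below are invented for illustration:

```python
# my_backends/echo.py — referenced from a spec as:
#   {kind: custom, entry_point: "my_backends.echo:EchoBackend"}
from __future__ import annotations

from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path
from typing import Any

from dlm_sway.core.model import ModelSpec


class EchoBackend:
    """Skeleton matching the constructor + context-manager contract."""

    def __init__(self, *, base_spec: ModelSpec, adapter_path: Path | None) -> None:
        self._spec = base_spec
        self._adapter = adapter_path

    @contextmanager
    def as_base(self) -> Iterator[Any]:
        # A real implementation yields an object with generate / logprob_of /
        # rolling_logprob / next_token_dist (see _DummyView below for the shape).
        yield ...

    @contextmanager
    def as_finetuned(self) -> Iterator[Any]:
        yield ...
```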
sway/src/dlm_sway/backends/dummy.py (deleted)
@@ -1,257 +0,0 @@
-"""In-memory backend for unit tests.
-
-Deterministic, torchless, and trivially fast. Tests pass canned responses
-and canned score tables keyed by ``(mode, prompt, completion)``. The same
-backend instance serves as both ``as_base`` and ``as_finetuned`` — it
-switches an internal mode flag.
-
-Use it to drive every probe's unit test without loading a real model.
-For integration tests against a real PEFT adapter, see
-:class:`~dlm_sway.backends.hf.HuggingFaceDifferentialBackend`.
-"""
-
-from __future__ import annotations
-
-import math
-from collections.abc import Iterator
-from contextlib import contextmanager
-from dataclasses import dataclass, field
-from typing import Literal
-
-import numpy as np
-
-from dlm_sway.core.scoring import RollingLogprob, TokenDist
-
-Mode = Literal["base", "ft"]
-
-
-@dataclass(slots=True)
-class DummyResponses:
-    """Canned data for one mode (base or ft).
-
-    Callers populate one of these per mode and hand both to
-    :class:`DummyDifferentialBackend`.
-    """
-
-    generations: dict[str, str] = field(default_factory=dict)
-    """Prompt → canned completion. Lookup is exact-match."""
-    logprobs: dict[tuple[str, str], float] = field(default_factory=dict)
-    """``(prompt, completion) → sum logprob``. Default ``-10.0`` if missing."""
-    rolling: dict[str, RollingLogprob] = field(default_factory=dict)
-    """Text → canned :class:`RollingLogprob`."""
-    token_dists: dict[str, TokenDist] = field(default_factory=dict)
-    """Prompt → canned :class:`TokenDist`."""
-
-
-class _DummyView:
-    """The per-mode view yielded by ``as_base`` / ``as_finetuned``.
-
-    Implements :class:`~dlm_sway.core.model.Model` *and*
-    :class:`~dlm_sway.core.scoring.ScoringBackend` — i.e. the
-    ``ScoringModel`` intersection.
-    """
-
-    def __init__(self, mode: Mode, responses: DummyResponses) -> None:
-        self.id = mode
-        self._mode: Mode = mode
-        self._r = responses
-
-    # -- Model ---------------------------------------------------------
-    def generate(
-        self,
-        prompt: str,
-        *,
-        max_new_tokens: int,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
-        seed: int = 0,
-    ) -> str:
-        del max_new_tokens, temperature, top_p, seed  # canned; decoding is trivial.
-        try:
-            return self._r.generations[prompt]
-        except KeyError as exc:
-            raise KeyError(
-                f"dummy backend ({self._mode}): no canned generation for prompt {prompt!r}"
-            ) from exc
-
-    def close(self) -> None:
-        return None
-
-    # -- ScoringBackend ------------------------------------------------
-    def logprob_of(self, prompt: str, completion: str) -> float:
-        return self._r.logprobs.get((prompt, completion), -10.0)
-
-    def rolling_logprob(self, text: str) -> RollingLogprob:
-        if text in self._r.rolling:
-            return self._r.rolling[text]
-        # Synthesize a plausible rolling logprob so probes that just
-        # want a non-trivial value work without per-text configuration.
-        tokens = text.split()
-        n = max(len(tokens), 1)
-        per_tok = -2.0 if self._mode == "base" else -1.5
-        return RollingLogprob(
-            token_ids=np.arange(n, dtype=np.int64),
-            logprobs=np.full(max(n - 1, 0), per_tok, dtype=np.float32),
-            num_tokens=n,
-            total_logprob=per_tok * max(n - 1, 0),
-        )
-
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
-        del top_k
-        if prompt in self._r.token_dists:
-            return self._r.token_dists[prompt]
-        # Synthesize a sharp base / broad ft distribution so divergence
-        # probes see a non-zero signal without hand-rolled data.
-        vocab = 1000
-        k = 8
-        if self._mode == "base":
-            lp = np.array([-0.1] + [-5.0] * (k - 1), dtype=np.float32)
-        else:
-            # More uniform mass across the top-k tokens.
-            lp = np.full(k, -math.log(k), dtype=np.float32)
-        return TokenDist(
-            token_ids=np.arange(k, dtype=np.int64),
-            logprobs=lp,
-            vocab_size=vocab,
-            tail_logprob=math.log1p(-float(np.exp(lp).sum())) if np.exp(lp).sum() < 1 else 0.0,
-        )
-
-
-class _NullView(_DummyView):
-    """A dummy view that perturbs the base distribution with seeded noise.
-
-    Used by :meth:`DummyDifferentialBackend.as_null_adapter`. The
-    perturbation is small (matches an ``init_scale=0.02`` adapter) so
-    the null-vs-base divergence stays well below real-adapter territory
-    in probe tests.
-    """
-
-    def __init__(self, base_responses: DummyResponses, seed: int, init_scale: float) -> None:
-        super().__init__("base", base_responses)
-        self._seed = seed
-        self._init_scale = init_scale
-
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
-        base_dist = super().next_token_dist(prompt, top_k=top_k)
-        rng = np.random.default_rng(self._seed + hash(prompt) % 1_000_003)
-        noise = rng.normal(0.0, self._init_scale, size=base_dist.logprobs.shape).astype(np.float32)
-        new_lp = base_dist.logprobs + noise
-        # Re-normalize (within the top-k slice) so a valid distribution comes back.
-        max_lp = new_lp.max()
-        new_probs = np.exp(new_lp - max_lp)
-        new_probs /= new_probs.sum()
-        return TokenDist(
-            token_ids=base_dist.token_ids,
-            logprobs=np.log(new_probs).astype(np.float32),
-            vocab_size=base_dist.vocab_size,
-            tail_logprob=base_dist.tail_logprob,
-        )
-
-
-class _InterpolatedView(_DummyView):
-    """A dummy view where logits/dists are a lam-blend of base and ft.
-
-    Used by :meth:`DummyDifferentialBackend.as_scaled_adapter`.
-    Generation falls back to the ft view at lam>=0.5, base otherwise —
-    rounded because the dummy backend's generations are canned strings
-    with no notion of "how much".
-    """
-
-    def __init__(
-        self,
-        base_responses: DummyResponses,
-        ft_responses: DummyResponses,
-        lam: float,
-    ) -> None:
-        super().__init__(
-            "ft" if lam >= 0.5 else "base", ft_responses if lam >= 0.5 else base_responses
-        )
-        self._base_r = base_responses
-        self._ft_r = ft_responses
-        self._lam = lam
-
-    def logprob_of(self, prompt: str, completion: str) -> float:
-        base_v = self._base_r.logprobs.get((prompt, completion), -10.0)
-        ft_v = self._ft_r.logprobs.get((prompt, completion), -10.0)
-        return (1 - self._lam) * base_v + self._lam * ft_v
-
-    def next_token_dist(self, prompt: str, *, top_k: int = 256):  # type: ignore[no-untyped-def]
-        base_dist = _DummyView("base", self._base_r).next_token_dist(prompt, top_k=top_k)
-        ft_dist = _DummyView("ft", self._ft_r).next_token_dist(prompt, top_k=top_k)
-        # Both dists are on the same synthetic support when unseeded; blend
-        # their logprobs via log-space linear interpolation, which is a
-        # log-linear "tempered" mix and keeps normalization close enough.
-        lam = self._lam
-        blended_lp = (1 - lam) * base_dist.logprobs + lam * ft_dist.logprobs
-        return type(base_dist)(
-            token_ids=base_dist.token_ids,
-            logprobs=blended_lp,
-            vocab_size=base_dist.vocab_size,
-            tail_logprob=base_dist.tail_logprob,
-        )
-
-
-class DummyDifferentialBackend:
-    """Dummy implementation of
-    :class:`~dlm_sway.core.scoring.DifferentialBackend`.
-
-    Construction takes one :class:`DummyResponses` per mode. The two
-    modes are mutually exclusive — the backend enforces that callers
-    exit one view before entering the other, catching bugs in probes
-    that hold a stale view across a toggle.
-
-    Also implements
-    :class:`~dlm_sway.core.scoring.ScalableDifferentialBackend` with a
-    linear-blend between base and ft responses, so probes that need
-    ``as_scaled_adapter`` (N2 AdapterAblation) are unit-testable.
-    """
-
-    def __init__(self, *, base: DummyResponses, ft: DummyResponses) -> None:
-        self._base_r = base
-        self._ft_r = ft
-        self._base = _DummyView("base", base)
-        self._ft = _DummyView("ft", ft)
-        self._active: str | None = None
-
-    @contextmanager
-    def as_base(self) -> Iterator[_DummyView]:
-        self._enter("base")
-        try:
-            yield self._base
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_finetuned(self) -> Iterator[_DummyView]:
-        self._enter("ft")
-        try:
-            yield self._ft
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_scaled_adapter(self, lam: float) -> Iterator[_DummyView]:
-        self._enter(f"scaled({lam})")
-        try:
-            yield _InterpolatedView(self._base_r, self._ft_r, lam)
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_DummyView]:
-        self._enter(f"null({seed})")
-        try:
-            yield _NullView(self._base_r, seed=seed, init_scale=init_scale)
-        finally:
-            self._exit()
-
-    def _enter(self, mode: str) -> None:
-        if self._active is not None:
-            raise RuntimeError(
-                f"DifferentialBackend view already active ({self._active!r}); "
-                f"exit the current view before entering {mode!r}."
-            )
-        self._active = mode
-
-    def _exit(self) -> None:
-        self._active = None
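Usage follows directly from the class above; everything here is grounded in the code shown (the linear blend in `_InterpolatedView.logprob_of` gives the `-2.5` midpoint):

```python
from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses

base = DummyResponses(logprobs={("2+2=", " 4"): -4.0})
ft = DummyResponses(logprobs={("2+2=", " 4"): -1.0})
backend = DummyDifferentialBackend(base=base, ft=ft)

with backend.as_base() as m:
    lp_base = m.logprob_of("2+2=", " 4")   # -4.0
with backend.as_finetuned() as m:
    lp_ft = m.logprob_of("2+2=", " 4")     # -1.0
with backend.as_scaled_adapter(0.5) as m:
    lp_mid = m.logprob_of("2+2=", " 4")    # -2.5: linear blend of the two
```

Entering a second view without exiting the first raises the `RuntimeError` from `_enter`, which is exactly the stale-view bug class the docstring mentions.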
sway/src/dlm_sway/backends/hf.py (deleted)
@@ -1,375 +0,0 @@
-"""HuggingFace + PEFT differential backend.
-
-Loads the base once, attaches the LoRA adapter once, and toggles between
-"base" and "fine-tuned" views on the same module via PEFT's
-:meth:`~peft.PeftModel.disable_adapter` / :meth:`~peft.PeftModel.set_adapter`.
-
-This is the single most important backend in sway. Every numeric probe
-benefits from the shared-weights toggle — memory is halved compared to
-loading two copies, and KV-cache layouts stay aligned so pairwise KL math
-is straightforward.
-
-Heavy imports (``torch``, ``transformers``, ``peft``) are deferred until
-``HuggingFaceDifferentialBackend`` is actually instantiated so
-``import dlm_sway`` stays light for users of the dummy backend or spec
-validation.
-"""
-
-from __future__ import annotations
-
-from collections.abc import Iterator
-from contextlib import contextmanager
-from dataclasses import dataclass
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
-
-import numpy as np
-
-from dlm_sway.core.errors import BackendNotAvailableError, ProbeError
-from dlm_sway.core.model import ModelSpec
-from dlm_sway.core.scoring import RollingLogprob, TokenDist
-
-if TYPE_CHECKING:
-    from transformers import PreTrainedModel, PreTrainedTokenizerBase
-
-
-Device = Literal["cuda", "mps", "cpu"]
-
-
-def _detect_device() -> Device:
-    try:
-        import torch
-    except ImportError as exc:
-        raise BackendNotAvailableError("hf", extra="hf") from exc
-    if torch.cuda.is_available():
-        return "cuda"
-    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-        return "mps"
-    return "cpu"
-
-
-def _resolve_dtype(requested: str, device: Device) -> Any:
-    """Map the user's ``dtype`` preference to a torch dtype."""
-    import torch  # noqa: PLC0415 — lazy
-
-    if requested == "fp16":
-        return torch.float16
-    if requested == "bf16":
-        return torch.bfloat16
-    if requested == "fp32":
-        return torch.float32
-    # auto: bf16 on CUDA (Ampere+); fp16 on MPS; fp32 on CPU for numerical stability.
-    if device == "cuda" and torch.cuda.is_bf16_supported():
-        return torch.bfloat16
-    if device == "mps":
-        return torch.float16
-    return torch.float32
-
-
-def _require_hf() -> tuple[Any, Any, Any]:
-    """Import torch + transformers + peft, raising a friendly error if missing."""
-    try:
-        import torch
-        import transformers
-    except ImportError as exc:
-        raise BackendNotAvailableError("hf", extra="hf") from exc
-    try:
-        import peft
-    except ImportError as exc:
-        raise BackendNotAvailableError(
-            "hf", extra="hf", hint="peft is required for the adapter toggle."
-        ) from exc
-    return torch, transformers, peft
-
-
-# --- the view object ------------------------------------------------------
-
-
-@dataclass(slots=True)
-class _HFView:
-    """One side (base or ft) of a :class:`HuggingFaceDifferentialBackend`.
-
-    Both sides reuse the same underlying module; the difference is
-    whether the adapter is active.
-    """
-
-    id: str
-    _model: Any
-    _tokenizer: Any
-    _device: str
-    _pad_token_id: int
-
-    # -- Model ---------------------------------------------------------
-    def generate(
-        self,
-        prompt: str,
-        *,
-        max_new_tokens: int,
-        temperature: float = 0.0,
-        top_p: float = 1.0,
-        seed: int = 0,
-    ) -> str:
-        import torch
-
-        torch.manual_seed(seed)
-        inputs = self._tokenizer(prompt, return_tensors="pt").to(self._device)
-        do_sample = temperature > 0.0
-        gen_kwargs: dict[str, Any] = {
-            "max_new_tokens": max_new_tokens,
-            "do_sample": do_sample,
-            "pad_token_id": self._pad_token_id,
-        }
-        if do_sample:
-            gen_kwargs["temperature"] = temperature
-            gen_kwargs["top_p"] = top_p
-        with torch.inference_mode():
-            out_ids = self._model.generate(**inputs, **gen_kwargs)
-        new_tokens = out_ids[0, inputs["input_ids"].shape[1] :]
-        return str(self._tokenizer.decode(new_tokens, skip_special_tokens=True))
-
-    def close(self) -> None:
-        return None
-
-    # -- ScoringBackend ------------------------------------------------
-    def logprob_of(self, prompt: str, completion: str) -> float:
-        import torch
-        import torch.nn.functional as F
-
-        prompt_ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device)
-        full_ids = self._tokenizer(prompt + completion, return_tensors="pt").input_ids.to(
-            self._device
-        )
-        if full_ids.shape[1] <= prompt_ids.shape[1]:
-            raise ProbeError(
-                "logprob_of",
-                f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})",
-            )
-        target_ids = full_ids[:, prompt_ids.shape[1] :]
-        with torch.inference_mode():
-            logits = self._model(full_ids).logits  # (1, T, V)
-        # Align: logit at position t predicts token at t+1. We want
-        # predictions for the completion slice.
-        shift_logits = logits[:, prompt_ids.shape[1] - 1 : -1, :]  # (1, C, V)
-        log_probs = F.log_softmax(shift_logits.float(), dim=-1)
-        gathered = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)
-        return float(gathered.sum().item())
-
-    def rolling_logprob(self, text: str) -> RollingLogprob:
-        import torch
-        import torch.nn.functional as F
-
-        ids = self._tokenizer(text, return_tensors="pt").input_ids.to(self._device)
-        if ids.shape[1] < 2:
-            return RollingLogprob(
-                token_ids=ids[0].cpu().numpy().astype(np.int64),
-                logprobs=np.array([], dtype=np.float32),
-                num_tokens=int(ids.shape[1]),
-                total_logprob=0.0,
-            )
-        with torch.inference_mode():
-            logits = self._model(ids).logits  # (1, T, V)
-        log_probs = F.log_softmax(logits[:, :-1].float(), dim=-1)  # predicts tokens 1..T
-        gathered = log_probs.gather(-1, ids[:, 1:].unsqueeze(-1)).squeeze(-1).squeeze(0)
-        return RollingLogprob(
-            token_ids=ids[0].cpu().numpy().astype(np.int64),
-            logprobs=gathered.cpu().numpy().astype(np.float32),
-            num_tokens=int(ids.shape[1]),
-            total_logprob=float(gathered.sum().item()),
-        )
-
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
-        import torch
-        import torch.nn.functional as F
-
-        ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device)
-        with torch.inference_mode():
-            logits = self._model(ids).logits[:, -1, :]  # (1, V)
-        log_probs = F.log_softmax(logits.float(), dim=-1).squeeze(0)
-        k = min(top_k, int(log_probs.shape[0]))
-        top = torch.topk(log_probs, k=k)
-        tail_mass = float(1.0 - torch.exp(top.values).sum().item())
-        tail_logprob = float(np.log(max(tail_mass, 1e-12))) if tail_mass > 1e-12 else 0.0
-        return TokenDist(
-            token_ids=top.indices.cpu().numpy().astype(np.int64),
-            logprobs=top.values.cpu().numpy().astype(np.float32),
-            vocab_size=int(log_probs.shape[0]),
-            tail_logprob=tail_logprob,
-        )
-
-
-# --- the backend -----------------------------------------------------------
-
-
-class HuggingFaceDifferentialBackend:
-    """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for HF+PEFT.
-
-    The adapter toggle relies on
-    :meth:`peft.PeftModel.disable_adapter` producing a context where the
-    forward pass skips the LoRA deltas, and
-    :meth:`peft.PeftModel.set_adapter` (or just exiting the disable
-    context) re-enabling them. A dedicated sanity test asserts that
-    these actually change logits on a fixture.
-    """
-
-    def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None:
-        torch, transformers, peft = _require_hf()
-        self._torch = torch
-        self._spec = base_spec
-        self._adapter_path = Path(adapter_path).expanduser().resolve()
-
-        device_str: Device = (
-            _detect_device() if base_spec.device == "auto" else base_spec.device  # type: ignore[assignment]
-        )
-        self._device: str = device_str
-        dtype = _resolve_dtype(base_spec.dtype, device_str)
-
-        tokenizer = transformers.AutoTokenizer.from_pretrained(
-            str(self._adapter_path)
-            if (self._adapter_path / "tokenizer_config.json").exists()
-            else base_spec.base,
-            trust_remote_code=base_spec.trust_remote_code,
-        )
-        if tokenizer.pad_token_id is None:
-            tokenizer.pad_token = tokenizer.eos_token
-
-        base_model = transformers.AutoModelForCausalLM.from_pretrained(
-            base_spec.base,
-            torch_dtype=dtype,
-            trust_remote_code=base_spec.trust_remote_code,
-        )
-        base_model.to(self._device)
-        peft_model = peft.PeftModel.from_pretrained(
-            base_model,
-            str(self._adapter_path),
-            is_trainable=False,
-        )
-        peft_model.eval()
-
-        self._tokenizer: PreTrainedTokenizerBase = tokenizer
-        self._peft_model: PreTrainedModel = peft_model
-        self._pad_token_id: int = int(tokenizer.pad_token_id)
-        self._active: str | None = None
-
-    # -- DifferentialBackend -------------------------------------------
-
-    @contextmanager
-    def as_base(self) -> Iterator[_HFView]:
-        self._enter("base")
-        try:
-            # peft.PeftModel.disable_adapter is a context manager; mypy
-            # mis-reads it as a Tensor on this transformers version.
-            with self._peft_model.disable_adapter():  # type: ignore[operator]
-                yield self._make_view("base")
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_finetuned(self) -> Iterator[_HFView]:
-        self._enter("ft")
-        try:
-            yield self._make_view("ft")
-        finally:
-            self._exit()
-
-    @contextmanager
-    def as_scaled_adapter(self, lam: float) -> Iterator[_HFView]:
-        """Temporarily multiply every LoRA layer's scaling factor by ``lam``.
-
-        Works by walking the PEFT module tree and mutating each
-        ``LoraLayer.scaling[adapter_name]`` in place. The original
-        scalings are restored when the context exits — or when an
-        exception propagates, to keep the model in a sane state.
-        """
-        self._enter(f"scaled({lam})")
-        # ``module`` is dynamic (peft LoraLayer subclass) — Any avoids
-        # mypy treating its ``.scaling`` as a Tensor when peft is loaded.
-        saved: list[tuple[Any, str, float]] = []
-        try:
-            import peft  # noqa: PLC0415 — already a hard dep of this backend
-
-            lora_cls = getattr(peft.tuners.lora, "LoraLayer", None)
-            if lora_cls is None:
-                raise RuntimeError("peft.tuners.lora.LoraLayer not found; check peft>=0.13 pin")
-            for module in self._peft_model.modules():
-                if not isinstance(module, lora_cls):
-                    continue
-                scaling = getattr(module, "scaling", None)
-                if not isinstance(scaling, dict):
-                    continue
-                for key, original in scaling.items():
-                    saved.append((module, key, float(original)))
-                    scaling[key] = float(original) * lam
-            yield self._make_view(f"scaled_{lam:.2f}")
-        finally:
-            for module, key, original in saved:
-                module.scaling[key] = original
-            self._exit()
-
-    @contextmanager
-    def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_HFView]:
-        """Temporarily replace every LoRA ``A``/``B`` tensor with random noise.
-
-        Same rank, alpha, and target modules as the real adapter — only
-        the weights differ. This is the denominator in every z-score
-        path: "how much signal does structural noise produce?"
-
-        Implementation walks the PEFT module tree for ``lora_A``/``lora_B``
-        parameters, saves a clone of each current value, overwrites in
-        place with a zero-mean Gaussian at ``init_scale``, and restores
-        on exit (including on exception).
-        """
-        import torch
-
-        self._enter(f"null({seed})")
-        gen = torch.Generator(device="cpu").manual_seed(int(seed))
-        saved: list[tuple[torch.nn.Parameter, torch.Tensor]] = []
-        try:
-            for pname, param in self._peft_model.named_parameters():
-                if not any(key in pname for key in ("lora_A", "lora_B")):
-                    continue
-                saved.append((param, param.detach().clone()))
-                with torch.no_grad():
-                    noise = torch.randn(
-                        *param.shape,
-                        generator=gen,
-                        dtype=torch.float32,
-                    ).to(dtype=param.dtype, device=param.device)
-                    param.copy_(noise * init_scale)
-            yield self._make_view(f"null_{seed}")
-        finally:
-            with torch.no_grad():
-                for param, original in saved:
-                    param.copy_(original)
-            self._exit()
-
-    def close(self) -> None:
-        """Release GPU memory. Safe to call more than once."""
-        if getattr(self, "_peft_model", None) is not None:
-            del self._peft_model
-        if self._torch.cuda.is_available():
-            self._torch.cuda.empty_cache()
-
-    # -- internals -----------------------------------------------------
-
-    def _make_view(self, mode: str) -> _HFView:
-        return _HFView(
-            id=mode,
-            _model=self._peft_model,
-            _tokenizer=self._tokenizer,
-            _device=self._device,
-            _pad_token_id=self._pad_token_id,
-        )
-
-    def _enter(self, mode: str) -> None:
-        if self._active is not None:
-            raise RuntimeError(
-                f"HuggingFaceDifferentialBackend view {self._active!r} already active; "
-                f"exit it before entering {mode!r}."
-            )
-        self._active = mode
-
-    def _exit(self) -> None:
-        self._active = None
-
-
-__all__ = ["HuggingFaceDifferentialBackend"]
sway/src/dlm_sway/backends/mlx.pydeleted
@@ -1,205 +0,0 @@
1
-"""MLX backend for Apple Silicon (darwin-arm64).
2
-
3
-Partial implementation covering the common case: a PEFT adapter that's
4
-already been converted to MLX's ``.npz`` format. Unlike the HF backend,
5
-MLX has no runtime ``disable_adapter`` context — adapters get fused into
6
-the linear layers at load time — so this backend keeps **both** a base
7
-model and an adapted model in memory. Fine for the small (<3B) models
8
-MLX is typically used with on Apple Silicon; document the cost clearly.
9
-
10
-If users point this backend at raw PEFT safetensors, ``mlx_lm.load``
11
-will refuse them with its own error. A future milestone can wire a
12
-PEFT-→-MLX converter; for now the contract is "bring your own .npz".
13
-"""
14
-
15
-from __future__ import annotations
16
-
17
-from collections.abc import Iterator
18
-from contextlib import contextmanager
19
-from dataclasses import dataclass
20
-from pathlib import Path
21
-from typing import TYPE_CHECKING, Any
22
-
23
-import numpy as np
24
-
25
-from dlm_sway.core.errors import BackendNotAvailableError, ProbeError
26
-from dlm_sway.core.model import ModelSpec
27
-from dlm_sway.core.scoring import RollingLogprob, TokenDist
28
-
29
-if TYPE_CHECKING:
30
-    pass
31
-
32
-
33
-def _require_mlx() -> tuple[Any, Any]:
34
-    try:
35
-        import mlx.core as mx
36
-        import mlx_lm
37
-    except ImportError as exc:
38
-        raise BackendNotAvailableError(
39
-            "mlx",
40
-            extra="mlx",
41
-            hint="MLX backend needs mlx + mlx-lm on darwin-arm64.",
42
-        ) from exc
43
-    return mx, mlx_lm
44
-
45
-
46
-@dataclass(slots=True)
47
-class _MLXView:
48
-    """One side (base or ft) of the MLX backend.
49
-
50
-    Both sides carry the same tokenizer (MLX stores it alongside the
51
-    converted model files, so sharing avoids double-loading).
52
-    """
53
-
54
-    id: str
55
-    _model: Any
56
-    _tokenizer: Any
57
-
58
-    def generate(
59
-        self,
60
-        prompt: str,
61
-        *,
62
-        max_new_tokens: int,
63
-        temperature: float = 0.0,
64
-        top_p: float = 1.0,
65
-        seed: int = 0,
66
-    ) -> str:
67
-        del seed  # mlx_lm.generate seeds via its own global state
68
-        _, mlx_lm = _require_mlx()
69
-        kwargs: dict[str, Any] = {"max_tokens": max_new_tokens, "verbose": False}
70
-        if temperature > 0.0:
71
-            kwargs["temp"] = temperature
72
-            kwargs["top_p"] = top_p
73
-        out = mlx_lm.generate(self._model, self._tokenizer, prompt=prompt, **kwargs)
74
-        return str(out)
75
-
76
-    def close(self) -> None:
77
-        return None
78
-
79
-    # -- ScoringBackend ------------------------------------------------
80
-
81
-    def _forward_logits(self, prompt: str) -> np.ndarray:
82
-        """Run the model once and return ``(seq_len, vocab)`` logits."""
83
-        mx, _ = _require_mlx()
84
-        input_ids = self._tokenizer.encode(prompt)
85
-        tokens = mx.array(input_ids)[None, :]  # (1, T)
86
-        out = self._model(tokens)
87
-        # mlx_lm models return an mx.array; convert to numpy for downstream math.
88
-        return np.asarray(out[0])
89
-
90
-    def logprob_of(self, prompt: str, completion: str) -> float:
91
-        input_ids = self._tokenizer.encode(prompt)
92
-        full_ids = self._tokenizer.encode(prompt + completion)
93
-        if len(full_ids) <= len(input_ids):
94
-            raise ProbeError(
95
-                "logprob_of",
96
-                f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})",
97
-            )
98
-        logits = self._forward_logits(prompt + completion)  # (T, V)
99
-        # Position t predicts token t+1 — slice off the last row and the prompt span.
100
-        shift = logits[len(input_ids) - 1 : -1, :]
101
-        target_ids = np.asarray(full_ids[len(input_ids) :], dtype=np.int64)
102
-        log_probs = _log_softmax(shift.astype(np.float64), axis=-1)
103
-        gathered = log_probs[np.arange(len(target_ids)), target_ids]
104
-        return float(gathered.sum())
105
-
106
-    def rolling_logprob(self, text: str) -> RollingLogprob:
107
-        ids = self._tokenizer.encode(text)
108
-        if len(ids) < 2:
109
-            return RollingLogprob(
110
-                token_ids=np.asarray(ids, dtype=np.int64),
111
-                logprobs=np.array([], dtype=np.float32),
112
-                num_tokens=len(ids),
113
-                total_logprob=0.0,
114
-            )
115
-        logits = self._forward_logits(text)
116
-        log_probs = _log_softmax(logits[:-1].astype(np.float64), axis=-1)
117
-        ids_arr = np.asarray(ids, dtype=np.int64)
118
-        gathered = log_probs[np.arange(len(ids) - 1), ids_arr[1:]]
119
-        return RollingLogprob(
120
-            token_ids=ids_arr,
121
-            logprobs=gathered.astype(np.float32),
122
-            num_tokens=len(ids),
123
-            total_logprob=float(gathered.sum()),
124
-        )
125
-
126
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
127
-        logits = self._forward_logits(prompt)
128
-        last_logits = logits[-1].astype(np.float64)
129
-        log_probs = _log_softmax(last_logits, axis=-1)
130
-        k = min(top_k, log_probs.shape[0])
131
-        # np.argpartition for top-k then sort the partition.
132
-        part = np.argpartition(log_probs, -k)[-k:]
133
-        top_ids = part[np.argsort(log_probs[part])[::-1]]
134
-        top_lp = log_probs[top_ids]
135
-        tail_mass = float(1.0 - np.exp(top_lp).sum())
136
-        tail_logprob = float(np.log(tail_mass)) if tail_mass > 1e-12 else 0.0
137
-        return TokenDist(
138
-            token_ids=top_ids.astype(np.int64),
139
-            logprobs=top_lp.astype(np.float32),
140
-            vocab_size=int(log_probs.shape[0]),
141
-            tail_logprob=tail_logprob,
142
-        )
143
-
144
-
145
-class MLXDifferentialBackend:
146
-    """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for MLX models.
147
-
148
-    Loads two copies of the same base model — one bare, one with the
149
-    adapter fused — because MLX has no runtime toggle. Memory cost: 2×
150
-    base weights. On typical Apple Silicon workloads with ≤3B models
151
-    this is acceptable.
152
-    """
153
-
154
-    def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None:
155
-        mx, mlx_lm = _require_mlx()
156
-        self._mx = mx
157
-        self._spec = base_spec
158
-        self._adapter_path = Path(adapter_path).expanduser().resolve()
159
-
160
-        # Load bare base (no adapter).
161
-        self._base_model, self._tokenizer = mlx_lm.load(base_spec.base)
162
-        # Load ft with adapter attached. ``adapter_path`` is mlx_lm's kwarg.
163
-        self._ft_model, _ = mlx_lm.load(base_spec.base, adapter_path=str(self._adapter_path))
164
-        self._active: str | None = None
165
-
166
-    @contextmanager
167
-    def as_base(self) -> Iterator[_MLXView]:
168
-        self._enter("base")
169
-        try:
170
-            yield _MLXView(id="base", _model=self._base_model, _tokenizer=self._tokenizer)
171
-        finally:
172
-            self._exit()
173
-
174
-    @contextmanager
175
-    def as_finetuned(self) -> Iterator[_MLXView]:
176
-        self._enter("ft")
177
-        try:
178
-            yield _MLXView(id="ft", _model=self._ft_model, _tokenizer=self._tokenizer)
179
-        finally:
180
-            self._exit()
181
-
182
-    def close(self) -> None:
183
-        """MLX reclaims memory when references drop; nothing to do here."""
184
-        return
185
-
186
-    def _enter(self, mode: str) -> None:
187
-        if self._active is not None:
188
-            raise RuntimeError(
189
-                f"MLXDifferentialBackend view {self._active!r} already active; "
190
-                f"exit it before entering {mode!r}."
191
-            )
192
-        self._active = mode
193
-
194
-    def _exit(self) -> None:
195
-        self._active = None
196
-
197
-
198
-def _log_softmax(x: np.ndarray, *, axis: int) -> np.ndarray:
199
-    x_max = np.max(x, axis=axis, keepdims=True)
200
-    y = x - x_max
201
-    log_sum = np.log(np.sum(np.exp(y), axis=axis, keepdims=True))
202
-    return np.asarray(y - log_sum, dtype=np.float64)
203
-
204
-
205
-__all__ = ["MLXDifferentialBackend"]
sway/src/dlm_sway/cli/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""Command-line interface (entry point: ``dlm-sway``)."""
sway/src/dlm_sway/cli/app.pydeleted
@@ -1,59 +0,0 @@
1
-"""dlm-sway CLI entry point.
2
-
3
-``pip install dlm-sway`` installs this module's :func:`main` as the
4
-``dlm-sway`` console script. Every subcommand is a thin wrapper around a
5
-library-level function so the CLI surface mirrors what programmatic
6
-callers get.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-import typer
12
-
13
-from dlm_sway import __version__
14
-from dlm_sway.cli import commands
15
-
16
-app = typer.Typer(
17
-    name="dlm-sway",
18
-    no_args_is_help=True,
19
-    add_completion=False,
20
-    help="Differential testing for fine-tuned causal language models.",
21
-)
22
-
23
-
24
-def _version_callback(value: bool) -> None:
25
-    if value:
26
-        typer.echo(f"dlm-sway {__version__}")
27
-        raise typer.Exit()
28
-
29
-
30
-@app.callback()
31
-def _root(
32
-    version: bool = typer.Option(  # noqa: B008 — typer pattern
33
-        False,
34
-        "--version",
35
-        callback=_version_callback,
36
-        is_eager=True,
37
-        help="Print version and exit.",
38
-    ),
39
-) -> None:
40
-    """Root callback; accepts ``--version``."""
41
-    del version
42
-
43
-
44
-app.command("run")(commands.run_cmd)
45
-app.command("gate")(commands.gate_cmd)
46
-app.command("check")(commands.check_cmd)
47
-app.command("diff")(commands.diff_cmd)
48
-app.command("autogen")(commands.autogen_cmd)
49
-app.command("doctor")(commands.doctor_cmd)
50
-app.command("report")(commands.report_cmd)
51
-
52
-
53
-def main() -> None:
54
-    """Script entry point registered in :file:`pyproject.toml`."""
55
-    app()
56
-
57
-
58
-if __name__ == "__main__":
59
-    main()
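The app object can be exercised without installing the console script; a sketch using Typer's bundled test runner (``typer.testing.CliRunner`` wraps click's runner):

    from typer.testing import CliRunner

    from dlm_sway.cli.app import app

    runner = CliRunner()
    result = runner.invoke(app, ["--version"])
    assert result.exit_code == 0
    assert "dlm-sway" in result.output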
sway/src/dlm_sway/cli/commands.pydeleted
@@ -1,396 +0,0 @@
1
-"""Command implementations for the ``dlm-sway`` CLI.
2
-
3
-Each function here is wired to a subcommand in :mod:`dlm_sway.cli.app`.
4
-Commands deliberately do as little as possible themselves — the real
5
-work lives in :mod:`dlm_sway.suite`, :mod:`dlm_sway.backends`, and the
6
-probes package.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-import json
12
-import sys
13
-from pathlib import Path
14
-from typing import Annotated, Any
15
-
16
-import typer
17
-from rich.console import Console
18
-
19
-from dlm_sway import __version__
20
-from dlm_sway.core.errors import SwayError
21
-from dlm_sway.core.result import SuiteResult, SwayScore, Verdict
22
-
23
-
24
-def run_cmd(
25
-    spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
26
-    json_out: Annotated[
27
-        Path | None,
28
-        typer.Option(
29
-            "--json",
30
-            "-j",
31
-            help="Write the JSON report to this path in addition to the terminal render.",
32
-        ),
33
-    ] = None,
34
-    markdown_out: Annotated[
35
-        Path | None,
36
-        typer.Option("--markdown", "-m", help="Write a markdown report to this path."),
37
-    ] = None,
38
-) -> None:
39
-    """Execute a suite and render a terminal report."""
40
-    try:
41
-        result, score_obj = _execute_spec(spec)
42
-    except SwayError as exc:
43
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
44
-        raise typer.Exit(code=2) from exc
45
-
46
-    from dlm_sway.suite import report
47
-
48
-    console = Console()
49
-    report.to_terminal(result, score_obj, console=console)
50
-
51
-    if json_out is not None:
52
-        json_out.write_text(report.to_json(result, score_obj), encoding="utf-8")
53
-        console.print(f"\n[dim]wrote JSON → {json_out}[/dim]")
54
-    if markdown_out is not None:
55
-        markdown_out.write_text(report.to_markdown(result, score_obj), encoding="utf-8")
56
-        console.print(f"[dim]wrote markdown → {markdown_out}[/dim]")
57
-
58
-
59
-def gate_cmd(
60
-    spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
61
-    junit_out: Annotated[
62
-        Path | None, typer.Option("--junit", help="Write JUnit XML for CI ingestion.")
63
-    ] = None,
64
-    coverage_threshold: Annotated[
65
-        float | None,
66
-        typer.Option(
67
-            "--threshold",
68
-            help="Override the spec's coverage_threshold. Exit non-zero below it.",
69
-        ),
70
-    ] = None,
71
-) -> None:
72
-    """Execute a suite and exit non-zero on failure (CI gate)."""
73
-    try:
74
-        result, score_obj = _execute_spec(spec)
75
-    except SwayError as exc:
76
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
77
-        raise typer.Exit(code=2) from exc
78
-
79
-    from dlm_sway.suite import report
80
-    from dlm_sway.suite.loader import load_spec as _load_spec
81
-
82
-    console = Console()
83
-    report.to_terminal(result, score_obj, console=console)
84
-
85
-    if junit_out is not None:
86
-        junit_out.write_text(report.to_junit(result, score_obj), encoding="utf-8")
87
-        console.print(f"[dim]wrote JUnit → {junit_out}[/dim]")
88
-
89
-    threshold = (
90
-        coverage_threshold
91
-        if coverage_threshold is not None
92
-        else _load_spec(spec).defaults.coverage_threshold
93
-    )
94
-    has_failures = any(p.verdict == Verdict.FAIL for p in result.probes)
95
-    below_threshold = score_obj.overall < threshold
96
-    if has_failures or below_threshold:
97
-        console.print(
98
-            f"\n[red]gate FAILED[/red] — overall={score_obj.overall:.2f} < {threshold:.2f}"
99
-            if below_threshold
100
-            else "\n[red]gate FAILED[/red] — at least one probe reported FAIL"
101
-        )
102
-        raise typer.Exit(code=1)
103
-    console.print(f"\n[green]gate passed[/green] — overall={score_obj.overall:.2f}")
104
-
105
-
106
-def check_cmd(
107
-    adapter: Annotated[Path, typer.Argument(help="Path to a PEFT adapter directory.")],
108
-    base: Annotated[str, typer.Option("--base", help="HuggingFace base model id or local path.")],
109
-    prompts: Annotated[
110
-        Path | None,
111
-        typer.Option(
112
-            "--prompts",
113
-            help="File with one prompt per line. Defaults to sway's built-in quick set.",
114
-        ),
115
-    ] = None,
116
-) -> None:
117
-    """<60s smoke test: "is this adapter doing anything at all?".
118
-
119
-    Runs A1 DeltaKL + C2 CalibrationDrift on a small prompt set. No
120
-    spec file required.
121
-    """
122
-    from dlm_sway.backends import build as build_backend
123
-    from dlm_sway.core.model import ModelSpec
124
-    from dlm_sway.suite import report
125
-    from dlm_sway.suite.runner import run as run_suite
126
-    from dlm_sway.suite.score import compute as compute_score
127
-    from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec
128
-
129
-    quick_prompts = _load_prompts(prompts) if prompts else _BUILTIN_QUICK_PROMPTS
130
-
131
-    base_spec = ModelSpec(base=base, kind="hf")
132
-    ft_spec = ModelSpec(base=base, kind="hf", adapter=adapter)
133
-    spec = SwaySpec(
134
-        version=1,
135
-        models=SuiteModels(base=base_spec, ft=ft_spec),
136
-        defaults=SuiteDefaults(seed=0),
137
-        suite=[
138
-            {
139
-                "name": "quick_delta_kl",
140
-                "kind": "delta_kl",
141
-                "prompts": list(quick_prompts),
142
-                "assert_mean_gte": 0.01,
143
-            },
144
-            {
145
-                "name": "quick_calibration",
146
-                "kind": "calibration_drift",
147
-                "items_limit": 10,
148
-            },
149
-        ],
150
-    )
151
-    try:
152
-        backend = build_backend(ft_spec)
153
-    except SwayError as exc:
154
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
155
-        raise typer.Exit(code=2) from exc
156
-
157
-    try:
158
-        result = run_suite(spec, backend, spec_path="<check>")
159
-    finally:
160
-        _close_if_possible(backend)
161
-    score_obj = compute_score(result)
162
-    report.to_terminal(result, score_obj, console=Console())
163
-
164
-
165
-def diff_cmd(
166
-    spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
167
-    adapter_a: Annotated[Path, typer.Option("--a", help="First adapter path.")],
168
-    adapter_b: Annotated[Path, typer.Option("--b", help="Second adapter path.")],
169
-) -> None:
170
-    """Run the same suite against two adapters and show per-probe deltas."""
171
-    from dlm_sway.backends import build as build_backend
172
-    from dlm_sway.suite.loader import load_spec
173
-    from dlm_sway.suite.runner import run as run_suite
174
-    from dlm_sway.suite.score import compute as compute_score
175
-
176
-    sway_spec = load_spec(spec)
177
-    console = Console()
178
-
179
-    def _score_for(adapter_path: Path) -> tuple[float, dict[str, float]]:
180
-        ft_spec = sway_spec.models.ft.model_copy(update={"adapter": adapter_path})
181
-        backend = build_backend(ft_spec)
182
-        try:
183
-            result = run_suite(sway_spec, backend, spec_path=str(spec))
184
-        finally:
185
-            _close_if_possible(backend)
186
-        scored = compute_score(result)
187
-        per_probe = {p.name: (p.score or 0.0) for p in result.probes}
188
-        return scored.overall, per_probe
189
-
190
-    try:
191
-        overall_a, per_a = _score_for(adapter_a)
192
-        overall_b, per_b = _score_for(adapter_b)
193
-    except SwayError as exc:
194
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
195
-        raise typer.Exit(code=2) from exc
196
-
197
-    console.print(f"[bold]overall[/bold]  A: {overall_a:.2f}   B: {overall_b:.2f}")
198
-    console.print()
199
-    console.print("[bold]per-probe[/bold] (A → B, Δ):")
200
-    for name in sorted(per_a.keys() | per_b.keys()):
201
-        a = per_a.get(name, 0.0)
202
-        b = per_b.get(name, 0.0)
203
-        delta = b - a
204
-        sign = "+" if delta >= 0 else ""
205
-        console.print(f"  {name:<30}  {a:.2f}  →  {b:.2f}   ({sign}{delta:+.2f})")
206
-
207
-
208
-def autogen_cmd(
209
-    dlm_path: Annotated[Path, typer.Argument(help="Path to a .dlm file.")],
210
-    out: Annotated[
211
-        Path,
212
-        typer.Option("--out", "-o", help="Where to write the generated sway.yaml."),
213
-    ] = Path("sway.yaml"),
214
-) -> None:
215
-    """Generate a sway.yaml from a .dlm file (requires dlm-sway[dlm])."""
216
-    import importlib
217
-
218
-    try:
219
-        autogen_mod = importlib.import_module("dlm_sway.integrations.dlm.autogen")
220
-    except ImportError as exc:
221
-        typer.secho(
222
-            "dlm integration not installed — run: pip install 'dlm-sway[dlm]'",
223
-            err=True,
224
-            fg=typer.colors.RED,
225
-        )
226
-        raise typer.Exit(code=2) from exc
227
-
228
-    try:
229
-        autogen_mod.write_sway_yaml(dlm_path, out)
230
-    except SwayError as exc:
231
-        typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
232
-        raise typer.Exit(code=2) from exc
233
-
234
-    typer.echo(f"wrote {out}")
235
-
236
-
237
-def doctor_cmd() -> None:
238
-    """Print backend availability and version info."""
239
-    console = Console()
240
-    console.print(f"[bold]dlm-sway[/bold] {__version__}")
241
-    console.print(f"  python:    {sys.version.split()[0]}")
242
-    console.print(f"  platform:  {sys.platform}")
243
-    console.print()
244
-
245
-    console.print("[bold]backends[/bold]")
246
-    console.print(
247
-        f"  hf:        {_probe_import('torch')} {_probe_import('transformers')} {_probe_import('peft')}"
248
-    )
249
-    console.print(f"  mlx:       {_probe_import('mlx')} {_probe_import('mlx_lm')}")
250
-    console.print(f"  semsim:    {_probe_import('sentence_transformers')}")
251
-    console.print(
252
-        f"  style+:    {_probe_import('spacy')} {_probe_import('textstat')} {_probe_import('nlpaug')}"
253
-    )
254
-    console.print(f"  dlm:       {_probe_import('dlm')}")
255
-    console.print(f"  viz:       {_probe_import('matplotlib')}")
256
-
257
-
258
-def report_cmd(
259
-    result_json: Annotated[Path, typer.Argument(help="Path to a saved result JSON.")],
260
-    format: Annotated[
261
-        str, typer.Option("--format", help="Output format: terminal, md, junit, json.")
262
-    ] = "terminal",
263
-) -> None:
264
-    """Re-render a previously saved run (for history tracking / dashboards)."""
265
-    raw: dict[str, Any] = json.loads(result_json.read_text(encoding="utf-8"))
266
-    fmt = format.lower()
267
-    if fmt == "json":
268
-        typer.echo(json.dumps(raw, indent=2, sort_keys=True))
269
-        return
270
-    if fmt in {"md", "markdown"}:
271
-        # A file-level re-render needs the dataclasses back; simplest is
272
-        # to synthesize a minimal markdown from the JSON directly.
273
-        typer.echo(_render_markdown_from_json(raw))
274
-        return
275
-    if fmt == "junit":
276
-        typer.echo(_render_junit_from_json(raw))
277
-        return
278
-    # Default: terminal-ish one-liner summary.
279
-    score: dict[str, Any] = raw.get("score", {})
280
-    typer.echo(f"overall: {score.get('overall', 0.0):.2f}  [{score.get('band', '?')}]")
281
-    probes: list[dict[str, Any]] = raw.get("probes", [])
282
-    for p in probes:
283
-        typer.echo(
284
-            f"  {p['name']:<30}  {p['verdict']:<6}  "
285
-            f"{(p.get('score') or 0.0):.2f}  {p.get('message', '')[:60]}"
286
-        )
287
-
288
-
289
-# -- helpers -----------------------------------------------------------
290
-
291
-
292
-_BUILTIN_QUICK_PROMPTS: tuple[str, ...] = (
293
-    "The quick brown fox",
294
-    "Once upon a time",
295
-    "The answer to the question is",
296
-    "One important lesson is",
297
-    "In my opinion,",
298
-    "The first step is to",
299
-    "Remember that",
300
-    "A common mistake is",
301
-)
302
-
303
-
304
-def _load_prompts(path: Path) -> tuple[str, ...]:
305
-    return tuple(
306
-        line.strip() for line in path.read_text(encoding="utf-8").splitlines() if line.strip()
307
-    )
308
-
309
-
310
-def _execute_spec(path: Path) -> tuple[SuiteResult, SwayScore]:
311
-    """Load a spec, build a backend, run the suite, fold scores. Shared
312
-    by ``run`` and ``gate``. Picks up .dlm-derived sections when the
313
-    spec's ``dlm_source`` is set."""
314
-    from dlm_sway.backends import build as build_backend
315
-    from dlm_sway.suite.loader import load_spec
316
-    from dlm_sway.suite.runner import run as run_suite
317
-    from dlm_sway.suite.score import compute as compute_score
318
-
319
-    spec = load_spec(path)
320
-    sections = None
321
-    doc_text = None
322
-    if spec.dlm_source is not None:
323
-        import importlib
324
-
325
-        try:
326
-            resolver = importlib.import_module("dlm_sway.integrations.dlm.resolver")
327
-            handle = resolver.resolve_dlm(Path(spec.dlm_source))
328
-            sections = handle.sections
329
-            doc_text = handle.doc_text
330
-        except ImportError:
331
-            # Honoring dlm_source is best-effort — probes that need
332
-            # sections will SKIP with a pointer at the extra.
333
-            sections = None
334
-    backend = build_backend(spec.models.ft)
335
-    try:
336
-        result = run_suite(spec, backend, spec_path=str(path), sections=sections, doc_text=doc_text)
337
-    finally:
338
-        _close_if_possible(backend)
339
-    score_obj = compute_score(result)
340
-    return result, score_obj
341
-
342
-
343
-def _close_if_possible(backend: object) -> None:
344
-    close = getattr(backend, "close", None)
345
-    if callable(close):
346
-        close()
347
-
348
-
349
-def _probe_import(name: str) -> str:
350
-    import importlib
351
-
352
-    try:
353
-        mod = importlib.import_module(name)
354
-    except ImportError:
355
-        return f"[red]{name}: missing[/red]"
356
-    ver = getattr(mod, "__version__", "installed")
357
-    return f"[green]{name}: {ver}[/green]"
358
-
359
-
360
-def _render_markdown_from_json(raw: dict[str, Any]) -> str:
361
-    score: dict[str, Any] = raw.get("score", {})
362
-    lines: list[str] = [
363
-        "# dlm-sway report",
364
-        "",
365
-        f"**Overall:** {score.get('overall', 0.0):.2f} (`{score.get('band', '?')}`)  ",
366
-        f"**Base:** `{raw.get('base_model_id', '?')}`  ",
367
-        f"**Adapter:** `{raw.get('adapter_id', '?')}`  ",
368
-        "",
369
-        "## Probes",
370
-        "",
371
-        "| name | kind | verdict | score |",
372
-        "|---|---|---|---:|",
373
-    ]
374
-    probes: list[dict[str, Any]] = raw.get("probes", [])
375
-    for p in probes:
376
-        lines.append(
377
-            f"| {p['name']} | `{p['kind']}` | {p['verdict']} | {(p.get('score') or 0.0):.2f} |"
378
-        )
379
-    return "\n".join(lines)
380
-
381
-
382
-def _render_junit_from_json(raw: dict[str, Any]) -> str:
383
-    """Minimal JUnit renderer from a saved JSON (useful for report --format junit)."""
384
-    import xml.etree.ElementTree as ET
385
-
386
-    probes: list[dict[str, Any]] = raw.get("probes", [])
387
-    testsuite = ET.Element("testsuite", {"name": "dlm-sway", "tests": str(len(probes))})
388
-    for p in probes:
389
-        tc = ET.SubElement(testsuite, "testcase", {"classname": p["kind"], "name": p["name"]})
390
-        if p["verdict"] == "fail":
391
-            ET.SubElement(tc, "failure", {"message": p.get("message", "")})
392
-        elif p["verdict"] == "error":
393
-            ET.SubElement(tc, "error", {"message": p.get("message", "")})
394
-        elif p["verdict"] == "skip":
395
-            ET.SubElement(tc, "skipped", {"message": p.get("message", "")})
396
-    return ET.tostring(testsuite, encoding="unicode")
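To make the saved-JSON shape concrete, a sketch of feeding a minimal payload through ``_render_markdown_from_json`` (every field value below is invented for illustration):

    raw = {
        "score": {"overall": 0.72, "band": "healthy"},
        "base_model_id": "HuggingFaceTB/SmolLM2-135M-Instruct",
        "adapter_id": "out/adapter",
        "probes": [
            {"name": "delta_kl_doc", "kind": "delta_kl", "verdict": "pass", "score": 0.81},
        ],
    }
    print(_render_markdown_from_json(raw))
    # ...renders the header block plus one table row:
    # | delta_kl_doc | `delta_kl` | pass | 0.81 |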
sway/src/dlm_sway/core/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""Core abstractions: protocols, results, errors, determinism."""
sway/src/dlm_sway/core/determinism.pydeleted
@@ -1,97 +0,0 @@
1
-"""Deterministic-execution helper.
2
-
3
-Mirrors ``dlm.train.determinism.seed_everything`` so running the same
4
-suite twice on the same host produces the same :class:`ProbeResult`
5
-payloads. The dlm project treats determinism as a contract; sway takes
6
-the same posture for scoring operations.
7
-
8
-Generation is allowed to use non-deterministic attention kernels when
9
-``temperature > 0``, because a deterministic sampled generation is a
10
-contradiction. Scoring (logprobs, rolling logprobs, next-token dists)
11
-always runs under ``torch.use_deterministic_algorithms(True)``.
12
-"""
13
-
14
-from __future__ import annotations
15
-
16
-import os
17
-import random
18
-from dataclasses import dataclass
19
-from typing import Literal
20
-
21
-DeterminismClass = Literal["strict", "best_effort", "loose"]
22
-
23
-
24
-@dataclass(frozen=True, slots=True)
25
-class DeterminismSummary:
26
-    """What seeding actually accomplished, for logging in the report."""
27
-
28
-    class_: DeterminismClass
29
-    seed: int
30
-    notes: tuple[str, ...] = ()
31
-
32
-
33
-def seed_everything(seed: int, *, strict: bool = True) -> DeterminismSummary:
34
-    """Seed every RNG sway's probes touch and flip backend flags.
35
-
36
-    Idempotent — safe to call repeatedly with the same seed.
37
-
38
-    Parameters
39
-    ----------
40
-    seed:
41
-        The seed. Callers typically use the value from ``sway.yaml``'s
42
-        ``defaults.seed`` (default 0).
43
-    strict:
44
-        If ``True`` (the default), request deterministic CUDA algorithms
45
-        and set ``CUBLAS_WORKSPACE_CONFIG``. Scoring probes need this;
46
-        generation-only runs can set it ``False``.
47
-
48
-    Returns
49
-    -------
50
-    :class:`DeterminismSummary` with a classification:
51
-
52
-    - ``"strict"`` — deterministic algorithms active, no warnings.
53
-    - ``"best_effort"`` — platform doesn't support full determinism
54
-      (MPS, some CPU kernels).
55
-    - ``"loose"`` — seeded but deterministic algorithms refused.
56
-    """
57
-
58
-    notes: list[str] = []
59
-    clazz: DeterminismClass = "best_effort"
60
-
61
-    # Env vars must come first — torch reads them at cuBLAS init.
62
-    if strict:
63
-        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
64
-
65
-    random.seed(seed)
66
-
67
-    # numpy is a hard dep; safe to seed unconditionally.
68
-    import numpy as np
69
-
70
-    np.random.seed(seed)
71
-
72
-    try:
73
-        import torch  # noqa: PLC0415 — lazy: torch is an optional extra.
74
-    except ModuleNotFoundError:
75
-        notes.append("torch not installed; seeded python + numpy only")
76
-        return DeterminismSummary(class_="best_effort", seed=seed, notes=tuple(notes))
77
-
78
-    torch.manual_seed(seed)
79
-    if torch.cuda.is_available():
80
-        torch.cuda.manual_seed_all(seed)
81
-        clazz = "strict"
82
-    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
83
-        clazz = "best_effort"
84
-        notes.append("MPS: bit-identical across runs is best-effort")
85
-    else:
86
-        clazz = "best_effort"
87
-        notes.append("CPU-only backend: strict determinism depends on BLAS impl")
88
-
89
-    if strict:
90
-        try:
91
-            torch.use_deterministic_algorithms(True, warn_only=True)
92
-            torch.backends.cudnn.benchmark = False
93
-        except Exception as exc:  # noqa: BLE001 — torch raises a naked Exception
94
-            clazz = "loose"
95
-            notes.append(f"deterministic algorithms refused: {exc}")
96
-
97
-    return DeterminismSummary(class_=clazz, seed=seed, notes=tuple(notes))
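A quick sketch of the call-site pattern; the printed classification depends on the host (the comment values follow directly from the code above):

    from dlm_sway.core.determinism import seed_everything

    summary = seed_everything(0, strict=True)
    print(summary.class_)        # "strict" on CUDA, "best_effort" on MPS/CPU-only
    for note in summary.notes:   # e.g. "MPS: bit-identical across runs is best-effort"
        print("note:", note)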
sway/src/dlm_sway/core/errors.pydeleted
@@ -1,65 +0,0 @@
1
-"""Exception hierarchy for dlm-sway.
2
-
3
-Every error sway raises inherits from :class:`SwayError` so callers can
4
-catch the whole family with a single ``except``. Subclasses carry enough
5
-context (spec paths, probe names, missing extras) for the CLI to render
6
-actionable messages without the caller having to introspect an exception
7
-chain.
8
-"""
9
-
10
-from __future__ import annotations
11
-
12
-
13
-class SwayError(Exception):
14
-    """Root of the dlm-sway exception hierarchy."""
15
-
16
-
17
-class SpecValidationError(SwayError):
18
-    """A ``sway.yaml`` (or equivalent) failed pydantic validation.
19
-
20
-    Parameters
21
-    ----------
22
-    message:
23
-        Human-readable summary of what went wrong.
24
-    source:
25
-        Path or identifier of the spec being validated, if known.
26
-    """
27
-
28
-    def __init__(self, message: str, *, source: str | None = None) -> None:
29
-        super().__init__(message)
30
-        self.source = source
31
-
32
-    def __str__(self) -> str:
33
-        base = super().__str__()
34
-        return f"{self.source}: {base}" if self.source else base
35
-
36
-
37
-class BackendNotAvailableError(SwayError):
38
-    """A requested backend's optional dependencies aren't installed.
39
-
40
-    The CLI turns this into a pointed ``pip install dlm-sway[<extra>]``
41
-    hint; programmatic callers can read :attr:`extra` directly.
42
-    """
43
-
44
-    def __init__(self, backend: str, *, extra: str, hint: str | None = None) -> None:
45
-        message = (
46
-            f"backend {backend!r} unavailable — install the extra: pip install 'dlm-sway[{extra}]'"
47
-        )
48
-        if hint:
49
-            message = f"{message}\n{hint}"
50
-        super().__init__(message)
51
-        self.backend = backend
52
-        self.extra = extra
53
-
54
-
55
-class ProbeError(SwayError):
56
-    """A probe failed to *execute* (as opposed to failing its assertion).
57
-
58
-    Distinct from a ``verdict=FAIL`` result — assertion failures are
59
-    normal and reported via :class:`ProbeResult`. This is for genuine
60
-    bugs: missing sections, mismatched tokenizers, NaN logits.
61
-    """
62
-
63
-    def __init__(self, probe: str, message: str) -> None:
64
-        super().__init__(f"probe {probe!r}: {message}")
65
-        self.probe = probe
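A sketch of the single-``except`` pattern the hierarchy is designed for (the raised error is constructed by hand purely for illustration):

    from dlm_sway.core.errors import BackendNotAvailableError, SwayError

    try:
        raise BackendNotAvailableError("mlx", extra="mlx")
    except SwayError as exc:
        # One handler catches the whole family; subclass data stays reachable.
        if isinstance(exc, BackendNotAvailableError):
            print(f"install hint extra: {exc.extra}")
        print(exc)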
sway/src/dlm_sway/core/model.pydeleted
@@ -1,112 +0,0 @@
1
-"""The :class:`Model` abstraction and :class:`ModelSpec` user-facing config.
2
-
3
-Probes operate on objects that satisfy :class:`Model` (for generation)
4
-and :class:`~dlm_sway.core.scoring.ScoringBackend` (for logit-level
5
-access). Backends return concrete instances of both — they are
6
-deliberately separate Protocols because not every backend exposes logits
7
-(e.g. an Ollama HTTP backend would implement ``Model`` but not
8
-``ScoringBackend``).
9
-
10
-The user-facing surface is :class:`ModelSpec`, a pydantic model that
11
-describes how to materialize a base + adapter pair. No ``.dlm``
12
-concepts live at this layer — those belong in
13
-:mod:`dlm_sway.integrations.dlm`.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-from dataclasses import dataclass
19
-from pathlib import Path
20
-from typing import Any, Literal, Protocol, runtime_checkable
21
-
22
-from pydantic import BaseModel, ConfigDict, Field
23
-
24
-BackendKind = Literal["hf", "mlx", "dummy", "custom"]
25
-"""Registered scoring-backend kinds.
26
-
27
-``custom`` is an escape hatch — the runner looks up an entry point when
28
-it sees ``custom`` in a spec.
29
-"""
30
-
31
-
32
-class ModelSpec(BaseModel):
33
-    """How to materialize one model (base or fine-tuned)."""
34
-
35
-    model_config = ConfigDict(extra="forbid", frozen=True)
36
-
37
-    kind: BackendKind = "hf"
38
-    base: str
39
-    """HuggingFace repo id (``HuggingFaceTB/SmolLM2-135M-Instruct``) or
40
-    a local path to a model directory."""
41
-
42
-    adapter: Path | None = None
43
-    """Path to a PEFT adapter directory (containing ``adapter_config.json``
44
-    and ``adapter_model.safetensors``). ``None`` → base-only model."""
45
-
46
-    dtype: Literal["auto", "fp16", "bf16", "fp32"] = "auto"
47
-    device: str = "auto"
48
-    """``"auto"`` chooses CUDA → MPS → CPU in that order."""
49
-
50
-    trust_remote_code: bool = False
51
-    """HuggingFace ``trust_remote_code`` passthrough. Off by default —
52
-    the user must opt in explicitly, matching sway's no-surprises
53
-    posture."""
54
-
55
-    entry_point: str | None = Field(default=None)
56
-    """Required when ``kind='custom'``. Import path like
57
-    ``mypkg.mybackend:MyBackend``."""
58
-
59
-
60
-@dataclass(frozen=True, slots=True)
61
-class LoadedModel:
62
-    """A materialized model plus the tokenizer that produced it.
63
-
64
-    Returned by backend ``load()`` methods. Probes usually don't touch
65
-    this directly — they go through the :class:`Model` /
66
-    :class:`~dlm_sway.core.scoring.ScoringBackend` Protocols.
67
-    """
68
-
69
-    id: str
70
-    """Stable handle: ``"base"`` or ``"ft"`` typically."""
71
-    spec: ModelSpec
72
-    model: Any
73
-    """Framework-native handle (torch ``nn.Module``, MLX array module …).
74
-
75
-    Typed as ``Any`` because the frameworks themselves ship unstubbed.
76
-    Backend implementations narrow this at their boundary."""
77
-    tokenizer: Any
78
-    meta: dict[str, Any]
79
-    """Backend-captured metadata: device, dtype, adapter version, bytes
80
-    on disk, num trainable params. Surfaced in the suite report."""
81
-
82
-
83
-@runtime_checkable
84
-class Model(Protocol):
85
-    """Minimum interface for text generation.
86
-
87
-    Implemented by backend-wrapped model objects. Probes that need logits
88
-    also require :class:`~dlm_sway.core.scoring.ScoringBackend`.
89
-    """
90
-
91
-    id: str
92
-
93
-    def generate(
94
-        self,
95
-        prompt: str,
96
-        *,
97
-        max_new_tokens: int,
98
-        temperature: float = 0.0,
99
-        top_p: float = 1.0,
100
-        seed: int = 0,
101
-    ) -> str:
102
-        """Generate a completion.
103
-
104
-        Defaults (``temperature=0``, ``top_p=1``) are greedy-decode for
105
-        reproducibility. Callers wanting sampled output must pass
106
-        non-defaults *and* a seed.
107
-        """
108
-        ...
109
-
110
-    def close(self) -> None:
111
-        """Release any resources held by this model."""
112
-        ...
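A construction sketch; the repo id and adapter path are illustrative. Because the spec is frozen with ``extra="forbid"``, updates go through pydantic's ``model_copy`` (as ``diff_cmd`` does) rather than attribute assignment:

    from pathlib import Path

    from dlm_sway.core.model import ModelSpec

    ft = ModelSpec(
        base="HuggingFaceTB/SmolLM2-135M-Instruct",  # kind defaults to "hf"
        adapter=Path("out/adapter"),
        dtype="bf16",
    )
    other = ft.model_copy(update={"adapter": Path("out/adapter-b")})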
sway/src/dlm_sway/core/result.pydeleted
@@ -1,139 +0,0 @@
1
-"""Probe and suite result types.
2
-
3
-Every numeric probe ultimately returns a :class:`ProbeResult`. The suite
4
-runner collects them into a :class:`SuiteResult` and the scorer folds
5
-that into a single :class:`SwayScore` with transparent per-component
6
-weights.
7
-
8
-These dataclasses are deliberately plain — no pydantic — because they
9
-cross probe/backend boundaries hundreds of times per run and a free
10
-``model_validate`` on every construction would dominate the runtime of
11
-cheap probes.
12
-"""
13
-
14
-from __future__ import annotations
15
-
16
-from dataclasses import dataclass, field
17
-from datetime import UTC, datetime
18
-from enum import StrEnum
19
-from typing import Any
20
-
21
-
22
-class Verdict(StrEnum):
23
-    """Outcome of a single probe against its assertion."""
24
-
25
-    PASS = "pass"
26
-    FAIL = "fail"
27
-    WARN = "warn"
28
-    SKIP = "skip"
29
-    ERROR = "error"
30
-
31
-
32
-@dataclass(frozen=True, slots=True)
33
-class ProbeResult:
34
-    """The result of running one probe.
35
-
36
-    Attributes
37
-    ----------
38
-    name:
39
-        User-facing name from the spec (unique within a suite).
40
-    kind:
41
-        Probe discriminator (``delta_kl``, ``section_internalization`` …).
42
-    verdict:
43
-        Pass / fail / warn / skip / error.
44
-    score:
45
-        Normalized [0, 1] score. ``sigmoid(z_vs_null / 3)`` for numeric
46
-        probes; 1.0 / 0.0 for binary ones. ``None`` for :attr:`Verdict.SKIP`.
47
-    raw:
48
-        The raw metric value (e.g. KL=0.083). Probe-specific units.
49
-    z_score:
50
-        Standard deviations above the null-adapter baseline. ``None``
51
-        when no null calibration was run.
52
-    base_value:
53
-        The metric evaluated on the base model, when meaningful.
54
-    ft_value:
55
-        The metric evaluated on the fine-tuned model, when meaningful.
56
-    evidence:
57
-        Small structured payload for the report — prompts, example
58
-        completions, per-section breakdowns. Kept bounded (<10 KB) so
59
-        suite JSON stays under a megabyte.
60
-    message:
61
-        One-line diagnostic. Surfaces in the terminal report.
62
-    duration_s:
63
-        Wall time to execute.
64
-    """
65
-
66
-    name: str
67
-    kind: str
68
-    verdict: Verdict
69
-    score: float | None
70
-    raw: float | None = None
71
-    z_score: float | None = None
72
-    base_value: float | None = None
73
-    ft_value: float | None = None
74
-    evidence: dict[str, Any] = field(default_factory=dict)
75
-    message: str = ""
76
-    duration_s: float = 0.0
77
-
78
-
79
-@dataclass(frozen=True, slots=True)
80
-class SuiteResult:
81
-    """A full run of a sway.yaml suite."""
82
-
83
-    spec_path: str
84
-    started_at: datetime
85
-    finished_at: datetime
86
-    base_model_id: str
87
-    adapter_id: str
88
-    sway_version: str
89
-    probes: tuple[ProbeResult, ...] = ()
90
-    null_stats: dict[str, dict[str, float]] = field(default_factory=dict)
91
-    """Per-primitive null-adapter baseline stats (mean, std, runs). Used
92
-    to turn raw metrics into z-scores when rendering the report."""
93
-
94
-    @property
95
-    def wall_seconds(self) -> float:
96
-        return (self.finished_at - self.started_at).total_seconds()
97
-
98
-
99
-# Component weights for the composite score. Overridable in sway.yaml.
100
-DEFAULT_COMPONENT_WEIGHTS: dict[str, float] = {
101
-    "adherence": 0.30,
102
-    "attribution": 0.35,
103
-    "calibration": 0.20,
104
-    "ablation": 0.15,
105
-}
106
-
107
-
108
-@dataclass(frozen=True, slots=True)
109
-class SwayScore:
110
-    """Composite score with a transparent per-component breakdown."""
111
-
112
-    overall: float
113
-    components: dict[str, float]
114
-    weights: dict[str, float] = field(default_factory=lambda: dict(DEFAULT_COMPONENT_WEIGHTS))
115
-    band: str = ""
116
-    findings: tuple[str, ...] = ()
117
-
118
-    @staticmethod
119
-    def band_for(overall: float) -> str:
120
-        """Map a score to a human-readable band.
121
-
122
-        Bands (from the plan):
123
-          - <0.3  : indistinguishable from noise
124
-          - 0.3–0.6 : partial fit
125
-          - 0.6–0.85: healthy
126
-          - >0.85 : suspiciously good (possible overfit / memorization)
127
-        """
128
-        if overall < 0.3:
129
-            return "noise"
130
-        if overall < 0.6:
131
-            return "partial"
132
-        if overall <= 0.85:
133
-            return "healthy"
134
-        return "suspicious"
135
-
136
-
137
-def utcnow() -> datetime:
138
-    """Timezone-aware UTC timestamp (used by the runner)."""
139
-    return datetime.now(UTC)
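For a concrete feel for the weights, a sketch that folds hand-picked component scores with ``DEFAULT_COMPONENT_WEIGHTS``. The real fold lives in ``dlm_sway.suite.score`` and may differ; the plain weighted sum here is an assumption:

    from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS, SwayScore

    components = {"adherence": 0.8, "attribution": 0.7, "calibration": 0.6, "ablation": 0.5}
    overall = sum(DEFAULT_COMPONENT_WEIGHTS[k] * v for k, v in components.items())
    print(f"{overall:.2f}", SwayScore.band_for(overall))  # 0.68 healthy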
sway/src/dlm_sway/core/scoring.pydeleted
@@ -1,203 +0,0 @@
1
-"""Scoring protocols: logprobs, next-token distributions, differential toggling.
2
-
3
-Scoring is **separate** from generation because not every backend can
4
-provide logits. Every numeric sway probe depends on at least one of
5
-three operations:
6
-
7
-1. ``logprob_of(prompt, completion)`` — score a completion against a
8
-   prompt (A1, B2, B3, C2, …).
9
-2. ``rolling_logprob(text)`` — perplexity over a piece of text (B1,
10
-   C2).
11
-3. ``next_token_dist(prompt, top_k)`` — the raw next-token distribution
12
-   at a single position (A1, N2).
13
-
14
-The :class:`DifferentialBackend` is the key performance primitive:
15
-both base and fine-tuned views share the same loaded weights and KV
16
-cache layout, toggled via PEFT's :meth:`set_adapter` /
17
-:meth:`disable_adapter`. A naive "load twice" implementation would
18
-double memory and halve throughput.
19
-"""
20
-
21
-from __future__ import annotations
22
-
23
-from contextlib import AbstractContextManager
24
-from dataclasses import dataclass, field
25
-from typing import Protocol, runtime_checkable
26
-
27
-import numpy as np
28
-from numpy.typing import NDArray
29
-
30
-from dlm_sway.core.model import Model
31
-
32
-
33
-@dataclass(frozen=True, slots=True)
34
-class RollingLogprob:
35
-    """Per-token logprobs over a piece of text, plus summary stats.
36
-
37
-    Attributes
38
-    ----------
39
-    token_ids:
40
-        The tokenizer output for ``text``. Length ``N``.
41
-    logprobs:
42
-        ``log p(token_i | token_<i)`` for each position i ≥ 1. Length
43
-        ``N-1``.
44
-    num_tokens:
45
-        ``N`` — included for convenience; ``len(token_ids)``.
46
-    total_logprob:
47
-        Sum of :attr:`logprobs`.
48
-    """
49
-
50
-    token_ids: NDArray[np.int64]
51
-    logprobs: NDArray[np.float32]
52
-    num_tokens: int
53
-    total_logprob: float
54
-
55
-    @property
56
-    def mean_logprob(self) -> float:
57
-        n = self.logprobs.size
58
-        return float(self.total_logprob / n) if n else 0.0
59
-
60
-    @property
61
-    def perplexity(self) -> float:
62
-        """``exp(-mean_logprob)``. Base-e, natural perplexity."""
63
-        return float(np.exp(-self.mean_logprob))
64
-
65
-
66
-@dataclass(frozen=True, slots=True)
67
-class TokenDist:
68
-    """A (possibly top-k truncated) next-token probability distribution.
69
-
70
-    For KL / JS divergence probes, sway needs matched distributions
71
-    across base and fine-tuned views. The runner is responsible for
72
-    aligning ``top_k`` token slices between two ``TokenDist`` objects
73
-    before handing them to divergence math.
74
-    """
75
-
76
-    token_ids: NDArray[np.int64]
77
-    """Token ids, descending by probability. Length ``k``."""
78
-    logprobs: NDArray[np.float32]
79
-    """Log-probabilities for :attr:`token_ids`. Length ``k``."""
80
-    vocab_size: int
81
-    """Full vocab size — needed to renormalize top-k truncated slices."""
82
-    tail_logprob: float = field(default=0.0)
83
-    """log of (1 - sum of exp(logprobs[:k])); 0 if top_k covers the full vocab."""
84
-
85
-
86
-@runtime_checkable
87
-class ScoringBackend(Protocol):
88
-    """Logit-level access to a loaded model."""
89
-
90
-    def logprob_of(self, prompt: str, completion: str) -> float:
91
-        """Sum of log-probabilities of ``completion`` tokens given ``prompt``.
92
-
93
-        The prompt is *not* scored; only the completion contributes. The
94
-        value is in nats (natural log). Longer completions are
95
-        monotonically more negative — callers normalize by length if
96
-        they need a rate.
97
-        """
98
-        ...
99
-
100
-    def rolling_logprob(self, text: str) -> RollingLogprob:
101
-        """Compute per-token logprobs for the whole of ``text``.
102
-
103
-        Equivalent to lm-eval's ``loglikelihood_rolling``. Used for
104
-        perplexity comparison on held-out content (B1 SIS, C2).
105
-        """
106
-        ...
107
-
108
-    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
109
-        """Next-token distribution at the position after ``prompt``.
110
-
111
-        Truncated to ``top_k`` for memory; callers doing divergence math
112
-        over the top-k slice accept the (typically negligible) error vs
113
-        full-vocab KL.
114
-        """
115
-        ...
116
-
117
-
118
-@runtime_checkable
119
-class DifferentialBackend(Protocol):
120
-    """A backend that holds base + fine-tuned views on a single loaded model.
121
-
122
-    The idiomatic usage is::
123
-
124
-        with backend.as_base() as base_view:
125
-            p_base = base_view.next_token_dist(prompt)
126
-        with backend.as_finetuned() as ft_view:
127
-            p_ft = ft_view.next_token_dist(prompt)
128
-
129
-    Implementations toggle PEFT adapters via
130
-    :meth:`peft.PeftModel.set_adapter` / :meth:`disable_adapter`.
131
-
132
-    Invariant: the two views must **not** be simultaneously usable. A
133
-    caller holding a ``base_view`` after entering the ``as_finetuned``
134
-    context is a programmer error and implementations MUST detect and
135
-    raise.
136
-    """
137
-
138
-    def as_base(self) -> AbstractContextManager[_ScoringModel]: ...
139
-
140
-    def as_finetuned(self) -> AbstractContextManager[_ScoringModel]: ...
141
-
142
-
143
-@runtime_checkable
144
-class ScalableDifferentialBackend(DifferentialBackend, Protocol):
145
-    """A differential backend that can also scale the LoRA additive term.
146
-
147
-    LoRA applies ``W + (alpha/r) · B @ A`` to a base weight matrix. This
148
-    protocol exposes a context manager that temporarily multiplies that
149
-    additive term by ``lam`` for everything inside the ``with`` block.
150
-
151
-    ``lam = 0.0`` is equivalent to :meth:`as_base`.
152
-    ``lam = 1.0`` is equivalent to :meth:`as_finetuned`.
153
-    ``lam = 1.25`` overshoots — useful for N2 AdapterAblation's
154
-    response-curve measurement.
155
-
156
-    Only the HF backend ships an implementation in v0.1. Probes that
157
-    need scaling check via ``isinstance(backend, ScalableDifferentialBackend)``
158
-    at runtime and SKIP gracefully when unavailable.
159
-    """
160
-
161
-    def as_scaled_adapter(self, lam: float) -> AbstractContextManager[_ScoringModel]: ...
162
-
163
-
164
-@runtime_checkable
165
-class NullCalibratedBackend(DifferentialBackend, Protocol):
166
-    """A differential backend that can produce a "null adapter" view.
167
-
168
-    A null adapter has the *same structure* (rank, alpha, target modules)
169
-    as the real adapter but with weights drawn from a zero-mean Gaussian.
170
-    Running probes against this view yields the baseline "how much
171
-    signal does random noise produce" distribution — the denominator in
172
-    every numeric probe's z-score.
173
-
174
-    The context manager takes a ``seed`` so calibration runs can be
175
-    reproduced and multiple independent null samples can be drawn to
176
-    estimate ``std``.
177
-
178
-    Implementations MUST restore the real adapter on exit, including
179
-    on exceptions, so a caller can freely interleave null and real
180
-    calibrations within the same backend lifetime.
181
-    """
182
-
183
-    def as_null_adapter(
184
-        self, seed: int, *, init_scale: float = 0.02
185
-    ) -> AbstractContextManager[_ScoringModel]: ...
186
-
187
-
188
-# Helper Protocol for type-checking the yielded context object: it
189
-# must satisfy both Model and ScoringBackend. mypy doesn't support
190
-# intersection types, so we spell it out explicitly.
191
-@runtime_checkable
192
-class _ScoringModel(Model, ScoringBackend, Protocol):
193
-    """A Model that also exposes ScoringBackend."""
194
-
195
-    ...
196
-
197
-
198
-ScoringModel = _ScoringModel
199
-"""Public alias for the intersection ``Model & ScoringBackend``.
200
-
201
-Exported for backend and probe implementations that need to annotate
202
-variables of this combined type.
203
-"""
sway/src/dlm_sway/core/sections.pydeleted
@@ -1,76 +0,0 @@
1
-"""Minimal section contract for attribution probes.
2
-
3
-The flagship B1 ``section_internalization`` probe needs *structured*
4
-input — a section has an id, a kind, content text, and possibly some
5
-Q/A pairs or chosen/rejected triples. sway defines this shape here so
6
-the probes stay oblivious to the upstream (``.dlm`` parser, custom
7
-loaders, synthetic test fixtures).
8
-
9
-Field names are aligned with :mod:`dlm.doc.sections` but this module
10
-does not import ``dlm`` — the bridge at
11
-:mod:`dlm_sway.integrations.dlm` does the adaptation.
12
-"""
13
-
14
-from __future__ import annotations
15
-
16
-from dataclasses import dataclass, field
17
-from typing import Literal
18
-
19
-SectionKind = Literal["prose", "instruction", "preference"]
20
-
21
-
22
-@dataclass(frozen=True, slots=True)
23
-class SectionProbe:
24
-    """A ``(prompt, gold)`` pair lifted from an INSTRUCTION section."""
25
-
26
-    prompt: str
27
-    gold: str
28
-
29
-
30
-@dataclass(frozen=True, slots=True)
31
-class SectionPreference:
32
-    """A ``(prompt, chosen, rejected)`` triple from a PREFERENCE section."""
33
-
34
-    prompt: str
35
-    chosen: str
36
-    rejected: str
37
-
38
-
39
-@dataclass(frozen=True, slots=True)
40
-class Section:
41
-    """One typed chunk of a training document.
42
-
43
-    Attributes
44
-    ----------
45
-    id:
46
-        Content-addressed identifier. ``.dlm`` uses a 16-hex-char
47
-        sha256 prefix; sway doesn't enforce a format.
48
-    kind:
49
-        Discriminator for which of :attr:`probes` /
50
-        :attr:`preferences` / :attr:`content` is the primary signal.
51
-    content:
52
-        Raw section text. Always populated; used by the rolling-PPL
53
-        path for PROSE sections.
54
-    probes:
55
-        For INSTRUCTION: parsed Q/A pairs. Empty tuple for others.
56
-    preferences:
57
-        For PREFERENCE: parsed chosen/rejected triples. Empty otherwise.
58
-    tag:
59
-        Optional free-form label for the section (e.g., "intro",
60
-        "api-reference"). Surfaces in per-section reports.
61
-    """
62
-
63
-    id: str
64
-    kind: SectionKind
65
-    content: str
66
-    probes: tuple[SectionProbe, ...] = field(default_factory=tuple)
67
-    preferences: tuple[SectionPreference, ...] = field(default_factory=tuple)
68
-    tag: str | None = None
69
-
70
-
71
-def filter_kinds(
72
-    sections: tuple[Section, ...], kinds: tuple[SectionKind, ...]
73
-) -> tuple[Section, ...]:
74
-    """Return only sections whose ``kind`` matches one of ``kinds``."""
75
-    allow = set(kinds)
76
-    return tuple(s for s in sections if s.kind in allow)
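A synthetic-fixture sketch showing the shape probes consume (id and text invented):

    from dlm_sway.core.sections import Section, SectionProbe, filter_kinds

    sec = Section(
        id="a1b2c3d4e5f60718",  # any string; .dlm happens to use sha256 prefixes
        kind="instruction",
        content="Q: What is sway?\nA: A differential test harness.",
        probes=(SectionProbe(prompt="What is sway?", gold="A differential test harness."),),
    )
    assert filter_kinds((sec,), ("instruction",)) == (sec,)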
sway/src/dlm_sway/integrations/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""Optional integrations with upstream fine-tuning tools."""
sway/src/dlm_sway/integrations/dlm/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""DLM project integration. Imports the ``dlm`` package; requires ``dlm-sway[dlm]``."""
sway/src/dlm_sway/integrations/dlm/autogen.pydeleted
@@ -1,191 +0,0 @@
1
-"""Auto-generate a ``sway.yaml`` from a ``.dlm`` document.
2
-
3
-Walks the parsed sections and emits one entry per primitive sway ships:
4
-the full 11-primitive battery wired up against the document's own
5
-content. The result is a YAML artifact the user commits alongside their
6
-``.dlm`` and diffs in PRs.
7
-
8
-The generated spec includes a ``dlm_source`` field that the suite loader
9
-uses to pick up :class:`~dlm_sway.core.sections.Section` data at run
10
-time — probes that need sections (B1, B3, C3) then work against the
11
-typed structure instead of re-parsing text.
12
-"""
13
-
14
-from __future__ import annotations
15
-
16
-from pathlib import Path
17
-from typing import Any
18
-
19
-import yaml
20
-
21
-from dlm_sway.core.errors import SwayError
22
-from dlm_sway.core.sections import Section
23
-from dlm_sway.integrations.dlm.resolver import DlmHandle, resolve_dlm
24
-
25
-
26
-def write_sway_yaml(dlm_path: Path, out: Path) -> None:
27
-    """Resolve the .dlm, build a spec dict, write it as YAML to ``out``."""
28
-    handle = resolve_dlm(dlm_path)
29
-    if handle.adapter_path is None:
30
-        raise SwayError(
31
-            f"{dlm_path}: no trained adapter found at ~/.dlm/store/{handle.dlm_id}/adapter; "
32
-            "train the document with `dlm train` before generating a sway suite."
33
-        )
34
-    spec = build_spec_dict(handle, dlm_source=str(dlm_path.resolve()))
35
-    out.write_text(yaml.safe_dump(spec, sort_keys=False), encoding="utf-8")
36
-
37
-
38
-def build_spec_dict(handle: DlmHandle, *, dlm_source: str | None = None) -> dict[str, Any]:
39
-    """Build a sway.yaml-shaped dict from a :class:`DlmHandle`."""
40
-    base_spec = {"kind": "hf", "base": handle.base_model}
41
-    ft_spec = {
42
-        "kind": "hf",
43
-        "base": handle.base_model,
44
-        "adapter": str(handle.adapter_path) if handle.adapter_path else None,
45
-    }
46
-    spec: dict[str, Any] = {
47
-        "version": 1,
48
-        "models": {"base": base_spec, "ft": ft_spec},
49
-        "defaults": {"seed": 0, "differential": True},
50
-        "suite": _build_suite(handle.sections),
51
-    }
52
-    if dlm_source is not None:
53
-        spec["dlm_source"] = dlm_source
54
-    return spec
55
-
56
-
57
-def _build_suite(sections: tuple[Section, ...]) -> list[dict[str, Any]]:
58
-    """Assemble the full probe battery for the given sections.
59
-
60
-    The ordering matters: ``null_adapter`` first so every downstream
61
-    probe's z-score threshold has stats to consult.
62
-    """
63
-    instruction_probes: list[tuple[str, str]] = [
64
-        (p.prompt, p.gold) for s in sections if s.kind == "instruction" for p in s.probes
65
-    ]
66
-    prose_prompts: list[str] = []
67
-    for s in sections:
68
-        if s.kind == "prose" and s.content.strip():
69
-            # Use the section's leading sentence as a natural completion prompt.
70
-            first_sentence = s.content.split(".")[0].strip()
71
-            if first_sentence:
72
-                prose_prompts.append(first_sentence + ".")
73
-
74
-    kl_prompts = [q for q, _ in instruction_probes][:16] or prose_prompts[:16]
75
-    style_prompts = prose_prompts[:8] or [q for q, _ in instruction_probes][:8]
76
-
77
-    suite: list[dict[str, Any]] = []
78
-
79
-    # Baseline calibration — always first.
80
-    suite.append({"name": "null_baseline", "kind": "null_adapter", "runs": 3})
81
-
82
-    # Adherence.
83
-    if kl_prompts:
84
-        suite.append(
85
-            {
86
-                "name": "delta_kl_doc",
87
-                "kind": "delta_kl",
88
-                "prompts": kl_prompts,
89
-                "assert_mean_gte": 0.02,
90
-            }
91
-        )
92
-    if instruction_probes:
93
-        suite.append(
94
-            {
95
-                "name": "revert_check",
96
-                "kind": "adapter_revert",
97
-                "cases": [
98
-                    {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)}
99
-                    for q, a in instruction_probes[:8]
100
-                ],
101
-                "assert_revert_rate_lt": 0.3,
102
-            }
103
-        )
104
-    if kl_prompts:
105
-        suite.append(
106
-            {
107
-                "name": "prompt_collapse",
108
-                "kind": "prompt_collapse",
109
-                "prompts": kl_prompts[:4],
110
-                "context_lengths": [0, 256, 512, 1024],
111
-                "assert_half_life_tokens": 300,
112
-            }
113
-        )
114
-
115
-    # Attribution.
116
-    if len(sections) >= 2:
117
-        suite.append(
118
-            {
119
-                "name": "section_attribution",
120
-                "kind": "section_internalization",
121
-                "per_section_threshold": 0.05,
122
-            }
123
-        )
124
-    if instruction_probes:
125
-        suite.append(
126
-            {
127
-                "name": "paraphrase_invariance",
128
-                "kind": "paraphrase_invariance",
129
-                "cases": [
130
-                    {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)}
131
-                    for q, a in instruction_probes[:6]
132
-                ],
133
-            }
134
-        )
135
-    has_preferences = any(s.kind == "preference" and s.preferences for s in sections)
136
-    if has_preferences:
137
-        suite.append(
138
-            {
139
-                "name": "preference_flip",
140
-                "kind": "preference_flip",
141
-                "assert_flip_rate_gte": 0.7,
142
-            }
143
-        )
144
-
145
-    # Calibration.
146
-    if style_prompts:
147
-        suite.append(
148
-            {
149
-                "name": "style_shift",
150
-                "kind": "style_fingerprint",
151
-                "prompts": style_prompts,
152
-            }
153
-        )
154
-    suite.append({"name": "general_knowledge", "kind": "calibration_drift"})
155
-    if any(s.kind == "prose" for s in sections):
156
-        suite.append(
157
-            {
158
-                "name": "verbatim_leak",
159
-                "kind": "leakage",
160
-                "prefix_chars": 128,
161
-                "continuation_chars": 256,
162
-            }
163
-        )
164
-
165
-    # Signature ablation — goes last because it's the most expensive.
166
-    if kl_prompts:
167
-        suite.append(
168
-            {
169
-                "name": "adapter_ablation",
170
-                "kind": "adapter_ablation",
171
-                "prompts": kl_prompts[:6],
172
-                "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
173
-            }
174
-        )
175
-
176
-    return suite
177
-
178
-
179
-def _auto_paraphrases(prompt: str) -> list[str]:
180
-    """Small, deterministic paraphrase set used when authors don't supply one.
181
-
182
-    Purely heuristic — good enough to detect "did the model memorize the
183
-    exact wording". Real paraphrase generation lives behind the
184
-    ``semsim`` extra.
185
-    """
186
-    variants: list[str] = []
187
-    stripped = prompt.rstrip("?. ")
188
-    variants.append(f"Could you explain: {stripped}?")
189
-    variants.append(f"I'd like to know — {stripped}.")
190
-    variants.append(f"Please describe: {stripped}.")
191
-    return variants[:3]
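Tracing ``_auto_paraphrases`` by hand makes the heuristic concrete:

    >>> _auto_paraphrases("What does the adapter change?")
    ['Could you explain: What does the adapter change?',
     "I'd like to know — What does the adapter change.",
     'Please describe: What does the adapter change.']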
sway/src/dlm_sway/integrations/dlm/resolver.pydeleted
@@ -1,243 +0,0 @@
1
-"""Resolve a ``.dlm`` file to the artifacts sway needs.
2
-
3
-Imports ``dlm.*`` — requires the ``dlm-sway[dlm]`` extra. Everything
4
-outside this package is oblivious to dlm's internal shape; the bridge
5
-is the only place that knows, e.g., that a dlm section carries a
6
-``kind`` field named ``type`` or that adapters live at
7
-``adapter/versions/vNNNN/``.
8
-"""
9
-
10
-from __future__ import annotations
11
-
12
-import hashlib
13
-from dataclasses import dataclass
14
-from pathlib import Path
15
-
16
-from dlm_sway.core.errors import SwayError
17
-from dlm_sway.core.sections import (
18
-    Section,
19
-    SectionKind,
20
-    SectionPreference,
21
-    SectionProbe,
22
-)
23
-
24
-
25
-@dataclass(frozen=True, slots=True)
26
-class DlmHandle:
27
-    """Everything the sway bridge pulls out of a ``.dlm`` file.
28
-
29
-    Attributes
30
-    ----------
31
-    dlm_id:
32
-        Stable identifier from the frontmatter.
33
-    base_model:
34
-        Either a HF id (``qwen2.5-1.5b``) or an ``hf:org/name`` escape
35
-        hatch, taken verbatim from the frontmatter.
36
-    adapter_path:
37
-        Directory containing the current trained PEFT adapter (resolved
38
-        via dlm's own ``StorePath.for_dlm``). ``None`` if the document
39
-        hasn't been trained yet.
40
-    sections:
41
-        Typed sections ready for sway's probes.
42
-    doc_text:
43
-        Concatenated raw content of all sections. Used by probes that
44
-        need a whole-document stylistic reference (C1).
45
-    """
46
-
47
-    dlm_id: str
48
-    base_model: str
49
-    adapter_path: Path | None
50
-    sections: tuple[Section, ...]
51
-    doc_text: str
52
-
53
-
54
-def resolve_dlm(dlm_path: Path) -> DlmHandle:
55
-    """Parse ``dlm_path`` and return a :class:`DlmHandle`.
56
-
57
-    Raises :class:`~dlm_sway.core.errors.SwayError` with a clear message
58
-    when the file is malformed or when the resolved adapter path doesn't
59
-    exist on disk.
60
-    """
61
-    try:
62
-        from dlm.doc.parser import parse_file as dlm_parse_file
63
-    except ImportError as exc:
64
-        raise SwayError("dlm package not installed — run: pip install 'dlm-sway[dlm]'") from exc
65
-
66
-    parsed = dlm_parse_file(dlm_path)
67
-    fm = parsed.frontmatter
68
-    sections = tuple(_translate_section(s) for s in parsed.sections)
69
-    doc_text = "\n\n".join(s.content for s in sections)
70
-
71
-    adapter_path = _resolve_adapter_path(fm.dlm_id)
72
-    base_hf_id = _resolve_base_model_to_hf_id(fm.base_model)
73
-
74
-    return DlmHandle(
75
-        dlm_id=fm.dlm_id,
76
-        base_model=base_hf_id,
77
-        adapter_path=adapter_path,
78
-        sections=sections,
79
-        doc_text=doc_text,
80
-    )
81
-
82
-
83
-def _resolve_base_model_to_hf_id(base_model: str) -> str:
84
-    """Translate dlm's base-model *key* to a HuggingFace repo id.
85
-
86
-    dlm's frontmatter stores registry keys like ``smollm2-135m`` which
87
-    resolve to ``HuggingFaceTB/SmolLM2-135M-Instruct``. sway's backends
88
-    call ``AutoModelForCausalLM.from_pretrained`` directly and need the
89
-    HF id. The ``hf:org/name`` escape hatch passes through unchanged.
90
-    """
91
-    if base_model.startswith("hf:"):
92
-        return base_model[len("hf:") :]
93
-    try:
94
-        from dlm.base_models import resolve as resolve_base
95
-    except ImportError:
96
-        return base_model
97
-    try:
98
-        spec = resolve_base(base_model)
99
-    except Exception:  # noqa: BLE001 — unknown dlm errors
100
-        return base_model
101
-    hf_id = getattr(spec, "hf_id", None)
102
-    return str(hf_id) if hf_id else base_model
103
-
104
-
105
-def _resolve_adapter_path(dlm_id: str) -> Path | None:
106
-    """Locate the current adapter directory for ``dlm_id``.
107
-
108
-    Uses dlm's module-level ``for_dlm`` helper if available, else falls
109
-    back to the canonical ``~/.dlm/store/<dlm_id>/adapter/current.txt``
110
-    pointer. Returns ``None`` if no adapter has been trained yet.
111
-    """
112
-    # Primary path: use dlm's own store-path helpers.
113
-    try:
114
-        from dlm.store.paths import for_dlm as _for_dlm
115
-    except ImportError:
116
-        _for_dlm = None
117
-
118
-    if _for_dlm is not None:
119
-        try:
120
-            store = _for_dlm(dlm_id)
121
-        except Exception:  # noqa: BLE001 — unknown dlm exception shapes
122
-            store = None
123
-        if store is not None:
124
-            try:
125
-                resolved = store.resolve_current_adapter()
126
-            except (AttributeError, FileNotFoundError):
127
-                resolved = None
128
-            if resolved is not None and Path(resolved).exists():
129
-                return Path(resolved)
130
-
131
-    # Manual fallback. The ``current.txt`` pointer is relative to the
132
-    # **store root**, not to current.txt's parent dir — so go up one level.
133
-    import os
134
-
135
-    home = Path(os.environ.get("DLM_HOME", "~/.dlm")).expanduser()
136
-    store_root = home / "store" / dlm_id
137
-    current_file = store_root / "adapter" / "current.txt"
138
-    if current_file.exists():
139
-        pointer = current_file.read_text(encoding="utf-8").strip()
140
-        candidate = (store_root / pointer).resolve()
141
-        if candidate.exists():
142
-            return candidate
143
-    return None
144
-
145
-
146
-def _translate_section(dlm_section: object) -> Section:
147
-    """Adapt a ``dlm.doc.sections.Section`` to sway's section type.
148
-
149
-    dlm's Section dataclass uses the attribute name ``type`` (not
150
-    ``kind``) and stores instruction/preference content as raw markdown
151
-    — dlm ships dedicated parsers (``parse_instruction_body``,
152
-    ``parse_preference_body``) that we reuse here so any future dlm
153
-    syntax additions land in sway for free.
154
-    """
155
-    # dlm's current attribute is ``type``; older revisions used ``kind``.
156
-    kind_raw = getattr(dlm_section, "type", getattr(dlm_section, "kind", None))
157
-    kind = _normalize_kind(kind_raw)
158
-    content = str(getattr(dlm_section, "content", ""))
159
-    section_id = str(
160
-        getattr(dlm_section, "section_id", None)
161
-        or getattr(dlm_section, "id", None)
162
-        or _content_hash(content)
163
-    )
164
-    tag = getattr(dlm_section, "tag", None)
165
-
166
-    probes: tuple[SectionProbe, ...] = ()
167
-    preferences: tuple[SectionPreference, ...] = ()
168
-    if kind == "instruction":
169
-        probes = tuple(_parse_instruction(content, section_id=section_id))
170
-    elif kind == "preference":
171
-        preferences = tuple(_parse_preference(content, section_id=section_id))
172
-
173
-    return Section(
174
-        id=section_id,
175
-        kind=kind,
176
-        content=content,
177
-        probes=probes,
178
-        preferences=preferences,
179
-        tag=tag if isinstance(tag, str) else None,
180
-    )
181
-
182
-
183
-def _normalize_kind(raw: object) -> SectionKind:
184
-    """Map dlm's SectionType/str to sway's lowercase kind."""
185
-    if raw is None:
186
-        return "prose"
187
-    value = str(raw).lower()
188
-    # dlm uses uppercase StrEnum values like "PROSE"; normalize.
189
-    if value.endswith("prose") or "prose" in value:
190
-        return "prose"
191
-    if "instruction" in value:
192
-        return "instruction"
193
-    if "preference" in value:
194
-        return "preference"
195
-    return "prose"
196
-
197
-
198
-def _parse_instruction(content: str, *, section_id: str) -> list[SectionProbe]:
199
-    """Pull (Q, A) pairs out of a dlm INSTRUCTION section body.
200
-
201
-    Delegates to dlm's own ``parse_instruction_body`` so syntax additions
202
-    land in sway without code changes here. Falls back to an empty list
203
-    on parse errors — the probe will fail gracefully.
204
-    """
205
-    try:
206
-        from dlm.data.instruction_parser import parse_instruction_body
207
-    except ImportError:
208
-        return []
209
-    try:
210
-        pairs = parse_instruction_body(content, section_id=section_id)
211
-    except Exception:  # noqa: BLE001 — dlm raises InstructionParseError
212
-        return []
213
-    out: list[SectionProbe] = []
214
-    for p in pairs:
215
-        q = getattr(p, "question", getattr(p, "prompt", ""))
216
-        a = getattr(p, "answer", getattr(p, "gold", ""))
217
-        if q and a:
218
-            out.append(SectionProbe(prompt=str(q), gold=str(a)))
219
-    return out
220
-
221
-
222
-def _parse_preference(content: str, *, section_id: str) -> list[SectionPreference]:
223
-    """Pull (prompt, chosen, rejected) triples out of a PREFERENCE body."""
224
-    try:
225
-        from dlm.data.preference_parser import parse_preference_body
226
-    except ImportError:
227
-        return []
228
-    try:
229
-        triples = parse_preference_body(content, section_id=section_id)
230
-    except Exception:  # noqa: BLE001 — dlm raises PreferenceParseError
231
-        return []
232
-    out: list[SectionPreference] = []
233
-    for t in triples:
234
-        p = str(getattr(t, "prompt", ""))
235
-        c = str(getattr(t, "chosen", ""))
236
-        rej = str(getattr(t, "rejected", ""))
237
-        if p and c and rej:
238
-            out.append(SectionPreference(prompt=p, chosen=c, rejected=rej))
239
-    return out
240
-
241
-
242
-def _content_hash(content: str) -> str:
243
-    return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
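
The manual fallback above encodes a store-layout contract that is easy to get wrong. A standalone sketch of the resolution, assuming the canonical ``adapter/versions/vNNNN/`` layout from the module docstring (all paths here are synthetic):

import tempfile
from pathlib import Path

store_root = Path(tempfile.mkdtemp()) / "store" / "my-doc"
adapter_dir = store_root / "adapter" / "versions" / "v0003"
adapter_dir.mkdir(parents=True)
# The current.txt pointer is relative to the store root, not to adapter/.
current_file = store_root / "adapter" / "current.txt"
current_file.write_text("adapter/versions/v0003\n", encoding="utf-8")

pointer = current_file.read_text(encoding="utf-8").strip()
candidate = (store_root / pointer).resolve()
assert candidate == adapter_dir.resolve()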
sway/src/dlm_sway/probes/__init__.py deleted
@@ -1,27 +0,0 @@
1
-"""Probe primitives. Each module in this package implements one primitive.
2
-
3
-Importing this package eagerly imports every probe module so their
4
-``__init_subclass__`` hooks populate the registry. If you're hitting
5
-"unknown probe kind" from :func:`dlm_sway.probes.base.build_probe`, the
6
-fix is to ``import dlm_sway.probes`` before building the probe — which
7
-this ``__init__`` does for you.
8
-"""
9
-
10
-from __future__ import annotations
11
-
12
-# Register every shipped probe with the central registry by importing
13
-# its module. Order is not load-bearing for registration but matches the
14
-# categorical grouping in :mod:`dlm_sway.core.result`.
15
-from dlm_sway.probes import (  # noqa: F401 — imports register the probes
16
-    adapter_ablation,
17
-    adapter_revert,
18
-    calibration_drift,
19
-    delta_kl,
20
-    leakage,
21
-    null_adapter,
22
-    paraphrase_invariance,
23
-    preference_flip,
24
-    prompt_collapse,
25
-    section_internalization,
26
-    style_fingerprint,
27
-)
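
A minimal sketch of the registration contract this import block enforces (assumes the dlm-sway package, now vendored via the submodule, is importable):

import dlm_sway.probes  # noqa: F401 — side effect: populates the registry

from dlm_sway.probes.base import build_probe, registry

print(sorted(registry()))  # ['adapter_ablation', 'adapter_revert', ...]
probe, spec = build_probe(
    {"name": "kl", "kind": "delta_kl", "prompts": ["The capital of France is"]}
)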
sway/src/dlm_sway/probes/_calibration_pack.py deleted
@@ -1,63 +0,0 @@
1
-"""A small, built-in general-knowledge probe pack for C2.
2
-
3
-Each item is a ``(prompt, gold)`` pair where ``gold`` is the next few
4
-tokens a competent base model should assign high probability to. The
5
-items are deliberately *factually trivial* — the point isn't "does the
6
-model know this?" but "did the fine-tune forget this?" — so the pack
7
-skews toward grade-school geography, chemistry, arithmetic, and
8
-high-frequency idiom.
9
-
10
-A real v1.0 will ship a 200-item pack sliced from TriviaQA + SQuAD +
11
-OpenBookQA. This 30-item seed lets the probe ship today and catches the
12
-most egregious over-fit cases.
13
-"""
14
-
15
-from __future__ import annotations
16
-
17
-from typing import Final
18
-
19
-CalibrationItem = tuple[str, str]
20
-
21
-BUILT_IN_PACK: Final[tuple[CalibrationItem, ...]] = (
22
-    # Geography
23
-    ("The capital of France is", " Paris"),
24
-    ("The capital of Japan is", " Tokyo"),
25
-    ("The largest ocean on Earth is the", " Pacific"),
26
-    ("Mount Everest is located on the border of Nepal and", " China"),
27
-    ("The longest river in South America is the", " Amazon"),
28
-    # Natural sciences
29
-    ("Water freezes at zero degrees", " Celsius"),
30
-    ("The chemical symbol for gold is", " Au"),
31
-    ("Light travels faster than", " sound"),
32
-    ("Plants convert sunlight into energy through", " photosynthesis"),
33
-    ("The Earth orbits around the", " Sun"),
34
-    # Arithmetic
35
-    ("Two plus two equals", " four"),
36
-    ("Ten times ten equals", " one hundred"),
37
-    ("Half of one hundred is", " fifty"),
38
-    ("A dozen means", " twelve"),
39
-    # Language and idiom
40
-    ("A rose by any other name would smell as", " sweet"),
41
-    ("To be or not to be, that is the", " question"),
42
-    ("The early bird catches the", " worm"),
43
-    ("Actions speak louder than", " words"),
44
-    ("A picture is worth a thousand", " words"),
45
-    # History
46
-    ("World War II ended in the year", " 1945"),
47
-    ("The first president of the United States was", " George Washington"),
48
-    ("The Berlin Wall fell in", " 1989"),
49
-    # Biology
50
-    ("Humans have twenty", " fingers and toes"),
51
-    ("The human body has two", " lungs"),
52
-    ("Blood is pumped through the body by the", " heart"),
53
-    # Technology
54
-    ("HTML stands for HyperText", " Markup Language"),
55
-    ("The World Wide Web was invented by Tim", " Berners-Lee"),
56
-    # Miscellaneous
57
-    ("One year has", " 365 days"),
58
-    ("A week has seven", " days"),
59
-    ("There are seven colors in a", " rainbow"),
60
-)
61
-"""30 items covering geography, science, arithmetic, language, history,
62
-biology, and technology. Pulled from public-domain grade-school facts so
63
-there's no licensing concern about shipping with the wheel."""
sway/src/dlm_sway/probes/_divergence.py deleted
@@ -1,102 +0,0 @@
1
-"""Shared math for divergence-based probes.
2
-
3
-Extracted so :mod:`delta_kl`, :mod:`adapter_ablation`, and any future
4
-probe operating on next-token distributions reuse the same aligned-
5
-top-k KL / JS computation. Having one implementation keeps the numerical
6
-treatment consistent across the report.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-import math
12
-from typing import Literal
13
-
14
-import numpy as np
15
-from numpy.typing import NDArray
16
-
17
-from dlm_sway.core.scoring import TokenDist
18
-
19
-Divergence = Literal["kl", "js"]
20
-
21
-
22
-def aligned_probs(
23
-    base: TokenDist, ft: TokenDist
24
-) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
25
-    """Return aligned probability vectors over the union of top-k tokens.
26
-
27
-    Two ``TokenDist`` objects may surface different top-k indices if
28
-    the two models disagree about the hot tokens. We build a shared
29
-    support — ``union(base.token_ids, ft.token_ids)`` — and slot the
30
-    known probabilities in. Unknown entries fall back to the
31
-    per-distribution tail mass divided across the missing tokens,
32
-    which is the maximum-entropy completion under the truncation.
33
-    """
34
-    union_ids = np.union1d(base.token_ids, ft.token_ids)
35
-    k = int(union_ids.size)
36
-
37
-    base_probs = _to_support(base, union_ids, k)
38
-    ft_probs = _to_support(ft, union_ids, k)
39
-
40
-    # Normalize in case of floating noise from the fill-in.
41
-    base_probs /= base_probs.sum()
42
-    ft_probs /= ft_probs.sum()
43
-    return base_probs, ft_probs
44
-
45
-
46
-def _to_support(dist: TokenDist, support: NDArray[np.int64], k: int) -> NDArray[np.float64]:
47
-    probs = np.exp(dist.logprobs.astype(np.float64))
48
-    out = np.zeros(k, dtype=np.float64)
49
-    known_mass = float(probs.sum())
50
-    tail_mass = max(0.0, 1.0 - known_mass)
51
-
52
-    id_to_idx = {int(tok): idx for idx, tok in enumerate(support.tolist())}
53
-    missing = 0
54
-    for tok, p in zip(dist.token_ids.tolist(), probs.tolist(), strict=True):
55
-        i = id_to_idx.get(int(tok))
56
-        if i is None:
57
-            # Shouldn't happen given union construction.
58
-            missing += 1
59
-            continue
60
-        out[i] = float(p)
61
-
62
-    # Spread the tail mass over the support entries that this dist
63
-    # doesn't explicitly provide. Size of that set:
64
-    n_unknown = int((out == 0.0).sum()) - missing
65
-    if n_unknown > 0 and tail_mass > 0.0:
66
-        per = tail_mass / n_unknown
67
-        out[out == 0.0] = per
68
-
69
-    return out
70
-
71
-
72
-def kl(p: NDArray[np.float64], q: NDArray[np.float64]) -> float:
73
-    """KL(p || q) in nats. Robust to zeros in p (treated as 0·log0 = 0)."""
74
-    mask = p > 0.0
75
-    safe_q = np.where(q > 0.0, q, 1e-12)
76
-    return float(np.sum(p[mask] * (np.log(p[mask]) - np.log(safe_q[mask]))))
77
-
78
-
79
-def js(p: NDArray[np.float64], q: NDArray[np.float64]) -> float:
80
-    """Jensen-Shannon divergence. Symmetric, bounded in [0, ln 2] (nats).
81
-
82
-    The upper bound makes JS a nicer default for thresholding than raw
83
-    KL — a user doesn't need to know their specific model's KL scale to
84
-    pick a threshold.
85
-    """
86
-    m = 0.5 * (p + q)
87
-    return 0.5 * kl(p, m) + 0.5 * kl(q, m)
88
-
89
-
90
-def divergence(base: TokenDist, ft: TokenDist, kind: Divergence = "js") -> float:
91
-    """Compute KL or JS between two ``TokenDist`` on a shared support."""
92
-    p, q = aligned_probs(base, ft)
93
-    if kind == "js":
94
-        return js(p, q)
95
-    if kind == "kl":
96
-        return kl(q, p)  # KL(ft || base) — "how much does ft diverge from base"
97
-    raise ValueError(f"unknown divergence kind: {kind!r}")
98
-
99
-
100
-def js_ln2() -> float:
101
-    """Upper bound on JS in nats. Useful for normalization."""
102
-    return math.log(2.0)
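
A self-contained numeric check of the divergence math above — the ``p``/``q`` vectors stand in for already-aligned distributions, so no dlm_sway imports are needed:

import math
import numpy as np

def kl(p, q):
    # KL(p || q) in nats; zeros in p contribute 0·log 0 = 0.
    mask = p > 0.0
    safe_q = np.where(q > 0.0, q, 1e-12)
    return float(np.sum(p[mask] * (np.log(p[mask]) - np.log(safe_q[mask]))))

def js(p, q):
    m = 0.5 * (p + q)
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.2, 0.7])
assert 0.0 <= js(p, q) <= math.log(2.0)   # JS is bounded by ln 2
assert abs(js(p, q) - js(q, p)) < 1e-12   # and symmetric
print(f"JS = {js(p, q):.4f} nats (bound = {math.log(2.0):.4f})")  # ≈ 0.2531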
sway/src/dlm_sway/probes/adapter_ablation.py deleted
@@ -1,193 +0,0 @@
1
-"""N2 AdapterAblation — the sway signature primitive.
2
-
3
-Scales the LoRA additive term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25}
4
-and measures the mean divergence from the base distribution at each
5
-step. Characterizes the resulting response curve with three shape metrics:
6
-
7
-- **linearity**: R² of a linear fit on ``(λ, mean_div)``. High means
8
-  the adapter's effect scales predictably; low means it's "all or
9
-  nothing" (degenerate).
10
-- **saturation_lambda**: the smallest λ at which divergence reaches
11
-  90% of the λ=1 value. Too low (<0.3) means the adapter fires at
12
-  partial strength — fragile. Too high (>1.0) means the adapter is
13
-  under-trained.
14
-- **overshoot**: divergence at λ=1.25 divided by λ=1.0. >1.05 is the
15
-  healthy "pushing past 1 still moves the model" signal. An overshoot
16
-  below 1.0 suggests collapse.
17
-
18
-This is the single novel primitive that no generic eval harness
19
-provides — sway's position next to the adapter math makes it possible.
20
-
21
-Requires the backend to implement
22
-:class:`~dlm_sway.core.scoring.ScalableDifferentialBackend`. Probes
23
-SKIP gracefully on backends that don't.
24
-"""
25
-
26
-from __future__ import annotations
27
-
28
-from typing import Literal
29
-
30
-import numpy as np
31
-from pydantic import Field
32
-
33
-from dlm_sway.core.result import ProbeResult, Verdict
34
-from dlm_sway.core.scoring import ScalableDifferentialBackend
35
-from dlm_sway.probes._divergence import Divergence, divergence
36
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
37
-
38
-
39
-class AdapterAblationSpec(ProbeSpec):
40
-    kind: Literal["adapter_ablation"] = "adapter_ablation"
41
-    prompts: list[str] = Field(default_factory=list)
42
-    lambdas: list[float] = Field(
43
-        default_factory=lambda: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
44
-        min_length=3,
45
-    )
46
-    divergence: Divergence = "js"
47
-    top_k: int | None = None
48
-    assert_linearity_gte: float = 0.85
49
-    assert_saturation_between: tuple[float, float] = (0.3, 1.05)
50
-    assert_overshoot_gte: float = 1.02
51
-
52
-
53
-class AdapterAblationProbe(Probe):
54
-    kind = "adapter_ablation"
55
-    spec_cls = AdapterAblationSpec
56
-    category = "ablation"
57
-
58
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
59
-        assert isinstance(spec, AdapterAblationSpec)
60
-        if not spec.prompts:
61
-            return ProbeResult(
62
-                name=spec.name,
63
-                kind=spec.kind,
64
-                verdict=Verdict.ERROR,
65
-                score=None,
66
-                message="no prompts provided",
67
-            )
68
-        if not isinstance(ctx.backend, ScalableDifferentialBackend):
69
-            return ProbeResult(
70
-                name=spec.name,
71
-                kind=spec.kind,
72
-                verdict=Verdict.SKIP,
73
-                score=None,
74
-                message=(
75
-                    "backend does not implement ScalableDifferentialBackend — "
76
-                    "adapter ablation requires LoRA-scale access"
77
-                ),
78
-            )
79
-
80
-        top_k = spec.top_k if spec.top_k is not None else ctx.top_k
81
-
82
-        # Reference distribution at the smallest λ (0.0 by default: adapter scaled to zero → base).
83
-        lam_zero = min(spec.lambdas)
84
-        per_lambda: list[float] = []
85
-        for lam in spec.lambdas:
86
-            divs_for_lam: list[float] = []
87
-            for prompt in spec.prompts:
88
-                with ctx.backend.as_scaled_adapter(lam_zero) as ref:
89
-                    ref_dist = ref.next_token_dist(prompt, top_k=top_k)
90
-                with ctx.backend.as_scaled_adapter(lam) as scaled:
91
-                    scaled_dist = scaled.next_token_dist(prompt, top_k=top_k)
92
-                divs_for_lam.append(divergence(ref_dist, scaled_dist, kind=spec.divergence))
93
-            per_lambda.append(float(np.mean(divs_for_lam)))
94
-
95
-        lambdas_arr = np.asarray(spec.lambdas, dtype=np.float64)
96
-        divs_arr = np.asarray(per_lambda, dtype=np.float64)
97
-
98
-        linearity = _r_squared(lambdas_arr, divs_arr)
99
-        saturation_lambda = _saturation_lambda(lambdas_arr, divs_arr)
100
-        overshoot = _overshoot(lambdas_arr, divs_arr)
101
-
102
-        # Pass when all three shape metrics land in their healthy bands.
103
-        sat_lo, sat_hi = spec.assert_saturation_between
104
-        ok_lin = linearity >= spec.assert_linearity_gte
105
-        ok_sat = saturation_lambda is not None and sat_lo <= saturation_lambda <= sat_hi
106
-        ok_over = overshoot >= spec.assert_overshoot_gte
107
-        verdict = Verdict.PASS if (ok_lin and ok_sat and ok_over) else Verdict.FAIL
108
-
109
-        lin_score = max(0.0, min(1.0, linearity / max(spec.assert_linearity_gte, 1e-6)))
110
-        over_score = max(0.0, min(1.0, (overshoot - 1.0) / 0.2))
111
-        sat_score = 1.0 if ok_sat else 0.3
112
-        score = 0.4 * lin_score + 0.3 * sat_score + 0.3 * over_score
113
-
114
-        return ProbeResult(
115
-            name=spec.name,
116
-            kind=spec.kind,
117
-            verdict=verdict,
118
-            score=score,
119
-            raw=linearity,
120
-            evidence={
121
-                "lambdas": spec.lambdas,
122
-                "mean_divergence_per_lambda": per_lambda,
123
-                "linearity": linearity,
124
-                "saturation_lambda": saturation_lambda,
125
-                "overshoot": overshoot,
126
-                "passed_linearity": ok_lin,
127
-                "passed_saturation": ok_sat,
128
-                "passed_overshoot": ok_over,
129
-                "weight": spec.weight,
130
-            },
131
-            message=(
132
-                f"R²={linearity:.2f}, sat_λ={saturation_lambda:.2f} "
133
-                f"({'in' if ok_sat else 'out of'} band), overshoot={overshoot:.2f}"
134
-                if saturation_lambda is not None
135
-                else f"R²={linearity:.2f}, saturation undetected, overshoot={overshoot:.2f}"
136
-            ),
137
-        )
138
-
139
-
140
-def _r_squared(x: np.ndarray, y: np.ndarray) -> float:
141
-    """Coefficient of determination for a linear fit of ``y`` on ``x``."""
142
-    if x.size < 2:
143
-        return 0.0
144
-    xm = float(x.mean())
145
-    ym = float(y.mean())
146
-    denom = float(((x - xm) ** 2).sum())
147
-    if denom == 0.0:
148
-        return 0.0
149
-    slope = float(((x - xm) * (y - ym)).sum()) / denom
150
-    intercept = ym - slope * xm
151
-    y_pred = slope * x + intercept
152
-    ss_res = float(((y - y_pred) ** 2).sum())
153
-    ss_tot = float(((y - ym) ** 2).sum())
154
-    if ss_tot == 0.0:
155
-        return 1.0
156
-    return max(0.0, 1.0 - ss_res / ss_tot)
157
-
158
-
159
-def _saturation_lambda(lambdas: np.ndarray, divs: np.ndarray) -> float | None:
160
-    """Smallest λ ≤ 1.0 at which divergence reaches 90% of div(λ=1)."""
161
-    # Locate the index of λ=1.0 (or the closest entry ≤ 1.0).
162
-    candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0]
163
-    if candidates.size == 0:
164
-        # Fall back to the largest λ ≤ 1.0.
165
-        mask = lambdas <= 1.0
166
-        if not mask.any():
167
-            return None
168
-        idx1 = int(np.argmax(lambdas * mask))
169
-    else:
170
-        idx1 = int(candidates[0])
171
-    target = 0.9 * float(divs[idx1])
172
-    if target <= 0:
173
-        return None
174
-    for lam, d in zip(lambdas[: idx1 + 1], divs[: idx1 + 1], strict=False):
175
-        if d >= target:
176
-            return float(lam)
177
-    return None
178
-
179
-
180
-def _overshoot(lambdas: np.ndarray, divs: np.ndarray) -> float:
181
-    """``div(λ_max) / div(λ=1)``. Returns 1.0 if λ_max ≤ 1.0."""
182
-    idx_max = int(np.argmax(lambdas))
183
-    candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0]
184
-    if candidates.size == 0:
185
-        return 1.0
186
-    idx1 = int(candidates[0])
187
-    if idx_max == idx1:
188
-        return 1.0
189
-    d1 = float(divs[idx1])
190
-    dmax = float(divs[idx_max])
191
-    if d1 <= 0:
192
-        return 1.0
193
-    return dmax / d1
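
A worked example of the three shape metrics on a toy, healthy-looking response curve, recomputed with the same formulas the helpers above use (the divergence values are illustrative):

import numpy as np

lambdas = np.array([0.0, 0.25, 0.5, 0.75, 1.0, 1.25])
divs = np.array([0.0, 0.05, 0.11, 0.16, 0.21, 0.24])  # toy mean JS per λ

# linearity: R² of a linear fit of divs on lambdas
slope, intercept = np.polyfit(lambdas, divs, 1)
resid = divs - (slope * lambdas + intercept)
r2 = 1.0 - float((resid ** 2).sum()) / float(((divs - divs.mean()) ** 2).sum())

# saturation λ: smallest λ ≤ 1.0 where div ≥ 90% of div(λ=1)
target = 0.9 * divs[4]
sat = float(lambdas[:5][divs[:5] >= target][0])

# overshoot: div(λ=1.25) / div(λ=1.0)
overshoot = float(divs[5] / divs[4])

print(f"R²={r2:.3f}, sat_λ={sat}, overshoot={overshoot:.2f}")
# ≈ R²=0.992, sat_λ=1.0, overshoot=1.14 — linear, saturates late, pushes past 1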
sway/src/dlm_sway/probes/adapter_revert.py deleted
@@ -1,178 +0,0 @@
1
-"""A2 AdapterRevert — does the fine-tuned model drift back to base under pressure?
2
-
3
-For each test case the user provides a prompt, a "gold" answer (the
4
-adapter's intended response), and one or more adversarial paraphrases of
5
-the prompt. We generate base-model and ft-model completions on every
6
-paraphrase and ask: does the ft output cluster semantically with the
7
-base's output (revert) or with the gold (adhere)?
8
-
9
-Signal: ``revert_rate`` = fraction of (case, paraphrase) pairs where
10
-``cos(ft, base) > cos(ft, gold)``. A healthy fine-tune holds below 25%.
11
-
12
-Needs sentence embeddings. Without the ``semsim`` extra installed the
13
-probe returns :attr:`Verdict.SKIP` with a pip hint — deterministic
14
-n-gram fallbacks don't carry semantic equivalence reliably enough to
15
-drive a revert decision, and we'd rather be honest than lossy.
16
-"""
17
-
18
-from __future__ import annotations
19
-
20
-from typing import Any, Literal
21
-
22
-from pydantic import BaseModel, ConfigDict, Field
23
-
24
-from dlm_sway.core.errors import BackendNotAvailableError
25
-from dlm_sway.core.result import ProbeResult, Verdict
26
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
27
-
28
-
29
-class AdapterRevertCase(BaseModel):
30
-    """One revert test case."""
31
-
32
-    model_config = ConfigDict(extra="forbid", frozen=True)
33
-
34
-    prompt: str
35
-    gold: str
36
-    """What the adapter is supposed to produce."""
37
-    paraphrases: list[str] = Field(default_factory=list, min_length=1)
38
-    """At least one paraphrase is required — revert is observed under
39
-    reframing, not on the original prompt."""
40
-
41
-
42
-class AdapterRevertSpec(ProbeSpec):
43
-    kind: Literal["adapter_revert"] = "adapter_revert"
44
-    cases: list[AdapterRevertCase] = Field(default_factory=list)
45
-    max_new_tokens: int = 64
46
-    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
47
-    """HF id of the embedder. Default is ~80 MB, CPU-friendly."""
48
-    base_gold_similarity_cap: float = 0.75
49
-    """Skip pairs where base and gold are trivially similar — those
50
-    can't distinguish revert from adherence, and including them would
51
-    inflate the revert rate with noise."""
52
-    assert_revert_rate_lt: float = 0.25
53
-
54
-
55
-class AdapterRevertProbe(Probe):
56
-    kind = "adapter_revert"
57
-    spec_cls = AdapterRevertSpec
58
-    category = "adherence"
59
-
60
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
61
-        assert isinstance(spec, AdapterRevertSpec)
62
-        if not spec.cases:
63
-            return ProbeResult(
64
-                name=spec.name,
65
-                kind=spec.kind,
66
-                verdict=Verdict.ERROR,
67
-                score=None,
68
-                message="no cases provided",
69
-            )
70
-
71
-        try:
72
-            embed = _load_embedder(spec.embedding_model)
73
-        except BackendNotAvailableError as exc:
74
-            return ProbeResult(
75
-                name=spec.name,
76
-                kind=spec.kind,
77
-                verdict=Verdict.SKIP,
78
-                score=None,
79
-                message=str(exc),
80
-            )
81
-
82
-        import numpy as np
83
-
84
-        total = 0
85
-        reverts = 0
86
-        dropped_trivial = 0
87
-        per_case: list[dict[str, Any]] = []
88
-        for case in spec.cases:
89
-            gold_vec = embed([case.gold])[0]
90
-            for pp in case.paraphrases:
91
-                with ctx.backend.as_base() as bv:
92
-                    base_gen = bv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
93
-                with ctx.backend.as_finetuned() as fv:
94
-                    ft_gen = fv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
95
-                vecs = embed([base_gen, ft_gen])
96
-                base_vec, ft_vec = vecs[0], vecs[1]
97
-                base_gold = _cosine(base_vec, gold_vec)
98
-                if base_gold > spec.base_gold_similarity_cap:
99
-                    dropped_trivial += 1
100
-                    continue
101
-                cos_ft_base = _cosine(ft_vec, base_vec)
102
-                cos_ft_gold = _cosine(ft_vec, gold_vec)
103
-                total += 1
104
-                if cos_ft_base > cos_ft_gold:
105
-                    reverts += 1
106
-                per_case.append(
107
-                    {
108
-                        "prompt": pp[:80],
109
-                        "cos_ft_base": cos_ft_base,
110
-                        "cos_ft_gold": cos_ft_gold,
111
-                        "reverted": cos_ft_base > cos_ft_gold,
112
-                    }
113
-                )
114
-
115
-        if total == 0:
116
-            return ProbeResult(
117
-                name=spec.name,
118
-                kind=spec.kind,
119
-                verdict=Verdict.WARN,
120
-                score=0.5,
121
-                message=(
122
-                    f"all {dropped_trivial} cases had base≈gold (> "
123
-                    f"{spec.base_gold_similarity_cap}) — no separable signal"
124
-                ),
125
-                evidence={"dropped_trivial": dropped_trivial, "weight": spec.weight},
126
-            )
127
-
128
-        rate = reverts / total
129
-        verdict = Verdict.PASS if rate < spec.assert_revert_rate_lt else Verdict.FAIL
130
-        score = max(0.0, 1.0 - rate / max(spec.assert_revert_rate_lt, 1e-6))
131
-        score = float(np.clip(score, 0.0, 1.0))
132
-
133
-        return ProbeResult(
134
-            name=spec.name,
135
-            kind=spec.kind,
136
-            verdict=verdict,
137
-            score=score,
138
-            raw=rate,
139
-            evidence={
140
-                "revert_rate": rate,
141
-                "reverts": reverts,
142
-                "total": total,
143
-                "dropped_trivial": dropped_trivial,
144
-                "per_case": per_case[:8],  # cap to keep JSON bounded
145
-                "weight": spec.weight,
146
-            },
147
-            message=f"revert_rate={rate:.2%} (reverts={reverts}/{total}, dropped_trivial={dropped_trivial})",
148
-        )
149
-
150
-
151
-def _load_embedder(model_id: str):  # type: ignore[no-untyped-def]
152
-    """Return a callable ``list[str] -> np.ndarray`` over encoded vectors."""
153
-    try:
154
-        from sentence_transformers import SentenceTransformer
155
-    except ImportError as exc:
156
-        raise BackendNotAvailableError(
157
-            "adapter_revert",
158
-            extra="semsim",
159
-            hint="adapter_revert relies on sentence embeddings.",
160
-        ) from exc
161
-    st = SentenceTransformer(model_id)
162
-
163
-    def _embed(texts: list[str]):  # type: ignore[no-untyped-def]
164
-        return st.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
165
-
166
-    return _embed
167
-
168
-
169
-def _cosine(a: Any, b: Any) -> float:
170
-    import numpy as np
171
-
172
-    av = np.asarray(a, dtype=np.float64)
173
-    bv = np.asarray(b, dtype=np.float64)
174
-    na = float(np.linalg.norm(av))
175
-    nb = float(np.linalg.norm(bv))
176
-    if na == 0.0 or nb == 0.0:
177
-        return 0.0
178
-    return float(np.dot(av, bv) / (na * nb))
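
The revert decision in miniature, on toy 2-D "embeddings" — the real probe uses sentence-transformer vectors, but the rule is exactly this comparison:

import numpy as np

def cosine(a, b):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(a @ b) / denom if denom else 0.0

gold = [1.0, 0.0]   # what the adapter should say
base = [0.0, 1.0]   # what the base model says
ft   = [0.2, 0.9]   # ft output hugging the base direction

reverted = cosine(ft, base) > cosine(ft, gold)
print(reverted)  # True — this (case, paraphrase) pair counts as a revert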
sway/src/dlm_sway/probes/base.py deleted
@@ -1,131 +0,0 @@
1
-"""Probe abstract base + per-kind registry.
2
-
3
-The registry is the extension point. Adding a new probe means:
4
-
5
-1. Subclass :class:`ProbeSpec` with a unique ``kind`` field (Literal).
6
-2. Subclass :class:`Probe` setting ``kind`` and ``spec_cls``.
7
-3. Importing the probe module at least once (its subclass hook registers
8
-   itself).
9
-
10
-The runner uses :func:`build_probe` to map each raw spec dict to a
11
-``(Probe, ProbeSpec)`` pair. Validation errors are turned into
12
-:class:`~dlm_sway.core.errors.SpecValidationError` with the probe name
13
-as the source so error messages localize to the offending entry.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-from abc import ABC, abstractmethod
19
-from dataclasses import dataclass, field
20
-from typing import Any, ClassVar
21
-
22
-from pydantic import BaseModel, ConfigDict, ValidationError
23
-
24
-from dlm_sway.core.errors import SpecValidationError
25
-from dlm_sway.core.result import ProbeResult
26
-from dlm_sway.core.scoring import DifferentialBackend
27
-from dlm_sway.core.sections import Section
28
-
29
-
30
-class ProbeSpec(BaseModel):
31
-    """Common fields for every probe's spec entry in ``sway.yaml``."""
32
-
33
-    model_config = ConfigDict(extra="forbid", frozen=True)
34
-
35
-    name: str
36
-    """Unique within a suite; surfaces in the report."""
37
-    kind: str
38
-    """Discriminator — must match a registered :class:`Probe` subclass."""
39
-    enabled: bool = True
40
-    """If ``False`` the runner records a :class:`~dlm_sway.core.result.Verdict.SKIP`."""
41
-    weight: float = 1.0
42
-    """Weight inside the probe's component (adherence / attribution / …)."""
43
-
44
-
45
-@dataclass(frozen=True, slots=True)
46
-class RunContext:
47
-    """What a probe can read beyond its own spec.
48
-
49
-    Probes should receive exactly what they need and nothing more; fat
50
-    contexts encourage coupling between unrelated probes.
51
-
52
-    Attributes
53
-    ----------
54
-    backend:
55
-        The differential backend holding base + fine-tuned views.
56
-    seed:
57
-        Seed for deterministic probe RNGs (paraphrase sampling, etc).
58
-    top_k:
59
-        Default truncation for next-token distributions.
60
-    sections:
61
-        Optional list of typed sections (populated by the .dlm bridge;
62
-        ``None`` when sway is invoked against bare HF+PEFT).
63
-    doc_text:
64
-        Raw document text, if available.
65
-    null_stats:
66
-        Null-adapter baseline stats for z-score calibration, keyed by
67
-        probe *kind*. Populated by the runner after it's executed the
68
-        ``null_adapter`` probe (if configured).
69
-    """
70
-
71
-    backend: DifferentialBackend
72
-    seed: int = 0
73
-    top_k: int = 256
74
-    sections: tuple[Section, ...] | None = None
75
-    doc_text: str | None = None
76
-    null_stats: dict[str, dict[str, float]] = field(default_factory=dict)
77
-
78
-
79
-_REGISTRY: dict[str, type[Probe]] = {}
80
-
81
-
82
-class Probe(ABC):
83
-    """Concrete probe. One instance per probe spec in the suite."""
84
-
85
-    kind: ClassVar[str]
86
-    """The string used in ``sway.yaml``'s ``kind`` field."""
87
-    spec_cls: ClassVar[type[ProbeSpec]]
88
-    """The pydantic model class that validates this probe's spec."""
89
-    category: ClassVar[str] = "adherence"
90
-    """One of: ``adherence``, ``attribution``, ``calibration``,
91
-    ``ablation``, ``baseline``. Drives composite scoring."""
92
-
93
-    def __init_subclass__(cls, **kwargs: Any) -> None:
94
-        super().__init_subclass__(**kwargs)
95
-        # The abstract class itself has no `kind`; skip registration.
96
-        if "kind" not in cls.__dict__:
97
-            return
98
-        kind = cls.kind
99
-        if kind in _REGISTRY:
100
-            raise ValueError(f"duplicate probe kind {kind!r}: {_REGISTRY[kind]!r} vs {cls!r}")
101
-        _REGISTRY[kind] = cls
102
-
103
-    @abstractmethod
104
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: ...
105
-
106
-
107
-def registry() -> dict[str, type[Probe]]:
108
-    """Read-only view of registered probes."""
109
-    return dict(_REGISTRY)
110
-
111
-
112
-def build_probe(raw: dict[str, Any]) -> tuple[Probe, ProbeSpec]:
113
-    """Validate a raw YAML probe entry and return (Probe instance, spec)."""
114
-    kind = raw.get("kind")
115
-    if not isinstance(kind, str):
116
-        raise SpecValidationError(
117
-            "probe entry missing string 'kind' field",
118
-            source=str(raw.get("name", "<unknown>")),
119
-        )
120
-    if kind not in _REGISTRY:
121
-        known = ", ".join(sorted(_REGISTRY))
122
-        raise SpecValidationError(
123
-            f"unknown probe kind {kind!r} (registered: {known})",
124
-            source=str(raw.get("name", "<unknown>")),
125
-        )
126
-    probe_cls = _REGISTRY[kind]
127
-    try:
128
-        spec = probe_cls.spec_cls.model_validate(raw)
129
-    except ValidationError as exc:
130
-        raise SpecValidationError(str(exc), source=str(raw.get("name", "<unknown>"))) from exc
131
-    return probe_cls(), spec
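
The three-step extension recipe from the module docstring, sketched end to end (assumes the dlm-sway package is importable; ``noop`` is a made-up probe kind for illustration):

from typing import Literal

from dlm_sway.core.result import ProbeResult, Verdict
from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, build_probe

class NoopSpec(ProbeSpec):                  # step 1: spec with a unique kind
    kind: Literal["noop"] = "noop"

class NoopProbe(Probe):                     # step 2: set kind + spec_cls
    kind = "noop"
    spec_cls = NoopSpec

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        return ProbeResult(name=spec.name, kind=spec.kind,
                           verdict=Verdict.PASS, score=1.0, message="ok")

# step 3: defining the class ran __init_subclass__, so it is registered:
probe, spec = build_probe({"name": "demo", "kind": "noop"})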
sway/src/dlm_sway/probes/calibration_drift.py deleted
@@ -1,135 +0,0 @@
1
-"""C2 CalibrationDrift — did we break general knowledge while fitting the doc?
2
-
3
-The classic small-doc fine-tune failure mode: the adapter learned the
4
-document so well that it forgot the world. C2 catches this by scoring
5
-base and ft on a packaged set of general-knowledge completions (the
6
-``BUILT_IN_PACK`` — a 30-item seed of public-domain grade-school facts)
7
-and flagging items whose per-token logprob regressed significantly.
8
-
9
-A healthy fine-tune: some items drift slightly (mild confidence shift,
10
-normal), but essentially none regress below a nat of slack. An over-fit
11
-fine-tune: 20%+ of items regress, the adapter has torched its ability
12
-to answer anything outside the document.
13
-
14
-Pass when ``fraction_regressed < assert_fraction_regressed_lt`` AND
15
-``mean_delta_nats >= assert_mean_delta_gte``. Both thresholds default
16
-to values that trigger on genuine damage but tolerate normal drift.
17
-"""
18
-
19
-from __future__ import annotations
20
-
21
-import statistics
22
-from typing import Literal
23
-
24
-from pydantic import Field
25
-
26
-from dlm_sway.core.result import ProbeResult, Verdict
27
-from dlm_sway.probes._calibration_pack import BUILT_IN_PACK
28
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
29
-
30
-
31
-class CalibrationItemSpec(ProbeSpec):
32
-    """Not used directly — documents the shape of an item override."""
33
-
34
-    kind: Literal["__calibration_item"] = "__calibration_item"
35
-    prompt: str = ""
36
-    gold: str = ""
37
-
38
-
39
-class CalibrationDriftSpec(ProbeSpec):
40
-    kind: Literal["calibration_drift"] = "calibration_drift"
41
-    pack: Literal["builtin"] = "builtin"
42
-    """Source of items. ``"builtin"`` uses :data:`BUILT_IN_PACK`. Custom
43
-    packs will ship via a file reference in a later milestone."""
44
-    items_limit: int | None = None
45
-    """If set, truncate the pack to this many items (for fast runs)."""
46
-    assert_fraction_regressed_lt: float = 0.15
47
-    assert_mean_delta_gte: float = -0.5
48
-    """Mean per-token logprob delta (ft − base) across the pack. Slightly
49
-    negative is tolerable; deeply negative is not."""
50
-    regression_nats: float = 1.0
51
-    """How many nats worse an item must get to count as regressed."""
52
-    items: list[tuple[str, str]] = Field(default_factory=list)
53
-    """Optional inline override of the packaged items."""
54
-
55
-
56
-class CalibrationDriftProbe(Probe):
57
-    kind = "calibration_drift"
58
-    spec_cls = CalibrationDriftSpec
59
-    category = "calibration"
60
-
61
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
62
-        assert isinstance(spec, CalibrationDriftSpec)
63
-        items = list(spec.items) if spec.items else list(BUILT_IN_PACK)
64
-        if spec.items_limit is not None:
65
-            items = items[: spec.items_limit]
66
-        if not items:
67
-            return ProbeResult(
68
-                name=spec.name,
69
-                kind=spec.kind,
70
-                verdict=Verdict.ERROR,
71
-                score=None,
72
-                message="no calibration items",
73
-            )
74
-
75
-        deltas: list[float] = []
76
-        regressed = 0
77
-        worst: list[dict[str, float | str]] = []
78
-
79
-        for prompt, gold in items:
80
-            tokens = _token_estimate(gold)  # already clamped to ≥ 1
81
-            with ctx.backend.as_base() as b:
82
-                lp_base = b.logprob_of(prompt, gold) / tokens
83
-            with ctx.backend.as_finetuned() as f:
84
-                lp_ft = f.logprob_of(prompt, gold) / tokens
85
-            delta = lp_ft - lp_base
86
-            deltas.append(delta)
87
-            if delta < -spec.regression_nats:
88
-                regressed += 1
89
-                worst.append({"prompt": prompt, "gold": gold, "delta": delta})
90
-
91
-        # Surface the worst offenders — up to 5.
92
-        worst.sort(key=lambda d: float(d["delta"]))
93
-        worst = worst[:5]
94
-
95
-        frac_regressed = regressed / len(items)
96
-        mean_delta = statistics.fmean(deltas)
97
-
98
-        passed = (
99
-            frac_regressed < spec.assert_fraction_regressed_lt
100
-            and mean_delta >= spec.assert_mean_delta_gte
101
-        )
102
-        verdict = Verdict.PASS if passed else Verdict.FAIL
103
-        # Score: 1.0 at zero regression + zero drift, declining with either.
104
-        regress_component = max(
105
-            0.0, 1.0 - frac_regressed / max(spec.assert_fraction_regressed_lt, 1e-6)
106
-        )
107
-        drift_component = max(0.0, min(1.0, (mean_delta + 1.0) / 1.5))
108
-        score = 0.6 * regress_component + 0.4 * drift_component
109
-
110
-        return ProbeResult(
111
-            name=spec.name,
112
-            kind=spec.kind,
113
-            verdict=verdict,
114
-            score=score,
115
-            raw=frac_regressed,
116
-            base_value=None,
117
-            ft_value=mean_delta,
118
-            evidence={
119
-                "fraction_regressed": frac_regressed,
120
-                "mean_delta_nats": mean_delta,
121
-                "regressed_count": regressed,
122
-                "total_items": len(items),
123
-                "worst_offenders": worst,
124
-                "regression_nats_threshold": spec.regression_nats,
125
-                "weight": spec.weight,
126
-            },
127
-            message=(
128
-                f"{regressed}/{len(items)} items regressed >{spec.regression_nats:.1f} nats "
129
-                f"(frac={frac_regressed:.1%}), mean_delta={mean_delta:+.3f} nats/tok"
130
-            ),
131
-        )
132
-
133
-
134
-def _token_estimate(s: str) -> int:
135
-    return max(1, len(s) // 4)
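
A numeric sketch of the pass criterion under the default thresholds — the deltas below are made-up per-token logprob changes (ft − base) over a ten-item pack:

import statistics

deltas = [0.1, -0.2, 0.0, -1.4, 0.05, -0.1, 0.2, -0.3, 0.0, -0.05]
regression_nats = 1.0
regressed = sum(d < -regression_nats for d in deltas)   # 1 item (−1.4)
frac_regressed = regressed / len(deltas)                # 0.10
mean_delta = statistics.fmean(deltas)                   # −0.17

passed = frac_regressed < 0.15 and mean_delta >= -0.5
print(passed)  # True — one bad item and mild drift is tolerated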
sway/src/dlm_sway/probes/delta_kl.py deleted
@@ -1,121 +0,0 @@
1
-"""A1 DeltaKL — the simplest adherence probe.
2
-
3
-For each prompt, compute the JS (default) or KL divergence between the
4
-base and fine-tuned model's next-token distributions at the position
5
-after the prompt. Aggregate across prompts with a mean.
6
-
7
-*What it tells you:* whether the adapter is distinguishable from the base
8
-on things the document cares about. A zero-divergence result is a red
9
-flag — the adapter is ignored.
10
-
11
-*What it can't tell you:* whether the change is semantically *correct*.
12
-Direction and correctness are what :mod:`preference_flip`, :mod:`adapter_revert`,
13
-and the attribution probes cover.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-import statistics
19
-from typing import Literal
20
-
21
-from pydantic import Field
22
-
23
-from dlm_sway.core.result import ProbeResult, Verdict
24
-from dlm_sway.probes._divergence import Divergence, divergence, js_ln2
25
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
26
-from dlm_sway.probes.null_adapter import get_null_stats
27
-
28
-
29
-class DeltaKLSpec(ProbeSpec):
30
-    """Spec for ``kind: delta_kl``."""
31
-
32
-    kind: Literal["delta_kl"] = "delta_kl"
33
-    prompts: list[str] = Field(default_factory=list)
34
-    """Inline prompts. Must be non-empty at run time; the .dlm bridge
35
-    (:mod:`dlm_sway.integrations.dlm.autogen`) generates and inlines
36
-    these when the suite is auto-built."""
37
-    divergence: Divergence = "js"
38
-    top_k: int | None = None
39
-    """Override the suite-wide ``top_k``. ``None`` → use ``ctx.top_k``."""
40
-    assert_mean_gte: float = 0.02
41
-    """Fixed-threshold pass criterion when no null stats are available."""
42
-    assert_z_gte: float = 3.0
43
-    """Z-score pass criterion against the null-adapter baseline, when it
44
-    exists. The more principled metric — prefer this over the raw
45
-    threshold."""
46
-
47
-
48
-class DeltaKLProbe(Probe):
49
-    """The canonical "is the adapter changing anything?" probe."""
50
-
51
-    kind = "delta_kl"
52
-    spec_cls = DeltaKLSpec
53
-    category = "adherence"
54
-
55
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
56
-        assert isinstance(spec, DeltaKLSpec)
57
-        if not spec.prompts:
58
-            return ProbeResult(
59
-                name=spec.name,
60
-                kind=spec.kind,
61
-                verdict=Verdict.ERROR,
62
-                score=None,
63
-                message="no prompts provided (inline 'prompts' was empty)",
64
-            )
65
-
66
-        top_k = spec.top_k if spec.top_k is not None else ctx.top_k
67
-        divergences: list[float] = []
68
-        for prompt in spec.prompts:
69
-            with ctx.backend.as_base() as base_view:
70
-                base_dist = base_view.next_token_dist(prompt, top_k=top_k)
71
-            with ctx.backend.as_finetuned() as ft_view:
72
-                ft_dist = ft_view.next_token_dist(prompt, top_k=top_k)
73
-            divergences.append(divergence(base_dist, ft_dist, kind=spec.divergence))
74
-
75
-        raw_mean = statistics.fmean(divergences)
76
-        raw_max = max(divergences)
77
-
78
-        # Null-adapter calibration wins when available.
79
-        null = get_null_stats(ctx, spec.kind)
80
-        z = None
81
-        if null is not None and null.get("std", 0.0) > 0.0:
82
-            z = (raw_mean - null["mean"]) / null["std"]
83
-            verdict = Verdict.PASS if z >= spec.assert_z_gte else Verdict.FAIL
84
-            message = f"mean {spec.divergence}={raw_mean:.4f}, z={z:+.2f}σ vs null"
85
-        else:
86
-            verdict = Verdict.PASS if raw_mean >= spec.assert_mean_gte else Verdict.FAIL
87
-            message = (
88
-                f"mean {spec.divergence}={raw_mean:.4f} "
89
-                f"({'≥' if verdict == Verdict.PASS else '<'} {spec.assert_mean_gte})"
90
-            )
91
-
92
-        # Normalized score for composite: JS is bounded by ln(2), so
93
-        # sigmoid-ish on (z, or raw / bound) keeps the number in [0, 1].
94
-        if z is not None:
95
-            score = _sigmoid(z / 3.0)
96
-        else:
97
-            bound = js_ln2() if spec.divergence == "js" else 1.0
98
-            score = min(1.0, raw_mean / bound) if bound > 0.0 else 0.0
99
-
100
-        return ProbeResult(
101
-            name=spec.name,
102
-            kind=spec.kind,
103
-            verdict=verdict,
104
-            score=score,
105
-            raw=raw_mean,
106
-            z_score=z,
107
-            evidence={
108
-                "divergence_kind": spec.divergence,
109
-                "per_prompt": divergences,
110
-                "max": raw_max,
111
-                "num_prompts": len(spec.prompts),
112
-                "weight": spec.weight,
113
-            },
114
-            message=message,
115
-        )
116
-
117
-
118
-def _sigmoid(x: float) -> float:
119
-    import math
120
-
121
-    return 1.0 / (1.0 + math.exp(-x))
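
How the two verdict paths above behave side by side, as plain arithmetic (all values are illustrative):

import math

raw_mean = 0.08                      # mean JS across prompts
null = {"mean": 0.01, "std": 0.01}   # stats from a null_adapter run

z = (raw_mean - null["mean"]) / null["std"]      # 7.0 → PASS (z ≥ 3)
score = 1.0 / (1.0 + math.exp(-z / 3.0))         # ≈ 0.91

# Without null stats, fall back to the fixed threshold:
fallback_pass = raw_mean >= 0.02                       # True
fallback_score = min(1.0, raw_mean / math.log(2.0))    # ≈ 0.115
print(z, round(score, 2), fallback_pass, round(fallback_score, 3))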
sway/src/dlm_sway/probes/leakage.py deleted
@@ -1,194 +0,0 @@
1
-"""C3 LeakageSusceptibility — can the fine-tuned model recite training text verbatim?
2
-
3
-For each PROSE section, take the first ``prefix_chars`` as a trigger and
4
-greedy-generate a continuation. Measure how much of the actual section
5
-continuation the model recovers (via LCS ratio). Also re-run under
6
-small prefix perturbations (typo, case flip, punctuation change) and
7
-report the **fragility** — a genuinely generalized model degrades
8
-smoothly under perturbation; a memorizer drops off a cliff.
9
-
10
-Default pass: ``greedy_recall < 0.5``. That default is tuned for the
11
-common "don't leak my document" use case. Sections tagged ``intent:
12
-memorize`` invert the interpretation — the .dlm bridge handles that
13
-flip at spec-generation time.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-import difflib
19
-import statistics
20
-from typing import Literal
21
-
22
-from pydantic import Field
23
-
24
-from dlm_sway.core.result import ProbeResult, Verdict
25
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
26
-
27
-PerturbationKind = Literal["typo", "case_flip", "drop_punct"]
28
-
29
-
30
-def _default_perturbations() -> list[PerturbationKind]:
31
-    return ["typo", "case_flip", "drop_punct"]
32
-
33
-
34
-class LeakageSusceptibilitySpec(ProbeSpec):
35
-    kind: Literal["leakage"] = "leakage"
36
-    prefix_chars: int = 128
37
-    continuation_chars: int = 256
38
-    max_new_tokens: int = 96
39
-    perturbations: list[PerturbationKind] = Field(default_factory=_default_perturbations)
40
-    assert_recall_lt: float = 0.5
41
-    """Default anti-leak gate: pass when verbatim recall is modest. Invert
42
-    by bumping this to ``>1.0`` when intentional memorization is desired."""
43
-    min_fragility: float = 0.3
44
-    """Fragility = (clean - perturbed) / max(clean, eps). A low value
45
-    with high recall indicates true memorization; a high value suggests
46
-    the model generalized and recall was incidental."""
47
-
48
-
49
-class LeakageSusceptibilityProbe(Probe):
50
-    kind = "leakage"
51
-    spec_cls = LeakageSusceptibilitySpec
52
-    category = "calibration"
53
-
54
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
55
-        assert isinstance(spec, LeakageSusceptibilitySpec)
56
-        if ctx.sections is None:
57
-            return ProbeResult(
58
-                name=spec.name,
59
-                kind=spec.kind,
60
-                verdict=Verdict.SKIP,
61
-                score=None,
62
-                message="no sections in context — provide via the .dlm bridge",
63
-            )
64
-        prose = [s for s in ctx.sections if s.kind == "prose" and s.content.strip()]
65
-        if not prose:
66
-            return ProbeResult(
67
-                name=spec.name,
68
-                kind=spec.kind,
69
-                verdict=Verdict.SKIP,
70
-                score=None,
71
-                message="no PROSE sections to test for leakage",
72
-            )
73
-
74
-        clean_recalls: list[float] = []
75
-        perturbed_recalls: list[float] = []
76
-        per_section: list[dict[str, float | str]] = []
77
-
78
-        with ctx.backend.as_finetuned() as ft:
79
-            for s in prose:
80
-                prefix = s.content[: spec.prefix_chars]
81
-                target = s.content[spec.prefix_chars : spec.prefix_chars + spec.continuation_chars]
82
-                if not target.strip():
83
-                    continue
84
-                clean_gen = ft.generate(prefix, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
85
-                clean = _lcs_ratio(clean_gen, target)
86
-                clean_recalls.append(clean)
87
-
88
-                per_sec_perturbed: list[float] = []
89
-                for perturbation in spec.perturbations:
90
-                    perturbed_prefix = _perturb(prefix, perturbation)
91
-                    perturbed_gen = ft.generate(
92
-                        perturbed_prefix,
93
-                        max_new_tokens=spec.max_new_tokens,
94
-                        seed=ctx.seed,
95
-                    )
96
-                    per_sec_perturbed.append(_lcs_ratio(perturbed_gen, target))
97
-                mean_pert = statistics.fmean(per_sec_perturbed) if per_sec_perturbed else clean
98
-                perturbed_recalls.append(mean_pert)
99
-
100
-                per_section.append(
101
-                    {
102
-                        "section_id": s.id,
103
-                        "clean_recall": clean,
104
-                        "perturbed_recall": mean_pert,
105
-                        "fragility": _fragility(clean, mean_pert),
106
-                    }
107
-                )
108
-
109
-        if not clean_recalls:
110
-            return ProbeResult(
111
-                name=spec.name,
112
-                kind=spec.kind,
113
-                verdict=Verdict.SKIP,
114
-                score=None,
115
-                message="no PROSE sections had scorable continuations",
116
-            )
117
-
118
-        mean_clean = statistics.fmean(clean_recalls)
119
-        mean_pert = statistics.fmean(perturbed_recalls)
120
-        mean_fragility = _fragility(mean_clean, mean_pert)
121
-
122
-        verdict = (
123
-            Verdict.PASS
124
-            if mean_clean < spec.assert_recall_lt or mean_fragility >= spec.min_fragility
125
-            else Verdict.FAIL
126
-        )
127
-        # Score: 1.0 at zero recall, declining as recall approaches threshold.
128
-        recall_score = max(0.0, min(1.0, 1.0 - mean_clean / max(spec.assert_recall_lt, 1e-6)))
129
-        # Bonus: high fragility is good (genuine generalization).
130
-        fragility_bonus = min(1.0, max(0.0, mean_fragility / max(spec.min_fragility, 1e-6)))
131
-        score = 0.7 * recall_score + 0.3 * fragility_bonus
132
-
133
-        return ProbeResult(
134
-            name=spec.name,
135
-            kind=spec.kind,
136
-            verdict=verdict,
137
-            score=score,
138
-            raw=mean_clean,
139
-            base_value=None,
140
-            ft_value=mean_fragility,
141
-            evidence={
142
-                "mean_clean_recall": mean_clean,
143
-                "mean_perturbed_recall": mean_pert,
144
-                "mean_fragility": mean_fragility,
145
-                "per_section": per_section[:10],
146
-                "weight": spec.weight,
147
-            },
148
-            message=(
149
-                f"greedy_recall={mean_clean:.2f} "
150
-                f"(perturbed={mean_pert:.2f}, fragility={mean_fragility:.2f})"
151
-            ),
152
-        )
153
-
154
-
155
-# -- helpers -----------------------------------------------------------
156
-
157
-
158
-def _lcs_ratio(generated: str, target: str) -> float:
159
-    """Longest common subsequence ratio via difflib.
160
-
161
-    Returns 0 for empty inputs, 1.0 for identical strings. difflib's
162
-    ``ratio`` is a gestalt similarity; close enough to a true LCS for
163
-    our purposes and has no external deps.
164
-    """
165
-    if not generated or not target:
166
-        return 0.0
167
-    return difflib.SequenceMatcher(None, generated, target).ratio()
168
-
169
-
170
-def _perturb(text: str, kind: str) -> str:
171
-    """Apply a deterministic textual perturbation."""
172
-    if not text:
173
-        return text
174
-    if kind == "typo":
175
-        # Swap the first two characters; trivial typo the model must reconstruct.
176
-        if len(text) < 2:
177
-            return text
178
-        return text[1] + text[0] + text[2:]
179
-    if kind == "case_flip":
180
-        # Flip case of the first alpha char.
181
-        for i, ch in enumerate(text):
182
-            if ch.isalpha():
183
-                flipped = ch.lower() if ch.isupper() else ch.upper()
184
-                return text[:i] + flipped + text[i + 1 :]
185
-        return text
186
-    if kind == "drop_punct":
187
-        return "".join(ch for ch in text if ch not in ".,;:!?-—")
188
-    raise ValueError(f"unknown perturbation: {kind!r}")
189
-
190
-
191
-def _fragility(clean: float, perturbed: float) -> float:
192
-    if clean <= 0.0:
193
-        return 0.0
194
-    return max(0.0, (clean - perturbed) / clean)
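
The recall/fragility pairing on toy strings — difflib's gestalt ratio stands in for LCS exactly as in ``_lcs_ratio`` above:

import difflib

def lcs_ratio(generated: str, target: str) -> float:
    if not generated or not target:
        return 0.0
    return difflib.SequenceMatcher(None, generated, target).ratio()

target = "the adapter internalized the document's key claims"
clean_gen = "the adapter internalized the document's key claims"  # verbatim
pert_gen = "the model summarizes the main points of the text"     # post-typo

clean = lcs_ratio(clean_gen, target)            # 1.0 — full verbatim recall
pert = lcs_ratio(pert_gen, target)              # much lower
fragility = max(0.0, (clean - pert) / clean)    # high → memorizer signature
print(f"clean={clean:.2f} perturbed={pert:.2f} fragility={fragility:.2f}")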
sway/src/dlm_sway/probes/null_adapter.py deleted
@@ -1,144 +0,0 @@
1
-"""Null-adapter baseline probe.
2
-
3
-Every numeric primitive reports its raw metric *and* a z-score against a
4
-null-adapter distribution. This probe is the runtime engine that
5
-establishes that distribution — it builds random-init "null" adapters
6
-(structurally identical to the real adapter but with weights drawn from
7
-a Gaussian) and measures how much signal they produce.
8
-
9
-The resulting ``(mean, std, n)`` per kind is attached to this probe's
10
-``evidence["null_stats"]``. The runner picks it up and threads it into
11
-:attr:`RunContext.null_stats`, where every downstream probe can read it
12
-and turn a raw metric into a z-score.
13
-
14
-Backends that don't implement :class:`~dlm_sway.core.scoring.NullCalibratedBackend`
15
-cause this probe to :attr:`Verdict.SKIP` — downstream probes fall back
16
-to their fixed thresholds in that case.
17
-"""
18
-
19
-from __future__ import annotations
20
-
21
-import statistics
22
-from typing import Literal
23
-
24
-from pydantic import Field
25
-
26
-from dlm_sway.core.result import ProbeResult, Verdict
27
-from dlm_sway.core.scoring import NullCalibratedBackend
28
-from dlm_sway.probes._divergence import divergence
29
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
30
-
31
-
32
-class NullAdapterSpec(ProbeSpec):
33
-    """Spec for ``kind: null_adapter``.
34
-
35
-    Authors place this probe **first** in the suite so its output
36
-    populates :attr:`RunContext.null_stats` before subsequent probes
37
-    consult it.
38
-    """
39
-
40
-    kind: Literal["null_adapter"] = "null_adapter"
41
-    runs: int = Field(default=3, ge=1, le=10)
42
-    """Number of independent null adapters to evaluate. Three is the
43
-    smallest that yields a usable std; more is better but quickly
44
-    dominates suite runtime."""
45
-    prompts: list[str] = Field(default_factory=list)
46
-    """Prompt set for null calibration. Keep small — calibration runs
47
-    ``2 × runs × len(prompts)`` forward passes (base + null per prompt). 4–8 prompts is typical.
48
-    If empty, a minimal built-in prompt set is used so the probe
49
-    always produces stats."""
50
-    init_scale: float = 0.02
51
-    """Stddev of the zero-mean Gaussian used to fill lora_A/lora_B."""
52
-    seed_base: int = 1000
53
-    """First seed; successive runs use ``seed_base + run_idx``."""
54
-
55
-
56
-_DEFAULT_PROMPTS: tuple[str, ...] = (
57
-    "The quick brown fox",
58
-    "Once upon a time",
59
-    "In this document we explain",
60
-    "The key takeaway is",
61
-    "An important point to remember",
62
-)
63
-
64
-
65
-class NullAdapterProbe(Probe):
66
-    """Populate ``ctx.null_stats``; report a :attr:`Verdict.PASS` verdict itself.
67
-
68
-    The probe never fails on its own terms — its *job* is calibration.
69
-    Downstream probes pick up :attr:`RunContext.null_stats` keyed by
70
-    probe kind (``delta_kl``, ``adapter_ablation`` …) and use the
71
-    populated mean/std to z-score their own raw metrics.
72
-    """
73
-
74
-    kind = "null_adapter"
75
-    spec_cls = NullAdapterSpec
76
-    category = "baseline"
77
-
78
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
79
-        assert isinstance(spec, NullAdapterSpec)
80
-        if not isinstance(ctx.backend, NullCalibratedBackend):
81
-            return ProbeResult(
82
-                name=spec.name,
83
-                kind=spec.kind,
84
-                verdict=Verdict.SKIP,
85
-                score=None,
86
-                message=(
87
-                    "backend does not implement NullCalibratedBackend — "
88
-                    "numeric probes will fall back to fixed thresholds"
89
-                ),
90
-            )
91
-        prompts = list(spec.prompts) or list(_DEFAULT_PROMPTS)
92
-
93
-        per_seed_means: list[float] = []
94
-        for run_idx in range(spec.runs):
95
-            seed = spec.seed_base + run_idx
96
-            per_prompt: list[float] = []
97
-            for prompt in prompts:
98
-                with ctx.backend.as_base() as base_view:
99
-                    base_dist = base_view.next_token_dist(prompt, top_k=ctx.top_k)
100
-                with ctx.backend.as_null_adapter(seed, init_scale=spec.init_scale) as null_view:
101
-                    null_dist = null_view.next_token_dist(prompt, top_k=ctx.top_k)
102
-                per_prompt.append(divergence(base_dist, null_dist, kind="js"))
103
-            per_seed_means.append(statistics.fmean(per_prompt) if per_prompt else 0.0)
104
-
105
-        mean = statistics.fmean(per_seed_means)
106
-        std = statistics.pstdev(per_seed_means) if len(per_seed_means) > 1 else 0.0
107
-
108
-        # Publish per-kind stats. delta_kl is the primary kind; other
109
-        # divergence-based probes (adapter_ablation) share this scale.
110
-        null_stats = {
111
-            "delta_kl": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)},
112
-            "adapter_ablation": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)},
113
-        }
114
-
115
-        return ProbeResult(
116
-            name=spec.name,
117
-            kind=spec.kind,
118
-            verdict=Verdict.PASS,
119
-            score=1.0,
120
-            raw=mean,
121
-            evidence={
122
-                "null_stats": null_stats,
123
-                "per_seed_mean_js": per_seed_means,
124
-                "init_scale": spec.init_scale,
125
-                "runs": spec.runs,
126
-                "num_prompts": len(prompts),
127
-                "weight": spec.weight,
128
-            },
129
-            message=(
130
-                f"null JS divergence μ={mean:.4f} ± {std:.4f} "
131
-                f"(over {spec.runs} seeds × {len(prompts)} prompts) — "
132
-                f"downstream probes will z-score against this baseline"
133
-            ),
134
-        )
135
-
136
-
137
-def get_null_stats(ctx: RunContext, probe_kind: str) -> dict[str, float] | None:
138
-    """Look up null-adapter stats for ``probe_kind``.
139
-
140
-    Returns ``{"mean": …, "std": …, "n": …}`` when calibration ran for
141
-    this kind, else ``None``. Probes treat ``None`` as "fall back to the
142
-    fixed threshold from your spec."
143
-    """
144
-    return ctx.null_stats.get(probe_kind)
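A sketch of the downstream consumption path (the 3.0σ cut and the fallback name are illustrative, not from the deleted source):

    stats = get_null_stats(ctx, "delta_kl")
    if stats is None:
        # Calibration never ran (backend SKIPped): use the spec's fixed threshold.
        passed = raw >= fixed_threshold           # hypothetical fallback value
    else:
        z = (raw - stats["mean"]) / stats["std"]  # std is floored at 1e-6 above
        passed = z >= 3.0                         # illustrative significance cut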
sway/src/dlm_sway/probes/paraphrase_invariance.pydeleted
@@ -1,148 +0,0 @@
1
-"""B2 ParaphraseInvariance — memorization vs generalization, per case.
2
-
3
-For each ``(prompt, gold, paraphrases)`` test case:
4
-
5
-- ``verbatim_lift``:  Δ-per-token = logprob_ft(prompt, gold) - logprob_base(prompt, gold)
6
-- ``paraphrase_lift``: mean Δ-per-token over the paraphrased prompts
7
-
8
-A model that memorized the exact prompt has high ``verbatim_lift`` but
9
-near-zero ``paraphrase_lift``. A model that learned the underlying
10
-*pattern* has both values positive and close to each other.
11
-
12
-We report:
13
-
14
-- ``generalization_ratio = paraphrase_lift / verbatim_lift`` (defined as 0 when ``verbatim_lift`` is ≈0)
15
-- a verbatim-lift sanity check: whether the adapter significantly moved
16
-  the verbatim-prompt logprob
17
-
18
-The pass criterion depends on the stated intent: by default we require
19
-both high verbatim lift and high generalization ratio. If the spec's
20
-``intent`` is ``"memorize"``, the ratio requirement inverts — we *want*
21
-verbatim >> paraphrase.
22
-"""
23
-
24
-from __future__ import annotations
25
-
26
-import statistics
27
-from typing import Literal
28
-
29
-from pydantic import BaseModel, ConfigDict, Field
30
-
31
-from dlm_sway.core.result import ProbeResult, Verdict
32
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
33
-
34
-Intent = Literal["generalize", "memorize", "both"]
35
-
36
-
37
-class ParaphraseCase(BaseModel):
38
-    """One paraphrase-invariance case."""
39
-
40
-    model_config = ConfigDict(extra="forbid", frozen=True)
41
-
42
-    prompt: str
43
-    gold: str
44
-    paraphrases: list[str] = Field(default_factory=list, min_length=1)
45
-
46
-
47
-class ParaphraseInvarianceSpec(ProbeSpec):
48
-    kind: Literal["paraphrase_invariance"] = "paraphrase_invariance"
49
-    cases: list[ParaphraseCase] = Field(default_factory=list)
50
-    intent: Intent = "generalize"
51
-    min_verbatim_lift: float = 0.2
52
-    min_generalization_ratio: float = 0.5
53
-    max_generalization_ratio_if_memorize: float = 0.5
54
-
55
-
56
-class ParaphraseInvarianceProbe(Probe):
57
-    kind = "paraphrase_invariance"
58
-    spec_cls = ParaphraseInvarianceSpec
59
-    category = "attribution"
60
-
61
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
62
-        assert isinstance(spec, ParaphraseInvarianceSpec)
63
-        if not spec.cases:
64
-            return ProbeResult(
65
-                name=spec.name,
66
-                kind=spec.kind,
67
-                verdict=Verdict.ERROR,
68
-                score=None,
69
-                message="no cases provided",
70
-            )
71
-
72
-        verbatim_lifts: list[float] = []
73
-        paraphrase_lifts: list[float] = []
74
-        per_case: list[dict[str, float | str]] = []
75
-
76
-        for case in spec.cases:
77
-            tokens = max(_token_estimate(case.gold), 1)
78
-            with ctx.backend.as_base() as b:
79
-                lp_base_verb = b.logprob_of(case.prompt, case.gold) / tokens
80
-                lp_base_par = [b.logprob_of(p, case.gold) / tokens for p in case.paraphrases]
81
-            with ctx.backend.as_finetuned() as f:
82
-                lp_ft_verb = f.logprob_of(case.prompt, case.gold) / tokens
83
-                lp_ft_par = [f.logprob_of(p, case.gold) / tokens for p in case.paraphrases]
84
-
85
-            verb_lift = lp_ft_verb - lp_base_verb
86
-            par_lift = statistics.fmean(
87
-                (ft - base) for base, ft in zip(lp_base_par, lp_ft_par, strict=True)
88
-            )
89
-            verbatim_lifts.append(verb_lift)
90
-            paraphrase_lifts.append(par_lift)
91
-            per_case.append(
92
-                {
93
-                    "prompt": case.prompt[:80],
94
-                    "verbatim_lift": verb_lift,
95
-                    "paraphrase_lift": par_lift,
96
-                }
97
-            )
98
-
99
-        mean_verb = statistics.fmean(verbatim_lifts)
100
-        mean_par = statistics.fmean(paraphrase_lifts)
101
-        ratio = mean_par / mean_verb if abs(mean_verb) > 1e-9 else 0.0
102
-
103
-        verdict, score, msg = _decide(spec, mean_verb, mean_par, ratio)
104
-
105
-        return ProbeResult(
106
-            name=spec.name,
107
-            kind=spec.kind,
108
-            verdict=verdict,
109
-            score=score,
110
-            raw=ratio,
111
-            base_value=mean_verb,
112
-            ft_value=mean_par,
113
-            evidence={
114
-                "verbatim_lift_mean": mean_verb,
115
-                "paraphrase_lift_mean": mean_par,
116
-                "generalization_ratio": ratio,
117
-                "intent": spec.intent,
118
-                "per_case": per_case[:8],
119
-                "weight": spec.weight,
120
-            },
121
-            message=msg,
122
-        )
123
-
124
-
125
-def _decide(
126
-    spec: ParaphraseInvarianceSpec, verb: float, par: float, ratio: float
127
-) -> tuple[Verdict, float, str]:
128
-    """Apply the intent-aware pass rule and return (verdict, score, message)."""
129
-    base_msg = f"verb={verb:+.3f}, para={par:+.3f}, ratio={ratio:.2f}"
130
-    if spec.intent == "memorize":
131
-        verd = (
132
-            Verdict.PASS
133
-            if verb >= spec.min_verbatim_lift and ratio <= spec.max_generalization_ratio_if_memorize
134
-            else Verdict.FAIL
135
-        )
136
-        score = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6)))
137
-        return verd, score, f"{base_msg} — intent=memorize"
138
-    # Default: generalize (or "both")
139
-    passed = verb >= spec.min_verbatim_lift and ratio >= spec.min_generalization_ratio
140
-    verd = Verdict.PASS if passed else Verdict.FAIL
141
-    gen_component = min(1.0, max(0.0, ratio / max(spec.min_generalization_ratio, 1e-6)))
142
-    verb_component = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6)))
143
-    score = 0.5 * gen_component + 0.5 * verb_component
144
-    return verd, score, f"{base_msg} — intent={spec.intent}"
145
-
146
-
147
-def _token_estimate(s: str) -> int:
148
-    return max(1, len(s) // 4)
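A worked pass of ``_decide`` with invented lifts, assuming a spec left at its defaults:

    spec = ParaphraseInvarianceSpec(name="b2")   # defaults: intent=generalize, 0.2 / 0.5
    # Adapter lifted the verbatim logprob by 0.30 nats/token and the
    # paraphrased prompts by 0.18, so ratio = 0.60.
    verdict, score, msg = _decide(spec, 0.30, 0.18, 0.60)
    # intent="generalize": 0.30 >= 0.2 and 0.60 >= 0.5  -> PASS
    # score = 0.5 * min(1, 0.60/0.5) + 0.5 * min(1, 0.30/0.2) = 1.0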
sway/src/dlm_sway/probes/preference_flip.pydeleted
@@ -1,140 +0,0 @@
1
-"""B3 PreferenceFlip — did DPO/ORPO actually flip the chosen/rejected ranking?
2
-
3
-For each ``(prompt, chosen, rejected)`` triple, compute the margin
4
-
5
-.. math::
6
-    m = \\log p(\\text{chosen} \\mid \\text{prompt}) - \\log p(\\text{rejected} \\mid \\text{prompt})
7
-
8
-under both base and fine-tuned views. Interesting triples are the ones
9
-where base got the sign *wrong* (``m_base < 0``); we fail if the
10
-fine-tune doesn't flip a large enough fraction of them.
11
-
12
-Triples come from either an inline ``triples:`` block in the spec or
13
-from PREFERENCE sections in :attr:`RunContext.sections`. The probe
14
-returns :attr:`Verdict.SKIP` when no triples are present — this is the
15
-"no PREFERENCE sections in your document" case, graceful by design.
16
-"""
17
-
18
-from __future__ import annotations
19
-
20
-import statistics
21
-from typing import Literal
22
-
23
-from pydantic import BaseModel, ConfigDict, Field
24
-
25
-from dlm_sway.core.result import ProbeResult, Verdict
26
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
27
-
28
-
29
-class PreferenceTriple(BaseModel):
30
-    model_config = ConfigDict(extra="forbid", frozen=True)
31
-
32
-    prompt: str
33
-    chosen: str
34
-    rejected: str
35
-
36
-
37
-class PreferenceFlipSpec(ProbeSpec):
38
-    kind: Literal["preference_flip"] = "preference_flip"
39
-    triples: list[PreferenceTriple] = Field(default_factory=list)
40
-    """Inline triples. If empty, the probe pulls from PREFERENCE
41
-    sections in ctx.sections; if neither is available the probe SKIPs."""
42
-    assert_flip_rate_gte: float = 0.7
43
-    """Fraction of *base-wrong* triples that must flip under ft."""
44
-    min_triples_for_decision: int = 3
45
-
46
-
47
-class PreferenceFlipProbe(Probe):
48
-    kind = "preference_flip"
49
-    spec_cls = PreferenceFlipSpec
50
-    category = "attribution"
51
-
52
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
53
-        assert isinstance(spec, PreferenceFlipSpec)
54
-        triples = list(spec.triples) or _triples_from_sections(ctx)
55
-        if not triples:
56
-            return ProbeResult(
57
-                name=spec.name,
58
-                kind=spec.kind,
59
-                verdict=Verdict.SKIP,
60
-                score=None,
61
-                message="no preference triples (inline or from sections)",
62
-            )
63
-
64
-        base_margins: list[float] = []
65
-        ft_margins: list[float] = []
66
-        for t in triples:
67
-            with ctx.backend.as_base() as b:
68
-                base_margins.append(
69
-                    b.logprob_of(t.prompt, t.chosen) - b.logprob_of(t.prompt, t.rejected)
70
-                )
71
-            with ctx.backend.as_finetuned() as f:
72
-                ft_margins.append(
73
-                    f.logprob_of(t.prompt, t.chosen) - f.logprob_of(t.prompt, t.rejected)
74
-                )
75
-
76
-        # Interesting denominator: base got it wrong.
77
-        base_wrong_idx = [i for i, m in enumerate(base_margins) if m < 0]
78
-        flipped_idx = [i for i in base_wrong_idx if ft_margins[i] > 0]
79
-
80
-        if len(base_wrong_idx) < spec.min_triples_for_decision:
81
-            # Not enough base-wrong triples to decide. Fall back to mean margin delta.
82
-            mean_delta = statistics.fmean(
83
-                (ft - base) for base, ft in zip(base_margins, ft_margins, strict=True)
84
-            )
85
-            verdict = Verdict.WARN
86
-            return ProbeResult(
87
-                name=spec.name,
88
-                kind=spec.kind,
89
-                verdict=verdict,
90
-                score=max(0.0, min(1.0, 0.5 + mean_delta / 4.0)),
91
-                raw=mean_delta,
92
-                base_value=statistics.fmean(base_margins),
93
-                ft_value=statistics.fmean(ft_margins),
94
-                evidence={
95
-                    "base_wrong": len(base_wrong_idx),
96
-                    "total": len(triples),
97
-                    "mean_margin_delta": mean_delta,
98
-                    "weight": spec.weight,
99
-                },
100
-                message=(
101
-                    f"only {len(base_wrong_idx)} base-wrong triples < "
102
-                    f"{spec.min_triples_for_decision} required; reporting mean-margin-delta={mean_delta:+.3f}"
103
-                ),
104
-            )
105
-
106
-        flip_rate = len(flipped_idx) / len(base_wrong_idx)
107
-        verdict = Verdict.PASS if flip_rate >= spec.assert_flip_rate_gte else Verdict.FAIL
108
-        score = min(1.0, flip_rate / max(spec.assert_flip_rate_gte, 1e-6))
109
-        return ProbeResult(
110
-            name=spec.name,
111
-            kind=spec.kind,
112
-            verdict=verdict,
113
-            score=score,
114
-            raw=flip_rate,
115
-            base_value=statistics.fmean(base_margins),
116
-            ft_value=statistics.fmean(ft_margins),
117
-            evidence={
118
-                "flip_rate": flip_rate,
119
-                "flipped": len(flipped_idx),
120
-                "base_wrong": len(base_wrong_idx),
121
-                "total": len(triples),
122
-                "weight": spec.weight,
123
-            },
124
-            message=(
125
-                f"flip_rate={flip_rate:.2%} ({len(flipped_idx)}/{len(base_wrong_idx)} "
126
-                f"base-wrong triples flipped by ft)"
127
-            ),
128
-        )
129
-
130
-
131
-def _triples_from_sections(ctx: RunContext) -> list[PreferenceTriple]:
132
-    if ctx.sections is None:
133
-        return []
134
-    out: list[PreferenceTriple] = []
135
-    for s in ctx.sections:
136
-        if s.kind != "preference":
137
-            continue
138
-        for p in s.preferences:
139
-            out.append(PreferenceTriple(prompt=p.prompt, chosen=p.chosen, rejected=p.rejected))
140
-    return out
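To make the denominator concrete, a toy run (margins invented):

    base_margins = [-0.4, -0.1, 0.3, -0.2]   # three base-wrong (negative) triples
    ft_margins   = [ 0.5,  0.2, 0.4, -0.1]   # ft flips two of the three
    # base_wrong_idx = [0, 1, 3]; flipped_idx = [0, 1]  -> flip_rate = 2/3
    # Against the default assert_flip_rate_gte=0.7: FAIL, score = min(1, (2/3)/0.7) ≈ 0.95.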
sway/src/dlm_sway/probes/prompt_collapse.pydeleted
@@ -1,159 +0,0 @@
1
-"""A3 PromptCollapse — does adapter influence decay with context length?
2
-
3
-For each test prompt we prepend irrelevant "stuffing" of varying length
4
-and measure ``divergence(base, ft)`` at the final position. A healthy
5
-adapter shows a modest, slow decay; a degenerate one collapses quickly
6
-— its signal evaporates once the base has a lot of context to lean on.
7
-
8
-We fit an exponential decay ``KL(L) = KL0 * exp(-L / tau)`` in log
9
-space and report the half-life ``tau * ln(2)`` in tokens. Pass if the half-life is at
10
-least :attr:`PromptCollapseSpec.assert_half_life_tokens` — which
11
-defaults to half the default sequence length.
12
-
13
-All math is numpy-only to avoid a scipy dependency on the install path.
14
-"""
15
-
16
-from __future__ import annotations
17
-
18
-from typing import Literal
19
-
20
-import numpy as np
21
-from pydantic import Field
22
-
23
-from dlm_sway.core.result import ProbeResult, Verdict
24
-from dlm_sway.probes._divergence import Divergence, divergence
25
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
26
-
27
-# A neutral, token-dense piece of text we prepend to stress the base
28
-# model's long-context handling. Deliberately low-information so the
29
-# "answer" at the end is the only thing driving next-token predictions.
30
-_STUFFING = (
31
-    "The following log lines are archived for historical record and have no "
32
-    "bearing on the question that follows. They are retained for audit purposes "
33
-    "only and should be ignored when forming an answer. "
34
-)
35
-
36
-
37
-class PromptCollapseSpec(ProbeSpec):
38
-    kind: Literal["prompt_collapse"] = "prompt_collapse"
39
-    prompts: list[str] = Field(default_factory=list, min_length=0)
40
-    context_lengths: list[int] = Field(
41
-        default_factory=lambda: [0, 256, 512, 1024],
42
-        min_length=2,
43
-    )
44
-    """Approximate token counts of stuffing to prepend. ≥2 required
45
-    because the exponential fit is undefined for a single point."""
46
-    divergence: Divergence = "js"
47
-    top_k: int | None = None
48
-    assert_half_life_tokens: int = 512
49
-    """Minimum half-life to pass. Default is deliberately permissive —
50
-    tune upward for high-stakes deployments."""
51
-
52
-
53
-class PromptCollapseProbe(Probe):
54
-    kind = "prompt_collapse"
55
-    spec_cls = PromptCollapseSpec
56
-    category = "adherence"
57
-
58
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
59
-        assert isinstance(spec, PromptCollapseSpec)
60
-        if not spec.prompts:
61
-            return ProbeResult(
62
-                name=spec.name,
63
-                kind=spec.kind,
64
-                verdict=Verdict.ERROR,
65
-                score=None,
66
-                message="no prompts provided",
67
-            )
68
-
69
-        top_k = spec.top_k if spec.top_k is not None else ctx.top_k
70
-        # Mean divergence at each context length.
71
-        mean_divs: list[float] = []
72
-        for ctx_len in spec.context_lengths:
73
-            prefix = _stuffing(ctx_len)
74
-            divs: list[float] = []
75
-            for prompt in spec.prompts:
76
-                full_prompt = prefix + prompt
77
-                with ctx.backend.as_base() as bv:
78
-                    base_dist = bv.next_token_dist(full_prompt, top_k=top_k)
79
-                with ctx.backend.as_finetuned() as fv:
80
-                    ft_dist = fv.next_token_dist(full_prompt, top_k=top_k)
81
-                divs.append(divergence(base_dist, ft_dist, kind=spec.divergence))
82
-            mean_divs.append(float(np.mean(divs)))
83
-
84
-        half_life = _fit_half_life(
85
-            np.asarray(spec.context_lengths, dtype=np.float64),
86
-            np.asarray(mean_divs, dtype=np.float64),
87
-        )
88
-
89
-        verdict = (
90
-            Verdict.PASS
91
-            if half_life is not None and half_life >= spec.assert_half_life_tokens
92
-            else Verdict.FAIL
93
-        )
94
-        score = _score(half_life, spec.assert_half_life_tokens)
95
-
96
-        msg = (
97
-            f"half-life={half_life:.0f} tokens"
98
-            if half_life is not None
99
-            else "could not fit exponential decay (too flat or non-monotonic)"
100
-        )
101
-        return ProbeResult(
102
-            name=spec.name,
103
-            kind=spec.kind,
104
-            verdict=verdict,
105
-            score=score,
106
-            raw=half_life,
107
-            evidence={
108
-                "context_lengths": spec.context_lengths,
109
-                "mean_divergence_per_length": mean_divs,
110
-                "divergence_kind": spec.divergence,
111
-                "weight": spec.weight,
112
-            },
113
-            message=msg,
114
-        )
115
-
116
-
117
-def _stuffing(target_tokens: int) -> str:
118
-    """Approximate target-length stuffing. 4 chars ≈ 1 token is fine
119
-    for SentencePiece-style tokenizers at the order-of-magnitude level."""
120
-    if target_tokens <= 0:
121
-        return ""
122
-    # Repeat enough copies to hit the target length in characters.
123
-    target_chars = target_tokens * 4
124
-    reps = (target_chars // len(_STUFFING)) + 1
125
-    return (_STUFFING * reps)[:target_chars] + "\n\n"
126
-
127
-
128
-def _fit_half_life(lengths: np.ndarray, divergences: np.ndarray) -> float | None:
129
-    """Fit ``y = a * exp(-x / h)`` via log-space linear regression.
130
-
131
-    Returns ``None`` if the divergences aren't strictly positive or the
132
-    fit is non-decreasing (i.e. the fine-tune got *more* distinct with
133
-    context, which invalidates the half-life concept).
134
-    """
135
-    if (divergences <= 0.0).any():
136
-        # Can't take a log; treat near-zero as too-flat-to-fit.
137
-        return None
138
-    log_y = np.log(divergences)
139
-    # Standard linear regression slope.
140
-    x_mean = float(lengths.mean())
141
-    y_mean = float(log_y.mean())
142
-    denom = float(((lengths - x_mean) ** 2).sum())
143
-    if denom == 0.0:
144
-        return None
145
-    slope = float(((lengths - x_mean) * (log_y - y_mean)).sum()) / denom
146
-    if slope >= 0.0:
147
-        # Signal grew with context — can't express as half-life.
148
-        return None
149
-    # Slope = -1/h → h = -1/slope → half_life = ln(2) * h.
150
-    import math
151
-
152
-    return float(math.log(2.0) * (-1.0 / slope))
153
-
154
-
155
-def _score(half_life: float | None, target: int) -> float:
156
-    if half_life is None:
157
-        return 0.0
158
-    # Linear ramp: half_life / target, clipped so the score saturates at 1.0 at the target.
159
-    return float(min(1.0, half_life / max(target, 1)))
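Sanity check for the fit, with divergences constructed to halve every 512 tokens:

    import numpy as np
    lengths = np.array([0.0, 256.0, 512.0, 1024.0])
    divs = 0.08 * np.exp(-lengths * np.log(2.0) / 512.0)   # 0.08, 0.057, 0.04, 0.02
    _fit_half_life(lengths, divs)   # -> 512.0, exactly the default threshold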
sway/src/dlm_sway/probes/section_internalization.pydeleted
@@ -1,189 +0,0 @@
1
-"""B1 SectionInternalizationScore — the flagship attribution primitive.
2
-
3
-For each typed section of the training document, measure *how much the
4
-fine-tune moved the needle on that section's own content* — and subtract
5
-the same metric measured on *other* sections' content. The difference is
6
-the "effective SIS": signal attributable to *this* section, not to a
7
-broader lift across the whole document.
8
-
9
-Output is a per-section bar chart. In practice users see that sections
10
-2 and 7 actually moved the model, sections 3 and 5 did nothing, and
11
-section 11 moved it but also leaked into unrelated content — actionable
12
-signal for document authoring that no other eval tool provides.
13
-
14
-Math per section ``s`` with measurement function ``m(probe_set)``:
15
-
16
-.. math::
17
-    sis_s^{own}  &= (m_{base}(s) - m_{ft}(s)) / m_{base}(s) \\\\
18
-    sis_s^{leak} &= (m_{base}(\\bar{s}) - m_{ft}(\\bar{s})) / m_{base}(\\bar{s}) \\\\
19
-    effective_s  &= sis_s^{own} - sis_s^{leak}
20
-
21
-For PROSE sections, ``m`` is the average NLL per token over the
22
-section's content. For INSTRUCTION and PREFERENCE sections, ``m`` is the
23
-average NLL per token over the answer/chosen spans given their prompts.
24
-"""
25
-
26
-from __future__ import annotations
27
-
28
-import statistics
29
-from typing import Literal
30
-
31
-from pydantic import Field
32
-
33
-from dlm_sway.core.result import ProbeResult, Verdict
34
-from dlm_sway.core.scoring import ScoringBackend
35
-from dlm_sway.core.sections import Section, SectionKind
36
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
37
-
38
-
39
-def _default_include_kinds() -> list[SectionKind]:
40
-    return ["prose", "instruction", "preference"]
41
-
42
-
43
-class SectionInternalizationSpec(ProbeSpec):
44
-    kind: Literal["section_internalization"] = "section_internalization"
45
-    include_kinds: list[SectionKind] = Field(default_factory=_default_include_kinds)
46
-    per_section_threshold: float = 0.05
47
-    """Minimum ``effective_sis`` for a section to be marked PASS."""
48
-    assert_passing_section_frac: float = 0.5
49
-    """Probe-level pass criterion: fraction of sections that must clear
50
-    the per-section threshold."""
51
-    max_prose_chars: int = 2000
52
-    """Cap the length of PROSE content we score to keep runtime bounded.
53
-    Content beyond the cap is truncated, not scored."""
54
-
55
-
56
-class SectionInternalizationProbe(Probe):
57
-    kind = "section_internalization"
58
-    spec_cls = SectionInternalizationSpec
59
-    category = "attribution"
60
-
61
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
62
-        assert isinstance(spec, SectionInternalizationSpec)
63
-        if ctx.sections is None or len(ctx.sections) == 0:
64
-            return ProbeResult(
65
-                name=spec.name,
66
-                kind=spec.kind,
67
-                verdict=Verdict.SKIP,
68
-                score=None,
69
-                message="no sections in context — provide via the .dlm bridge",
70
-            )
71
-
72
-        kinds_allowed = set(spec.include_kinds)
73
-        eligible = [s for s in ctx.sections if s.kind in kinds_allowed]
74
-        if len(eligible) < 2:
75
-            return ProbeResult(
76
-                name=spec.name,
77
-                kind=spec.kind,
78
-                verdict=Verdict.SKIP,
79
-                score=None,
80
-                message=(
81
-                    f"need ≥2 eligible sections for leak-check; got {len(eligible)} "
82
-                    f"(kinds={spec.include_kinds})"
83
-                ),
84
-            )
85
-
86
-        # Pre-compute per-section base and ft NLL-per-token to avoid
87
-        # re-running the forward pass for leak-checks.
88
-        base_nll: dict[str, float] = {}
89
-        ft_nll: dict[str, float] = {}
90
-        with ctx.backend.as_base() as base_view:
91
-            for s in eligible:
92
-                base_nll[s.id] = _section_nll(s, base_view, spec.max_prose_chars)
93
-        with ctx.backend.as_finetuned() as ft_view:
94
-            for s in eligible:
95
-                ft_nll[s.id] = _section_nll(s, ft_view, spec.max_prose_chars)
96
-
97
-        per_section: list[dict[str, float | str | bool]] = []
98
-        passing = 0
99
-        effective_scores: list[float] = []
100
-        for s in eligible:
101
-            others = [o for o in eligible if o.id != s.id]
102
-            own_lift = _relative_lift(base_nll[s.id], ft_nll[s.id])
103
-            leak_lift = statistics.fmean(
104
-                _relative_lift(base_nll[o.id], ft_nll[o.id]) for o in others
105
-            )
106
-            effective = own_lift - leak_lift
107
-            effective_scores.append(effective)
108
-            did_pass = effective >= spec.per_section_threshold
109
-            passing += int(did_pass)
110
-            per_section.append(
111
-                {
112
-                    "section_id": s.id,
113
-                    "kind": s.kind,
114
-                    "tag": s.tag or "",
115
-                    "base_nll": base_nll[s.id],
116
-                    "ft_nll": ft_nll[s.id],
117
-                    "own_lift": own_lift,
118
-                    "leak_lift": leak_lift,
119
-                    "effective_sis": effective,
120
-                    "passed": did_pass,
121
-                }
122
-            )
123
-
124
-        passing_frac = passing / len(eligible)
125
-        verdict = Verdict.PASS if passing_frac >= spec.assert_passing_section_frac else Verdict.FAIL
126
-        score = passing_frac
127
-        return ProbeResult(
128
-            name=spec.name,
129
-            kind=spec.kind,
130
-            verdict=verdict,
131
-            score=score,
132
-            raw=statistics.fmean(effective_scores),
133
-            evidence={
134
-                "per_section": per_section,
135
-                "num_sections": len(eligible),
136
-                "passing_frac": passing_frac,
137
-                "per_section_threshold": spec.per_section_threshold,
138
-                "weight": spec.weight,
139
-            },
140
-            message=(
141
-                f"{passing}/{len(eligible)} sections cleared "
142
-                f"effective_sis≥{spec.per_section_threshold:.2f} (mean={statistics.fmean(effective_scores):+.3f})"
143
-            ),
144
-        )
145
-
146
-
147
-def _section_nll(s: Section, view: ScoringBackend, max_prose_chars: int) -> float:
148
-    """Average NLL per token for the section's content under ``view``."""
149
-    if s.kind == "prose":
150
-        return _prose_nll(s.content[:max_prose_chars], view)
151
-    if s.kind == "instruction":
152
-        if not s.probes:
153
-            return _prose_nll(s.content[:max_prose_chars], view)
154
-        return statistics.fmean(
155
-            -view.logprob_of(p.prompt, p.gold) / max(_token_estimate(p.gold), 1) for p in s.probes
156
-        )
157
-    if s.kind == "preference":
158
-        if not s.preferences:
159
-            return _prose_nll(s.content[:max_prose_chars], view)
160
-        return statistics.fmean(
161
-            -view.logprob_of(p.prompt, p.chosen) / max(_token_estimate(p.chosen), 1)
162
-            for p in s.preferences
163
-        )
164
-    raise ValueError(f"unknown section kind: {s.kind!r}")
165
-
166
-
167
-def _prose_nll(text: str, view: ScoringBackend) -> float:
168
-    """Negative-mean-logprob over ``text``. Returns 0 for empty input."""
169
-    if not text.strip():
170
-        return 0.0
171
-    r = view.rolling_logprob(text)
172
-    return -r.mean_logprob
173
-
174
-
175
-def _relative_lift(base_nll: float, ft_nll: float) -> float:
176
-    """``(base - ft) / base``. Positive → ft is lower-PPL than base.
177
-
178
-    Falls back to an absolute delta when ``base`` is pathological
179
-    (zero or negative), so the probe doesn't crash on degenerate
180
-    inputs.
181
-    """
182
-    if base_nll <= 0.0:
183
-        return float(base_nll - ft_nll)
184
-    return float((base_nll - ft_nll) / base_nll)
185
-
186
-
187
-def _token_estimate(s: str) -> int:
188
-    """Approximate tokens for normalization. Good enough for SentencePiece-ish vocabs."""
189
-    return max(1, len(s) // 4)
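A two-section worked example (NLLs invented):

    own_a  = _relative_lift(3.0, 2.40)   # 0.20: section A's own content got 20% easier
    leak_a = _relative_lift(3.0, 2.91)   # 0.03: the lift that leaked into section B
    effective_a = own_a - leak_a         # 0.17 >= 0.05 default threshold -> section PASS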
sway/src/dlm_sway/probes/style_fingerprint.pydeleted
@@ -1,179 +0,0 @@
1
-"""C1 StyleFingerprint — does ft prose *read* like the doc?
2
-
3
-Generates base and ft completions from a set of stylistic prompts,
4
-extracts a 6-dimensional fingerprint from each, and measures how far the ft
5
-fingerprint has shifted **toward** the training document's own
6
-fingerprint, relative to the base.
7
-
8
-We compute the fingerprint with numpy-only features so the probe works
9
-out of the box without spaCy/textstat. The optional ``style`` extra
10
-upgrades the fingerprint with passive-voice rate and POS-entropy in a
11
-later milestone; the numeric contract — a non-negative vector per text
12
-— is stable across that upgrade.
13
-
14
-Signal: ``style_shift = cos(ft_fp - base_fp, doc_fp - base_fp)`` in
15
-fingerprint space. Positive values mean ft has moved *toward* the
16
-doc's style; negative values mean it moved *away* (a bad sign);
17
-near-zero means no stylistic shift detectable.
18
-"""
19
-
20
-from __future__ import annotations
21
-
22
-import re
23
-import statistics
24
-from typing import Literal
25
-
26
-import numpy as np
27
-from numpy.typing import NDArray
28
-from pydantic import Field
29
-
30
-from dlm_sway.core.result import ProbeResult, Verdict
31
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
32
-
33
-_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
34
-_PARAGRAPH_SPLIT = re.compile(r"\n\s*\n")
35
-_WORD_RE = re.compile(r"\b[A-Za-z][A-Za-z'-]*\b")
36
-_PUNCTS = set(".,:;!?-—()[]\"'/")
37
-
38
-
39
-def fingerprint(text: str) -> NDArray[np.float64]:
40
-    """Return a 6-dim stylistic fingerprint for ``text``.
41
-
42
-    Dimensions (all numeric, scaled to order-1):
43
-      0. mean sentence length (words)  / 30.0
44
-      1. std sentence length (words)   / 30.0
45
-      2. type-token ratio              (already in [0,1])
46
-      3. avg word length (chars)       / 10.0
47
-      4. punctuation density per char  * 10.0
48
-      5. paragraph density (1 / avg paragraph length in words) * 30.0
49
-    """
50
-    if not text.strip():
51
-        return np.zeros(6, dtype=np.float64)
52
-
53
-    sentences = [s for s in _SENTENCE_SPLIT.split(text) if s.strip()]
54
-    paragraphs = [p for p in _PARAGRAPH_SPLIT.split(text) if p.strip()]
55
-    words = _WORD_RE.findall(text)
56
-    if not words:
57
-        return np.zeros(6, dtype=np.float64)
58
-
59
-    sentence_word_counts = [len(_WORD_RE.findall(s)) for s in sentences]
60
-    sentence_word_counts = [c for c in sentence_word_counts if c > 0]
61
-    if not sentence_word_counts:
62
-        sentence_word_counts = [len(words)]
63
-
64
-    mean_sent = statistics.fmean(sentence_word_counts)
65
-    std_sent = statistics.pstdev(sentence_word_counts) if len(sentence_word_counts) > 1 else 0.0
66
-    ttr = len({w.lower() for w in words}) / len(words)
67
-    avg_word_len = statistics.fmean(len(w) for w in words)
68
-    punct_count = sum(ch in _PUNCTS for ch in text)
69
-    punct_density = punct_count / max(len(text), 1)
70
-    avg_paragraph_len = (
71
-        statistics.fmean(len(_WORD_RE.findall(p)) for p in paragraphs) if paragraphs else len(words)
72
-    )
73
-    paragraph_density = 1.0 / max(avg_paragraph_len, 1.0)
74
-
75
-    return np.asarray(
76
-        [
77
-            mean_sent / 30.0,
78
-            std_sent / 30.0,
79
-            ttr,
80
-            avg_word_len / 10.0,
81
-            punct_density * 10.0,
82
-            paragraph_density * 30.0,
83
-        ],
84
-        dtype=np.float64,
85
-    )
86
-
87
-
88
-class StyleFingerprintSpec(ProbeSpec):
89
-    kind: Literal["style_fingerprint"] = "style_fingerprint"
90
-    prompts: list[str] = Field(default_factory=list)
91
-    """Prompts used to elicit a stylistic sample from each model."""
92
-    doc_reference: str = ""
93
-    """Concatenated reference text representing the adapter's intended
94
-    style. Typically the document itself; the .dlm bridge supplies this
95
-    from ``ctx.doc_text`` when left empty."""
96
-    max_new_tokens: int = 128
97
-    assert_shift_gte: float = 0.25
98
-    """Minimum cosine shift for PASS. ``0.25`` is a deliberately
99
-    permissive default — stylistic shift is a weaker signal than
100
-    perplexity lift."""
101
-
102
-
103
-class StyleFingerprintProbe(Probe):
104
-    kind = "style_fingerprint"
105
-    spec_cls = StyleFingerprintSpec
106
-    category = "calibration"
107
-
108
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
109
-        assert isinstance(spec, StyleFingerprintSpec)
110
-        if not spec.prompts:
111
-            return ProbeResult(
112
-                name=spec.name,
113
-                kind=spec.kind,
114
-                verdict=Verdict.ERROR,
115
-                score=None,
116
-                message="no prompts provided",
117
-            )
118
-        doc_text = spec.doc_reference or (ctx.doc_text or "")
119
-        if not doc_text.strip():
120
-            return ProbeResult(
121
-                name=spec.name,
122
-                kind=spec.kind,
123
-                verdict=Verdict.SKIP,
124
-                score=None,
125
-                message="no doc_reference (inline or from ctx.doc_text)",
126
-            )
127
-
128
-        base_samples: list[str] = []
129
-        ft_samples: list[str] = []
130
-        for prompt in spec.prompts:
131
-            with ctx.backend.as_base() as b:
132
-                base_samples.append(
133
-                    b.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
134
-                )
135
-            with ctx.backend.as_finetuned() as f:
136
-                ft_samples.append(
137
-                    f.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
138
-                )
139
-
140
-        base_fp = fingerprint("\n".join(base_samples))
141
-        ft_fp = fingerprint("\n".join(ft_samples))
142
-        doc_fp = fingerprint(doc_text)
143
-
144
-        shift = _cosine_shift(base_fp, ft_fp, doc_fp)
145
-        verdict = Verdict.PASS if shift >= spec.assert_shift_gte else Verdict.FAIL
146
-        score = float(np.clip((shift + 1.0) / 2.0, 0.0, 1.0))
147
-
148
-        return ProbeResult(
149
-            name=spec.name,
150
-            kind=spec.kind,
151
-            verdict=verdict,
152
-            score=score,
153
-            raw=shift,
154
-            evidence={
155
-                "base_fp": base_fp.tolist(),
156
-                "ft_fp": ft_fp.tolist(),
157
-                "doc_fp": doc_fp.tolist(),
158
-                "style_shift": shift,
159
-                "weight": spec.weight,
160
-            },
161
-            message=(
162
-                f"style_shift={shift:+.2f} "
163
-                f"({'toward' if shift > 0 else 'away from'} doc, "
164
-                f"threshold={spec.assert_shift_gte})"
165
-            ),
166
-        )
167
-
168
-
169
-def _cosine_shift(
170
-    base: NDArray[np.float64], ft: NDArray[np.float64], doc: NDArray[np.float64]
171
-) -> float:
172
-    """Cosine between (ft - base) and (doc - base) in fingerprint space."""
173
-    a = ft - base
174
-    b = doc - base
175
-    na = float(np.linalg.norm(a))
176
-    nb = float(np.linalg.norm(b))
177
-    if na == 0.0 or nb == 0.0:
178
-        return 0.0
179
-    return float(np.dot(a, b) / (na * nb))
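The geometry in a toy fingerprint space (vectors invented):

    import numpy as np
    base = np.array([0.5, 0.2, 0.6, 0.5, 0.3, 0.4])
    doc  = np.array([0.8, 0.3, 0.5, 0.6, 0.5, 0.2])
    ft   = base + 0.4 * (doc - base)      # ft moved partway toward the doc
    _cosine_shift(base, ft, doc)          # -> 1.0: shift perfectly aligned with the doc
    _cosine_shift(base, base, doc)        # -> 0.0: no movement at all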
sway/src/dlm_sway/py.typeddeleted
sway/src/dlm_sway/suite/__init__.pydeleted
@@ -1,1 +0,0 @@
1
-"""Suite plumbing: spec models, loader, runner, report, composite score."""
sway/src/dlm_sway/suite/loader.pydeleted
@@ -1,48 +0,0 @@
1
-"""Load + validate a ``sway.yaml`` into a :class:`SwaySpec`.
2
-
3
-Separated from :mod:`spec` so the data models stay trivially
4
-importable (no YAML dependency at import time for callers that
5
-construct specs programmatically).
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-from pathlib import Path
11
-from typing import Any
12
-
13
-import yaml
14
-from pydantic import ValidationError
15
-
16
-from dlm_sway.core.errors import SpecValidationError
17
-from dlm_sway.suite.spec import SwaySpec
18
-
19
-
20
-def load_spec(path: Path | str) -> SwaySpec:
21
-    """Parse ``path`` and return a validated :class:`SwaySpec`."""
22
-    resolved = Path(path).expanduser().resolve()
23
-    try:
24
-        raw_text = resolved.read_text(encoding="utf-8")
25
-    except FileNotFoundError as exc:
26
-        raise SpecValidationError(f"spec file not found: {resolved}", source=str(path)) from exc
27
-
28
-    try:
29
-        data = yaml.safe_load(raw_text)
30
-    except yaml.YAMLError as exc:
31
-        raise SpecValidationError(f"invalid YAML: {exc}", source=str(path)) from exc
32
-
33
-    if not isinstance(data, dict):
34
-        raise SpecValidationError("top-level document must be a mapping", source=str(path))
35
-    return from_dict(data, source=str(path))
36
-
37
-
38
-def from_dict(data: dict[str, Any], *, source: str | None = None) -> SwaySpec:
39
-    """Validate a dict (already parsed from YAML or JSON) as a SwaySpec."""
40
-    try:
41
-        spec = SwaySpec.model_validate(data)
42
-    except ValidationError as exc:
43
-        raise SpecValidationError(str(exc), source=source) from exc
44
-    try:
45
-        spec.check_version()
46
-    except ValueError as exc:
47
-        raise SpecValidationError(str(exc), source=source) from exc
48
-    return spec
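Typical call site (the spec filename is illustrative):

    from pathlib import Path
    from dlm_sway.core.errors import SpecValidationError
    from dlm_sway.suite.loader import load_spec

    try:
        spec = load_spec(Path("sway.yaml"))
    except SpecValidationError as exc:
        raise SystemExit(f"bad spec: {exc}")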
sway/src/dlm_sway/suite/report.pydeleted
@@ -1,249 +0,0 @@
1
-"""Report emitters: terminal (rich), JSON, JUnit XML, markdown.
2
-
3
-The terminal renderer is the one a user sees; it's the product surface.
4
-It must communicate the verdict *and* the supporting evidence without
5
-forcing the user to open the JSON.
6
-
7
-JSON is the machine-readable source of truth — same fields as the
8
-:class:`SuiteResult` dataclass but flattened for easy downstream parsing
9
-(dashboards, diff tools, history tracking).
10
-
11
-JUnit XML exists to drop into CI pipelines so ``dlm-sway gate``
12
-integrates with existing test dashboards with no extra glue.
13
-"""
14
-
15
-from __future__ import annotations
16
-
17
-import json
18
-import xml.etree.ElementTree as ET
19
-from io import StringIO
20
-from typing import Any
21
-
22
-from rich.console import Console
23
-from rich.panel import Panel
24
-from rich.table import Table
25
-from rich.text import Text
26
-
27
-from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict
28
-
29
-_VERDICT_STYLE = {
30
-    Verdict.PASS: "bold green",
31
-    Verdict.FAIL: "bold red",
32
-    Verdict.WARN: "bold yellow",
33
-    Verdict.SKIP: "dim",
34
-    Verdict.ERROR: "bold magenta",
35
-}
36
-
37
-
38
-def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None:
39
-    """Render the report to a rich Console (stdout by default)."""
40
-    c = console or Console()
41
-
42
-    header = Text.assemble(
43
-        ("dlm-sway report — ", "bold"),
44
-        (suite.base_model_id, "cyan"),
45
-        ("  vs  ", "dim"),
46
-        (_adapter_label(suite.adapter_id), "cyan"),
47
-    )
48
-    c.print(Panel(header, expand=False, border_style="blue"))
49
-
50
-    c.print()
51
-    c.print(
52
-        Text.assemble(
53
-            ("overall: ", "bold"),
54
-            (f"{score.overall:.2f}", _score_style(score.overall)),
55
-            ("  ", ""),
56
-            (f"[ {score.band} ]", _band_style(score.band)),
57
-        )
58
-    )
59
-
60
-    # Component breakdown
61
-    comp_table = Table.grid(padding=(0, 2))
62
-    comp_table.add_column(justify="left")
63
-    comp_table.add_column(justify="right")
64
-    comp_table.add_column()
65
-    for cat in ("adherence", "attribution", "calibration", "ablation", "baseline"):
66
-        if cat not in score.components:
67
-            continue
68
-        v = score.components[cat]
69
-        comp_table.add_row(cat, f"{v:.2f}", _bar(v))
70
-    c.print(comp_table)
71
-
72
-    c.print()
73
-    # Per-probe detail
74
-    detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1))
75
-    detail.add_column("name", style="cyan")
76
-    detail.add_column("kind", style="dim")
77
-    detail.add_column("verdict")
78
-    detail.add_column("score", justify="right")
79
-    detail.add_column("raw", justify="right")
80
-    detail.add_column("z", justify="right")
81
-    detail.add_column("note", style="dim")
82
-    for r in suite.probes:
83
-        detail.add_row(
84
-            r.name,
85
-            r.kind,
86
-            Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]),
87
-            f"{r.score:.2f}" if r.score is not None else "—",
88
-            f"{r.raw:.3f}" if r.raw is not None else "—",
89
-            f"{r.z_score:+.2f}σ" if r.z_score is not None else "—",
90
-            (r.message[:80] + "…") if len(r.message) > 80 else r.message,
91
-        )
92
-    c.print(detail)
93
-
94
-    if score.findings:
95
-        c.print()
96
-        c.print(Text("top findings:", style="bold"))
97
-        for i, f in enumerate(score.findings, start=1):
98
-            c.print(f"  {i}. {f}")
99
-
100
-    c.print()
101
-    c.print(Text(f"wall: {suite.wall_seconds:.2f}s  |  sway {suite.sway_version}", style="dim"))
102
-
103
-
104
-def to_json(suite: SuiteResult, score: SwayScore) -> str:
105
-    """Serialize the suite + composite score as JSON.
106
-
107
-    Stable schema; downstream tools rely on it. Breaking changes bump a
108
-    ``schema_version`` field (not yet present — this is v0.1).
109
-    """
110
-    return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True)
111
-
112
-
113
-def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]:
114
-    return {
115
-        "schema_version": 1,
116
-        "sway_version": suite.sway_version,
117
-        "spec_path": suite.spec_path,
118
-        "base_model_id": suite.base_model_id,
119
-        "adapter_id": suite.adapter_id,
120
-        "started_at": suite.started_at.isoformat(),
121
-        "finished_at": suite.finished_at.isoformat(),
122
-        "wall_seconds": suite.wall_seconds,
123
-        "score": {
124
-            "overall": score.overall,
125
-            "band": score.band,
126
-            "components": score.components,
127
-            "weights": score.weights,
128
-            "findings": list(score.findings),
129
-        },
130
-        "null_stats": suite.null_stats,
131
-        "probes": [_probe_to_jsonable(p) for p in suite.probes],
132
-    }
133
-
134
-
135
-def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]:
136
-    return {
137
-        "name": r.name,
138
-        "kind": r.kind,
139
-        "verdict": r.verdict.value,
140
-        "score": r.score,
141
-        "raw": r.raw,
142
-        "z_score": r.z_score,
143
-        "base_value": r.base_value,
144
-        "ft_value": r.ft_value,
145
-        "evidence": r.evidence,
146
-        "message": r.message,
147
-        "duration_s": r.duration_s,
148
-    }
149
-
150
-
151
-def to_junit(suite: SuiteResult, score: SwayScore) -> str:
152
-    """Serialize as JUnit XML. One ``<testcase>`` per probe."""
153
-    testsuite = ET.Element(
154
-        "testsuite",
155
-        {
156
-            "name": "dlm-sway",
157
-            "tests": str(len(suite.probes)),
158
-            "failures": str(sum(1 for p in suite.probes if p.verdict == Verdict.FAIL)),
159
-            "errors": str(sum(1 for p in suite.probes if p.verdict == Verdict.ERROR)),
160
-            "skipped": str(sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)),
161
-            "time": f"{suite.wall_seconds:.3f}",
162
-        },
163
-    )
164
-    # Properties — the composite score and category breakdown.
165
-    props = ET.SubElement(testsuite, "properties")
166
-    ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"})
167
-    ET.SubElement(props, "property", {"name": "band", "value": score.band})
168
-    for cat, v in score.components.items():
169
-        ET.SubElement(props, "property", {"name": f"component.{cat}", "value": f"{v:.4f}"})
170
-
171
-    for r in suite.probes:
172
-        tc = ET.SubElement(
173
-            testsuite,
174
-            "testcase",
175
-            {"classname": r.kind, "name": r.name, "time": f"{r.duration_s:.3f}"},
176
-        )
177
-        if r.verdict == Verdict.FAIL:
178
-            ET.SubElement(tc, "failure", {"message": r.message or "failed"})
179
-        elif r.verdict == Verdict.ERROR:
180
-            ET.SubElement(tc, "error", {"message": r.message or "errored"})
181
-        elif r.verdict == Verdict.SKIP:
182
-            ET.SubElement(tc, "skipped", {"message": r.message or "skipped"})
183
-
184
-    return ET.tostring(testsuite, encoding="unicode")
185
-
186
-
187
-def to_markdown(suite: SuiteResult, score: SwayScore) -> str:
188
-    """A portable, CI-friendly markdown report."""
189
-    buf = StringIO()
190
-    buf.write("# dlm-sway report\n\n")
191
-    buf.write(f"**Overall:** {score.overall:.2f} (`{score.band}`)  \n")
192
-    buf.write(f"**Base:** `{suite.base_model_id}`  \n")
193
-    buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}`  \n")
194
-    buf.write(f"**Wall:** {suite.wall_seconds:.2f}s  \n\n")
195
-
196
-    buf.write("## Components\n\n")
197
-    buf.write("| category | score |\n|---|---:|\n")
198
-    for cat, v in score.components.items():
199
-        buf.write(f"| {cat} | {v:.2f} |\n")
200
-    buf.write("\n## Probes\n\n")
201
-    buf.write("| name | kind | verdict | score | note |\n|---|---|---|---:|---|\n")
202
-    for r in suite.probes:
203
-        buf.write(
204
-            f"| {r.name} | `{r.kind}` | {r.verdict.value} | "
205
-            f"{f'{r.score:.2f}' if r.score is not None else '—'} | "
206
-            f"{r.message[:60]} |\n"
207
-        )
208
-    if score.findings:
209
-        buf.write("\n## Top findings\n\n")
210
-        for f in score.findings:
211
-            buf.write(f"- {f}\n")
212
-    return buf.getvalue()
213
-
214
-
215
-# -- helpers -----------------------------------------------------------
216
-
217
-
218
-def _adapter_label(adapter_id: str) -> str:
219
-    if not adapter_id:
220
-        return "(base only)"
221
-    # Only the trailing path chunk is useful in the header.
222
-    parts = adapter_id.rstrip("/").split("/")
223
-    return "/".join(parts[-3:]) if len(parts) > 3 else adapter_id
224
-
225
-
226
-def _score_style(v: float) -> str:
227
-    if v >= 0.6:
228
-        return "bold green"
229
-    if v >= 0.3:
230
-        return "bold yellow"
231
-    return "bold red"
232
-
233
-
234
-def _band_style(band: str) -> str:
235
-    return {
236
-        "noise": "red",
237
-        "partial": "yellow",
238
-        "healthy": "green",
239
-        "suspicious": "magenta",
240
-    }.get(band, "white")
241
-
242
-
243
-def _bar(v: float, *, width: int = 10) -> str:
244
-    clamped = max(0.0, min(1.0, v))
245
-    filled = int(round(clamped * width))
246
-    return "█" * filled + "░" * (width - filled)
247
-
248
-
249
-__all__ = ["to_terminal", "to_json", "to_junit", "to_markdown"]
sway/src/dlm_sway/suite/runner.pydeleted
@@ -1,136 +0,0 @@
1
-"""Suite runner.
2
-
3
-Iterates the probe list, materializes each into a ``(Probe, Spec)`` via
4
-the registry, executes it with a :class:`~dlm_sway.probes.base.RunContext`,
5
-and assembles a :class:`~dlm_sway.core.result.SuiteResult`.
6
-
7
-Runtime contract:
8
-
9
-- Probes are executed in declaration order (not sorted, not parallelized).
10
-  The null-adapter baseline has to run before any probe that needs z-scores,
11
-  so authoring order is load-bearing.
12
-- A probe that raises is recorded as
13
-  :attr:`~dlm_sway.core.result.Verdict.ERROR` and the suite continues —
14
-  one broken probe doesn't torch the whole report.
15
-- The backend is the caller's responsibility: the runner does not build
16
-  or close it, so callers can reuse a backend across multiple suites.
17
-"""
18
-
19
-from __future__ import annotations
20
-
21
-import time
22
-
23
-from dlm_sway import __version__
24
-from dlm_sway.core.errors import ProbeError
25
-from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow
26
-from dlm_sway.core.scoring import DifferentialBackend
27
-from dlm_sway.core.sections import Section
28
-from dlm_sway.probes.base import RunContext, build_probe
29
-from dlm_sway.probes.null_adapter import NullAdapterSpec, get_null_stats
30
-from dlm_sway.suite.spec import SwaySpec
31
-
32
-
33
-def run(
34
-    spec: SwaySpec,
35
-    backend: DifferentialBackend,
36
-    *,
37
-    spec_path: str = "<memory>",
38
-    doc_text: str | None = None,
39
-    sections: tuple[Section, ...] | None = None,
40
-) -> SuiteResult:
41
-    """Execute every probe in ``spec`` against ``backend``."""
42
-    started = utcnow()
43
-    ctx = RunContext(
44
-        backend=backend,
45
-        seed=spec.defaults.seed,
46
-        top_k=spec.defaults.top_k,
47
-        sections=sections,
48
-        doc_text=doc_text,
49
-    )
50
-
51
-    results: list[ProbeResult] = []
52
-    null_stats: dict[str, dict[str, float]] = {}
53
-
54
-    for raw in spec.suite:
55
-        probe, probe_spec = build_probe(raw)
56
-        if not probe_spec.enabled:
57
-            results.append(
58
-                ProbeResult(
59
-                    name=probe_spec.name,
60
-                    kind=probe_spec.kind,
61
-                    verdict=Verdict.SKIP,
62
-                    score=None,
63
-                    message="disabled in spec",
64
-                )
65
-            )
66
-            continue
67
-
68
-        t0 = time.perf_counter()
69
-        try:
70
-            result = probe.run(probe_spec, ctx)
71
-        except ProbeError as exc:
72
-            result = ProbeResult(
73
-                name=probe_spec.name,
74
-                kind=probe_spec.kind,
75
-                verdict=Verdict.ERROR,
76
-                score=None,
77
-                message=str(exc),
78
-            )
79
-        except Exception as exc:  # noqa: BLE001 — probe impls may raise anything
80
-            result = ProbeResult(
81
-                name=probe_spec.name,
82
-                kind=probe_spec.kind,
83
-                verdict=Verdict.ERROR,
84
-                score=None,
85
-                message=f"{type(exc).__name__}: {exc}",
86
-            )
87
-        duration = time.perf_counter() - t0
88
-        # Re-stamp duration (probes don't know their own wall time).
89
-        result = _with_duration(result, duration)
90
-        results.append(result)
91
-
92
-        # Null-adapter result seeds ctx.null_stats for subsequent probes.
93
-        if isinstance(probe_spec, NullAdapterSpec) and result.evidence.get("null_stats"):
94
-            null_stats.update(result.evidence["null_stats"])
95
-            # RunContext is frozen; swap in a fresh one so later probes
96
-            # see the populated stats.
97
-            ctx = RunContext(
98
-                backend=ctx.backend,
99
-                seed=ctx.seed,
100
-                top_k=ctx.top_k,
101
-                sections=ctx.sections,
102
-                doc_text=ctx.doc_text,
103
-                null_stats=null_stats,
104
-            )
105
-
106
-    finished = utcnow()
107
-    return SuiteResult(
108
-        spec_path=spec_path,
109
-        started_at=started,
110
-        finished_at=finished,
111
-        base_model_id=spec.models.base.base,
112
-        adapter_id=str(spec.models.ft.adapter) if spec.models.ft.adapter else "",
113
-        sway_version=__version__,
114
-        probes=tuple(results),
115
-        null_stats=null_stats,
116
-    )
117
-
118
-
119
-def _with_duration(result: ProbeResult, duration: float) -> ProbeResult:
120
-    """Return a copy of ``result`` with :attr:`ProbeResult.duration_s` set."""
121
-    return ProbeResult(
122
-        name=result.name,
123
-        kind=result.kind,
124
-        verdict=result.verdict,
125
-        score=result.score,
126
-        raw=result.raw,
127
-        z_score=result.z_score,
128
-        base_value=result.base_value,
129
-        ft_value=result.ft_value,
130
-        evidence=result.evidence,
131
-        message=result.message,
132
-        duration_s=duration,
133
-    )
134
-
135
-
136
-__all__ = ["get_null_stats", "run"]
sway/src/dlm_sway/suite/score.pydeleted
@@ -1,106 +0,0 @@
1
-"""Composite :class:`~dlm_sway.core.result.SwayScore` from a suite result.
2
-
3
-The score is a weighted mean over four categories
4
-(adherence / attribution / calibration / ablation). Each category's
5
-value is the weighted mean of its pass/score values (with SKIP/ERROR
6
-excluded so a broken probe doesn't silently depress the composite).
7
-
8
-All weighting is explicit, user-overridable, and surfaced in the report
9
-alongside the number — no black-box scoring.
10
-"""
11
-
12
-from __future__ import annotations
13
-
14
-from dlm_sway.core.result import (
15
-    DEFAULT_COMPONENT_WEIGHTS,
16
-    ProbeResult,
17
-    SuiteResult,
18
-    SwayScore,
19
-    Verdict,
20
-)
21
-from dlm_sway.probes.base import registry
22
-
23
-
24
-def compute(
25
-    suite: SuiteResult,
26
-    *,
27
-    weights: dict[str, float] | None = None,
28
-) -> SwayScore:
29
-    """Fold a :class:`SuiteResult` into a :class:`SwayScore`."""
30
-    w = weights if weights is not None else dict(DEFAULT_COMPONENT_WEIGHTS)
31
-    registered = registry()
32
-
33
-    # Bucket probes by their declared category.
34
-    buckets: dict[str, list[ProbeResult]] = {k: [] for k in w}
35
-    for r in suite.probes:
36
-        if r.verdict in {Verdict.SKIP, Verdict.ERROR}:
37
-            continue
38
-        if r.score is None:
39
-            continue
40
-        probe_cls = registered.get(r.kind)
41
-        category = probe_cls.category if probe_cls is not None else "adherence"
42
-        buckets.setdefault(category, []).append(r)
43
-
44
-    component_scores: dict[str, float] = {}
45
-    for cat, probes in buckets.items():
46
-        if not probes:
47
-            component_scores[cat] = 0.0
48
-            continue
49
-        total_w = sum(max(_spec_weight(p), 0.0) for p in probes) or 1.0
50
-        weighted = sum(max(_spec_weight(p), 0.0) * (p.score or 0.0) for p in probes)
51
-        component_scores[cat] = weighted / total_w
52
-
53
-    # Fold to composite, weighted by the user's category weights, but
54
-    # ignoring components that had no contributing probes (so a
55
-    # PREFERENCE-free document doesn't get penalized for missing B3).
56
-    active_weights = {k: v for k, v in w.items() if buckets.get(k)}
57
-    total_w = sum(active_weights.values()) or 1.0
58
-    overall = sum(active_weights[k] * component_scores[k] for k in active_weights) / total_w
59
-
60
-    findings = _findings(suite, component_scores)
61
-
62
-    return SwayScore(
63
-        overall=overall,
64
-        components=component_scores,
65
-        weights=w,
66
-        band=SwayScore.band_for(overall),
67
-        findings=findings,
68
-    )
69
-
70
-
71
-def _spec_weight(result: ProbeResult) -> float:
72
-    """Recover a probe's declared weight from its ``evidence`` payload.
73
-
74
-    The runner stores ``spec.weight`` on evidence so the scorer can read
75
-    it without re-validating specs. Falls back to 1.0 when absent (older
76
-    runs, custom probes, etc.).
77
-    """
78
-    w = result.evidence.get("weight")
79
-    if isinstance(w, int | float):
80
-        return float(w)
81
-    return 1.0
82
-
83
-
84
-def _findings(suite: SuiteResult, components: dict[str, float]) -> tuple[str, ...]:
85
-    """Surface the 2–3 most diagnostic notes for the terminal report."""
86
-    notes: list[str] = []
87
-
88
-    failed = [r for r in suite.probes if r.verdict == Verdict.FAIL]
89
-    if failed:
90
-        top = failed[0]
91
-        notes.append(
92
-            f"{top.name} ({top.kind}) failed" + (f": {top.message}" if top.message else "")
93
-        )
94
-
95
-    for cat, score in components.items():
96
-        if score < 0.3 and components.get(cat, 1.0) != 0.0:
97
-            notes.append(f"{cat} score is {score:.2f} — below the noise threshold")
98
-
99
-    errors = [r for r in suite.probes if r.verdict == Verdict.ERROR]
100
-    if errors:
101
-        notes.append(f"{len(errors)} probe(s) errored — see full report for details")
102
-
103
-    return tuple(notes[:5])
104
-
105
-
106
-__all__ = ["compute"]
sway/src/dlm_sway/suite/spec.pydeleted
@@ -1,72 +0,0 @@
1
-"""Top-level ``sway.yaml`` spec models.
2
-
3
-Per-probe specs live next to their implementations in
4
-:mod:`dlm_sway.probes`. This module owns the *outer* envelope —
5
-``version``, ``models``, ``defaults``, ``suite`` — plus the runtime
6
-bind between raw probe dicts and registered probe classes.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-from typing import Annotated, Any
12
-
13
-from pydantic import BaseModel, ConfigDict, Field
14
-
15
-from dlm_sway.core.model import ModelSpec
16
-
17
-SUPPORTED_VERSION = 1
18
-
19
-
20
-class SuiteModels(BaseModel):
21
-    """Named model handles the suite references — ``base`` + ``ft``."""
22
-
23
-    model_config = ConfigDict(extra="forbid", frozen=True)
24
-
25
-    base: ModelSpec
26
-    ft: ModelSpec
27
-
28
-
29
-class SuiteDefaults(BaseModel):
30
-    """Shared defaults for the whole suite. Probes may override per-entry."""
31
-
32
-    model_config = ConfigDict(extra="forbid", frozen=True)
33
-
34
-    seed: int = 0
35
-    top_k: int = 256
36
-    differential: bool = True
37
-    """If ``False``, the runner loads base + ft as two separate models
38
-    instead of toggling on one. More memory-heavy; only useful when a
39
-    backend can't do in-place toggling."""
40
-    coverage_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6
41
-    """Minimum composite score for ``dlm-sway gate`` to pass."""
42
-
43
-
44
-class SwaySpec(BaseModel):
45
-    """Root of ``sway.yaml``."""
46
-
47
-    model_config = ConfigDict(extra="forbid", frozen=True)
48
-
49
-    version: int = 1
50
-    models: SuiteModels
51
-    defaults: SuiteDefaults = SuiteDefaults()
52
-    suite: list[dict[str, Any]] = Field(default_factory=list)
53
-    """Raw probe entries. Validated one-at-a-time by the probe registry
54
-    via :func:`dlm_sway.probes.base.build_probe` so that the set of
55
-    allowed probe kinds is an open registry rather than a closed
56
-    discriminated union."""
57
-    dlm_source: str | None = None
58
-    """Optional path to a ``.dlm`` file. When present, the runner asks
59
-    :mod:`dlm_sway.integrations.dlm.resolver` for typed sections and
60
-    hands them to probes via :attr:`RunContext.sections`. Auto-populated
61
-    by ``dlm-sway autogen``."""
62
-
63
-    def check_version(self) -> None:
64
-        """Raise ``ValueError`` if the spec version is unsupported.
65
-
66
-        Called explicitly by the loader after validation so the error
67
-        surfaces with a loader-source tag rather than a pydantic stack.
68
-        """
69
-        if self.version != SUPPORTED_VERSION:
70
-            raise ValueError(
71
-                f"unsupported sway spec version: {self.version} (this build supports {SUPPORTED_VERSION})"
72
-            )
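Editorial note: the unit tests later in this commit exercise this envelope by validating plain dicts; a minimal in-repo usage sketch (model ids and paths are placeholders):

```python
from dlm_sway.suite.spec import SwaySpec

spec = SwaySpec.model_validate(
    {
        "version": 1,
        "models": {
            "base": {"base": "some/base-model"},
            "ft": {"base": "some/base-model", "adapter": "/tmp/adapter"},
        },
        "suite": [{"name": "dk", "kind": "delta_kl", "prompts": ["p1"]}],
    }
)
spec.check_version()  # raises ValueError for anything but SUPPORTED_VERSION
assert spec.defaults.top_k == 256  # shared defaults apply when omitted
```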
sway/src/dlm_sway/visualize.pydeleted
@@ -1,137 +0,0 @@
1
-"""Optional matplotlib-based visualizations.
2
-
3
-Behind the ``viz`` extra. Three functions cover the three plots that
4
-make the sway report come alive in a notebook or saved PNG:
5
-
6
-- :func:`plot_section_sis`: per-section bar chart of effective SIS
7
-  (the flagship attribution view).
8
-- :func:`plot_adapter_ablation`: the λ-scaled divergence curve — the
9
-  sway signature plot.
10
-- :func:`plot_kl_histogram`: distribution of per-prompt KL divergences
11
-  (the raw data behind A1 DeltaKL).
12
-
13
-Each function raises :class:`~dlm_sway.core.errors.BackendNotAvailableError`
14
-with a pip hint when matplotlib isn't installed. No function writes to
15
-disk on your behalf — the caller decides (``fig.savefig(...)``).
16
-"""
17
-
18
-from __future__ import annotations
19
-
20
-from typing import Any
21
-
22
-from dlm_sway.core.errors import BackendNotAvailableError
23
-from dlm_sway.core.result import SuiteResult
24
-
25
-
26
-def _require_mpl() -> Any:
27
-    try:
28
-        import matplotlib.pyplot as plt
29
-
30
-        return plt
31
-    except ImportError as exc:
32
-        raise BackendNotAvailableError(
33
-            "visualize",
34
-            extra="viz",
35
-            hint="sway's visualization module needs matplotlib.",
36
-        ) from exc
37
-
38
-
39
-def plot_section_sis(suite: SuiteResult) -> Any:
40
-    """Render a per-section ``effective_sis`` bar chart.
41
-
42
-    Returns the matplotlib ``Figure``; the caller handles display / save.
43
-    """
44
-    plt = _require_mpl()
45
-
46
-    probe = _find_probe(suite, "section_internalization")
47
-    if probe is None or not probe.evidence.get("per_section"):
48
-        raise ValueError("suite has no section_internalization evidence to plot")
49
-
50
-    rows: list[dict[str, Any]] = list(probe.evidence["per_section"])
51
-    labels = [f"{row['tag'] or row['section_id'][:8]}\n({row['kind']})" for row in rows]
52
-    values = [float(row["effective_sis"]) for row in rows]
53
-    colors = ["#2ca02c" if row["passed"] else "#d62728" for row in rows]
54
-
55
-    fig, ax = plt.subplots(figsize=(max(6.0, 0.7 * len(rows)), 4.0))
56
-    ax.bar(range(len(rows)), values, color=colors)
57
-    ax.axhline(
58
-        float(probe.evidence.get("per_section_threshold", 0.0)),
59
-        color="gray",
60
-        linestyle="--",
61
-        linewidth=1,
62
-        label="threshold",
63
-    )
64
-    ax.set_xticks(range(len(rows)))
65
-    ax.set_xticklabels(labels, rotation=30, ha="right")
66
-    ax.set_ylabel("effective SIS")
67
-    ax.set_title("Section Internalization Score")
68
-    ax.legend(loc="best")
69
-    fig.tight_layout()
70
-    return fig
71
-
72
-
73
-def plot_adapter_ablation(suite: SuiteResult) -> Any:
74
-    """Render the signature λ-scaled divergence curve."""
75
-    plt = _require_mpl()
76
-
77
-    probe = _find_probe(suite, "adapter_ablation")
78
-    if probe is None or not probe.evidence.get("lambdas"):
79
-        raise ValueError("suite has no adapter_ablation evidence to plot")
80
-
81
-    lambdas = list(probe.evidence["lambdas"])
82
-    divs = list(probe.evidence["mean_divergence_per_lambda"])
83
-
84
-    fig, ax = plt.subplots(figsize=(7.0, 4.0))
85
-    ax.plot(lambdas, divs, marker="o", linewidth=2, color="#1f77b4")
86
-    ax.axvline(1.0, color="gray", linestyle=":", linewidth=1, label="λ=1 (trained)")
87
-    sat = probe.evidence.get("saturation_lambda")
88
-    if sat is not None:
89
-        ax.axvline(
90
-            float(sat),
91
-            color="#2ca02c",
92
-            linestyle="--",
93
-            linewidth=1,
94
-            label=f"sat λ={float(sat):.2f}",
95
-        )
96
-    ax.set_xlabel("λ (adapter scale)")
97
-    ax.set_ylabel("mean JS divergence vs λ=0")
98
-    ax.set_title(
99
-        f"Adapter Ablation (R²={float(probe.evidence.get('linearity', 0.0)):.2f}, "
100
-        f"overshoot={float(probe.evidence.get('overshoot', 0.0)):.2f})"
101
-    )
102
-    ax.legend(loc="best")
103
-    fig.tight_layout()
104
-    return fig
105
-
106
-
107
-def plot_kl_histogram(suite: SuiteResult) -> Any:
108
-    """Render the per-prompt KL distribution from a DeltaKL probe."""
109
-    plt = _require_mpl()
110
-
111
-    probe = _find_probe(suite, "delta_kl")
112
-    if probe is None or not probe.evidence.get("per_prompt"):
113
-        raise ValueError("suite has no delta_kl evidence to plot")
114
-
115
-    values = list(probe.evidence["per_prompt"])
116
-    fig, ax = plt.subplots(figsize=(7.0, 4.0))
117
-    ax.hist(values, bins=max(5, min(20, len(values) // 2)), color="#ff7f0e", edgecolor="white")
118
-    ax.axvline(
119
-        float(probe.raw or 0.0),
120
-        color="black",
121
-        linestyle="--",
122
-        linewidth=1,
123
-        label=f"mean={float(probe.raw or 0.0):.3f}",
124
-    )
125
-    ax.set_xlabel(probe.evidence.get("divergence_kind", "divergence"))
126
-    ax.set_ylabel("count")
127
-    ax.set_title("DeltaKL — per-prompt distribution")
128
-    ax.legend(loc="best")
129
-    fig.tight_layout()
130
-    return fig
131
-
132
-
133
-def _find_probe(suite: SuiteResult, kind: str) -> Any:
134
-    for p in suite.probes:
135
-        if p.kind == kind:
136
-            return p
137
-    return None
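Editorial note: since no plot function writes to disk itself, a typical caller looks like the following sketch (`suite` is assumed to be a populated `SuiteResult` loaded elsewhere):

```python
from dlm_sway.core.errors import BackendNotAvailableError
from dlm_sway.visualize import plot_adapter_ablation

try:
    fig = plot_adapter_ablation(suite)  # `suite` assumed loaded elsewhere
except BackendNotAvailableError as exc:
    print(exc)  # carries the pip-install hint for the `viz` extra
else:
    fig.savefig("ablation.png", dpi=150)  # the caller decides where output goes
```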
sway/tests/__init__.pydeleted
sway/tests/conftest.pydeleted
@@ -1,29 +0,0 @@
1
-"""Shared test fixtures.
2
-
3
-Keep the default fast-test environment offline and deterministic so unit
4
-tests stay below ~1 s per file. Integration tests override these via
5
-their own ``conftest`` when they need network access.
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-import pytest
11
-
12
-# Import the probes package once so every shipped probe registers itself
13
-# with the central registry. Tests that exercise build_probe("delta_kl",
14
-# …) rely on this.
15
-import dlm_sway.probes  # noqa: F401
16
-
17
-
18
-@pytest.fixture(autouse=True)
19
-def _offline_and_no_telemetry(monkeypatch: pytest.MonkeyPatch) -> None:
20
-    """Unit tests never touch the network.
21
-
22
-    Any backend test that needs HF should be marked ``@pytest.mark.online``
23
-    and clear these vars explicitly.
24
-    """
25
-    monkeypatch.setenv("HF_HUB_OFFLINE", "1")
26
-    monkeypatch.setenv("TRANSFORMERS_OFFLINE", "1")
27
-    monkeypatch.setenv("HF_DATASETS_OFFLINE", "1")
28
-    monkeypatch.setenv("HF_HUB_DISABLE_TELEMETRY", "1")
29
-    monkeypatch.setenv("DO_NOT_TRACK", "1")
sway/tests/fixtures/__init__.pydeleted
sway/tests/fixtures/tiny_model.pydeleted
@@ -1,53 +0,0 @@
1
-"""Tiny-model fixture for integration tests.
2
-
3
-Mirrors ``dlm.tests.fixtures.tiny_model``: session-scoped snapshot of
4
-SmolLM2-135M-Instruct, reused across the whole test run. The model is
5
-small enough (~280 MB on disk, ~600 MB in fp32 VRAM) to make integration
6
-tests feasible in CI.
7
-
8
-Tests using this fixture must carry ``@pytest.mark.slow`` and
9
-``@pytest.mark.online`` — the default test selection excludes both.
10
-"""
11
-
12
-from __future__ import annotations
13
-
14
-import os
15
-from collections.abc import Iterator
16
-from pathlib import Path
17
-
18
-import pytest
19
-
20
-TINY_MODEL_HF_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
21
-TINY_MODEL_REVISION = os.environ.get("DLM_SWAY_TINY_MODEL_REVISION", "main")
22
-
23
-
24
-def _offline_mode() -> bool:
25
-    return os.environ.get("SWAY_OFFLINE", "0") == "1"
26
-
27
-
28
-@pytest.fixture(scope="session")
29
-def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]:
30
-    """Download (or reuse) the tiny model; yield the cached directory.
31
-
32
-    Tests opt in via ``@pytest.mark.online`` — the session-wide offline
33
-    env vars are cleared inside this fixture so ``snapshot_download``
34
-    actually fetches.
35
-    """
36
-    from huggingface_hub import snapshot_download
37
-
38
-    # Clear offline env guards (set by the unit-test autouse fixture).
39
-    prior = {
40
-        k: os.environ.pop(k, None)
41
-        for k in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE")
42
-    }
43
-    try:
44
-        path = snapshot_download(
45
-            repo_id=TINY_MODEL_HF_ID,
46
-            revision=TINY_MODEL_REVISION,
47
-            local_files_only=_offline_mode(),
48
-        )
49
-        yield Path(path)
50
-    finally:
51
-        for k, v in prior.items():
52
-            if v is not None:
53
-                os.environ[k] = v
sway/tests/integration/__init__.pydeleted
sway/tests/integration/conftest.pydeleted
@@ -1,10 +0,0 @@
1
-"""Integration-test configuration.
2
-
3
-Integration tests need network + heavy deps. Re-export the tiny_model
4
-fixture here so test modules can pick it up without a long import
5
-path.
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-from tests.fixtures.tiny_model import tiny_model_dir  # noqa: F401 — re-export
sway/tests/integration/test_hf_adapter_toggle.pydeleted
@@ -1,113 +0,0 @@
1
-"""Integration test: PEFT ``disable_adapter`` actually changes logits.
2
-
3
-This is the load-bearing sanity check for the whole differential design.
4
-If a future ``peft`` release subtly breaks the disable-context semantics,
5
-sway's KL / SIS / ablation probes would all silently report zero signal.
6
-We catch that here, before the rest of the test battery runs.
7
-
8
-The test builds a random-init LoRA adapter on a tiny model so no network
9
-dependency beyond the base model snapshot itself.
10
-"""
11
-
12
-from __future__ import annotations
13
-
14
-from pathlib import Path
15
-
16
-import pytest
17
-
18
-from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
19
-from dlm_sway.core.model import ModelSpec
20
-
21
-pytestmark = [pytest.mark.slow, pytest.mark.online]
22
-
23
-
24
-def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
25
-    """Construct a LoRA adapter with random-init weights on ``base_dir``.
26
-
27
-    The weights are kept small so the toggle-delta is clear but the
28
-    adapter is structurally valid (correct ``adapter_config.json``,
29
-    tokenizer files, safetensors layout).
30
-    """
31
-    import torch
32
-    from peft import LoraConfig, get_peft_model
33
-    from transformers import AutoModelForCausalLM, AutoTokenizer
34
-
35
-    torch.manual_seed(0)
36
-
37
-    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
38
-    if tokenizer.pad_token_id is None:
39
-        tokenizer.pad_token = tokenizer.eos_token
40
-    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
41
-
42
-    cfg = LoraConfig(
43
-        r=8,
44
-        lora_alpha=16,
45
-        target_modules=["q_proj", "v_proj"],
46
-        lora_dropout=0.0,
47
-        bias="none",
48
-        task_type="CAUSAL_LM",
49
-    )
50
-    peft_model = get_peft_model(base, cfg)
51
-
52
-    # Explicitly scale lora_B out of its PEFT-default zero-init so the
53
-    # adapter actually changes outputs. Real training does this via
54
-    # gradients; we do it with a scaled normal.
55
-    with torch.no_grad():
56
-        for name, param in peft_model.named_parameters():
57
-            if "lora_B" in name:
58
-                param.copy_(torch.randn_like(param) * 0.05)
59
-
60
-    peft_model.save_pretrained(str(out_dir))
61
-    tokenizer.save_pretrained(str(out_dir))
62
-
63
-
64
-@pytest.fixture(scope="module")
65
-def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
66
-    adapter_dir = tmp_path_factory.mktemp("random-adapter")
67
-    _build_random_lora_adapter(tiny_model_dir, adapter_dir)
68
-    return adapter_dir
69
-
70
-
71
-def test_disable_adapter_changes_logits(tiny_model_dir: Path, random_adapter: Path) -> None:
72
-    """The keystone invariant: base view ≠ ft view on the same prompt."""
73
-    import numpy as np
74
-
75
-    backend = HuggingFaceDifferentialBackend(
76
-        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
77
-        adapter_path=random_adapter,
78
-    )
79
-    try:
80
-        prompt = "The quick brown fox"
81
-        with backend.as_base() as b:
82
-            base_dist = b.next_token_dist(prompt, top_k=32)
83
-        with backend.as_finetuned() as f:
84
-            ft_dist = f.next_token_dist(prompt, top_k=32)
85
-
86
-        # Top-k indices may shift under the adapter; take a safe shared
87
-        # subset instead of asserting identical ordering.
88
-        assert not np.array_equal(base_dist.token_ids, ft_dist.token_ids) or not np.allclose(
89
-            base_dist.logprobs, ft_dist.logprobs, atol=1e-5
90
-        ), "adapter toggle did not change next-token distribution"
91
-    finally:
92
-        backend.close()
93
-
94
-
95
-def test_roundtrip_toggle_restores_base(tiny_model_dir: Path, random_adapter: Path) -> None:
96
-    """as_base → as_finetuned → as_base yields a stable base view."""
97
-    import numpy as np
98
-
99
-    backend = HuggingFaceDifferentialBackend(
100
-        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
101
-        adapter_path=random_adapter,
102
-    )
103
-    try:
104
-        prompt = "hello"
105
-        with backend.as_base() as b:
106
-            first = b.next_token_dist(prompt, top_k=16).logprobs
107
-        with backend.as_finetuned() as f:
108
-            f.next_token_dist(prompt, top_k=16)  # toggle
109
-        with backend.as_base() as b:
110
-            second = b.next_token_dist(prompt, top_k=16).logprobs
111
-        np.testing.assert_allclose(first, second, rtol=1e-5, atol=1e-6)
112
-    finally:
113
-        backend.close()
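Editorial note: the invariant under test rides on PEFT's `disable_adapter()` context manager. In plain `peft` terms the toggle looks roughly like this sketch, assuming `peft_model` is any `PeftModel` and `inputs` a tokenized batch:

```python
import torch


def views_differ(peft_model, inputs) -> bool:
    """Compare adapter-on vs adapter-off logits on one batch."""
    peft_model.eval()
    with torch.no_grad():
        ft_logits = peft_model(**inputs).logits
        with peft_model.disable_adapter():  # base view: LoRA deltas bypassed
            base_logits = peft_model(**inputs).logits
    return not torch.allclose(base_logits, ft_logits, atol=1e-5)
```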
sway/tests/unit/__init__.pydeleted
sway/tests/unit/test_backend_dummy.pydeleted
@@ -1,102 +0,0 @@
1
-"""Tests for :class:`dlm_sway.backends.dummy.DummyDifferentialBackend`.
2
-
3
-The dummy backend is used by every downstream probe unit test, so it
4
-gets a thorough own-right test here. Also verifies the view-exclusion
5
-invariant that catches stale-view bugs in probes.
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-import numpy as np
11
-import pytest
12
-
13
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
14
-from dlm_sway.core.model import Model
15
-from dlm_sway.core.scoring import DifferentialBackend, ScoringBackend
16
-
17
-
18
-@pytest.fixture
19
-def backend() -> DummyDifferentialBackend:
20
-    base = DummyResponses(
21
-        generations={"hi": "hello"},
22
-        logprobs={("q", "a"): -3.0},
23
-    )
24
-    ft = DummyResponses(
25
-        generations={"hi": "greetings, traveler"},
26
-        logprobs={("q", "a"): -1.2},
27
-    )
28
-    return DummyDifferentialBackend(base=base, ft=ft)
29
-
30
-
31
-class TestViews:
32
-    def test_as_base_and_as_ft_yield_distinct_generations(
33
-        self, backend: DummyDifferentialBackend
34
-    ) -> None:
35
-        with backend.as_base() as b:
36
-            assert b.generate("hi", max_new_tokens=5) == "hello"
37
-        with backend.as_finetuned() as f:
38
-            assert f.generate("hi", max_new_tokens=5) == "greetings, traveler"
39
-
40
-    def test_logprob_differs_between_modes(self, backend: DummyDifferentialBackend) -> None:
41
-        with backend.as_base() as b:
42
-            base_score = b.logprob_of("q", "a")
43
-        with backend.as_finetuned() as f:
44
-            ft_score = f.logprob_of("q", "a")
45
-        assert base_score == -3.0
46
-        assert ft_score == -1.2
47
-
48
-    def test_missing_generation_raises_keyerror(self, backend: DummyDifferentialBackend) -> None:
49
-        with backend.as_base() as b, pytest.raises(KeyError, match="no canned generation"):
50
-            b.generate("unconfigured", max_new_tokens=1)
51
-
52
-    def test_missing_logprob_default(self, backend: DummyDifferentialBackend) -> None:
53
-        with backend.as_base() as b:
54
-            assert b.logprob_of("nonexistent", "target") == -10.0
55
-
56
-
57
-class TestRollingLogprob:
58
-    def test_synthesized_when_not_preseeded(self, backend: DummyDifferentialBackend) -> None:
59
-        with backend.as_base() as b:
60
-            r = b.rolling_logprob("a quick brown fox jumps")
61
-        assert r.num_tokens == 5
62
-        assert r.logprobs.size == 4
63
-        assert np.all(r.logprobs == -2.0)
64
-
65
-    def test_ft_perplexity_lower_than_base(self, backend: DummyDifferentialBackend) -> None:
66
-        text = "a quick brown fox"
67
-        with backend.as_base() as b:
68
-            pb = b.rolling_logprob(text).perplexity
69
-        with backend.as_finetuned() as f:
70
-            pf = f.rolling_logprob(text).perplexity
71
-        assert pf < pb  # synthesized ft is less perplexed → lower PPL
72
-
73
-
74
-class TestTokenDist:
75
-    def test_dists_differ_between_modes(self, backend: DummyDifferentialBackend) -> None:
76
-        with backend.as_base() as b:
77
-            base_dist = b.next_token_dist("any prompt")
78
-        with backend.as_finetuned() as f:
79
-            ft_dist = f.next_token_dist("any prompt")
80
-        assert not np.array_equal(base_dist.logprobs, ft_dist.logprobs)
81
-
82
-
83
-class TestInvariants:
84
-    def test_protocol_satisfaction(self, backend: DummyDifferentialBackend) -> None:
85
-        assert isinstance(backend, DifferentialBackend)
86
-        with backend.as_base() as view:
87
-            assert isinstance(view, Model)
88
-            assert isinstance(view, ScoringBackend)
89
-
90
-    def test_nested_views_rejected(self, backend: DummyDifferentialBackend) -> None:
91
-        with backend.as_base(), pytest.raises(RuntimeError, match="view already active"):
92
-            with backend.as_finetuned():
93
-                pass
94
-
95
-    def test_sequential_views_fine(self, backend: DummyDifferentialBackend) -> None:
96
-        # Must be able to re-enter after exiting — common pattern in probes.
97
-        with backend.as_base() as b:
98
-            b.logprob_of("q", "a")
99
-        with backend.as_finetuned() as f:
100
-            f.logprob_of("q", "a")
101
-        with backend.as_base() as b:
102
-            b.logprob_of("q", "a")
sway/tests/unit/test_backend_registry.pydeleted
@@ -1,133 +0,0 @@
1
-"""Tests for the backend registry in ``dlm_sway.backends``.
2
-
3
-The registry is the single place that maps a ModelSpec to a concrete
4
-backend. These tests check the error paths — actually materializing an
5
-HF backend requires model weights and is covered by the integration
6
-suite.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-from pathlib import Path
12
-
13
-import pytest
14
-
15
-from dlm_sway.backends import build
16
-from dlm_sway.core.errors import BackendNotAvailableError, SpecValidationError
17
-from dlm_sway.core.model import ModelSpec
18
-
19
-
20
-class TestRegistry:
21
-    def test_dummy_rejected_via_build(self) -> None:
22
-        with pytest.raises(SpecValidationError, match="kind='dummy'"):
23
-            build(ModelSpec(base="x", kind="dummy"))
24
-
25
-    def test_hf_requires_adapter(self) -> None:
26
-        with pytest.raises(SpecValidationError, match="adapter"):
27
-            build(ModelSpec(base="x", kind="hf"))
28
-
29
-    def test_mlx_requires_adapter(self) -> None:
30
-        with pytest.raises(SpecValidationError, match="adapter"):
31
-            build(ModelSpec(base="x", kind="mlx"))
32
-
33
-    def test_mlx_dispatch_raises_when_mlx_missing(self) -> None:
34
-        # On non-Apple-Silicon (or Apple without mlx installed), constructing
35
-        # the MLX backend raises BackendNotAvailableError with a pip hint.
36
-        # We skip this assertion if mlx happens to be installed.
37
-        import importlib.util
38
-
39
-        if importlib.util.find_spec("mlx") is not None:
40
-            pytest.skip("mlx is installed; error path not exercised")
41
-        with pytest.raises(BackendNotAvailableError) as exc_info:
42
-            build(ModelSpec(base="x", kind="mlx", adapter=Path("/tmp/a")))
43
-        assert exc_info.value.backend == "mlx"
44
-
45
-    def test_custom_requires_entry_point(self) -> None:
46
-        with pytest.raises(SpecValidationError, match="entry_point"):
47
-            build(ModelSpec(base="x", kind="custom", adapter=Path("/tmp/a")))
48
-
49
-    def test_custom_validates_entry_point_shape(self) -> None:
50
-        with pytest.raises(SpecValidationError, match="pkg.module:ClassName"):
51
-            build(
52
-                ModelSpec(
53
-                    base="x",
54
-                    kind="custom",
55
-                    entry_point="not_a_valid_entry_point",
56
-                    adapter=Path("/tmp/a"),
57
-                )
58
-            )
59
-
60
-    def test_custom_rejects_unimportable_module(self) -> None:
61
-        with pytest.raises(SpecValidationError, match="cannot import"):
62
-            build(
63
-                ModelSpec(
64
-                    base="x",
65
-                    kind="custom",
66
-                    entry_point="nonexistent_pkg_xyz:Backend",
67
-                    adapter=Path("/tmp/a"),
68
-                )
69
-            )
70
-
71
-    def test_custom_rejects_missing_class(self) -> None:
72
-        with pytest.raises(SpecValidationError, match="has no attribute"):
73
-            build(
74
-                ModelSpec(
75
-                    base="x",
76
-                    kind="custom",
77
-                    entry_point="dlm_sway:NoSuchClass",
78
-                    adapter=Path("/tmp/a"),
79
-                )
80
-            )
81
-
82
-    def test_custom_rejects_non_differential_class(self) -> None:
83
-        # A class that accepts the canonical constructor args but doesn't
84
-        # implement the protocol.
85
-        import sys
86
-        import types
87
-
88
-        class _Bad:
89
-            def __init__(self, base_spec, adapter_path):  # type: ignore[no-untyped-def]
90
-                del base_spec, adapter_path
91
-
92
-        mod = types.ModuleType("_sway_bad_mod")
93
-        mod.Bad = _Bad  # type: ignore[attr-defined]
94
-        sys.modules["_sway_bad_mod"] = mod
95
-
96
-        with pytest.raises(SpecValidationError, match="DifferentialBackend"):
97
-            build(
98
-                ModelSpec(
99
-                    base="x",
100
-                    kind="custom",
101
-                    entry_point="_sway_bad_mod:Bad",
102
-                    adapter=Path("/tmp/a"),
103
-                )
104
-            )
105
-
106
-    def test_custom_dispatches_to_valid_backend(self) -> None:
107
-        # Use the dummy backend via a custom entry point. The dummy class's
108
-        # __init__ takes different args, so we write a thin adapter class.
109
-        from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
110
-
111
-        class _AdapterBackend(DummyDifferentialBackend):
112
-            def __init__(self, base_spec, adapter_path):  # type: ignore[no-untyped-def]
113
-                super().__init__(base=DummyResponses(), ft=DummyResponses())
114
-
115
-        # Register on a throwaway module we can find by name.
116
-        import sys
117
-        import types
118
-
119
-        mod = types.ModuleType("_sway_custom_test_mod")
120
-        mod.AdapterBackend = _AdapterBackend  # type: ignore[attr-defined]
121
-        sys.modules["_sway_custom_test_mod"] = mod
122
-
123
-        backend = build(
124
-            ModelSpec(
125
-                base="x",
126
-                kind="custom",
127
-                entry_point="_sway_custom_test_mod:AdapterBackend",
128
-                adapter=Path("/tmp/a"),
129
-            )
130
-        )
131
-        from dlm_sway.core.scoring import DifferentialBackend
132
-
133
-        assert isinstance(backend, DifferentialBackend)
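Editorial note: the `pkg.module:ClassName` error paths tested above correspond to a small resolution routine; a sketch of the usual shape, with the error type simplified to `ValueError` rather than the real `SpecValidationError`:

```python
import importlib


def load_entry_point(entry_point: str) -> type:
    """Resolve 'pkg.module:ClassName' to a class, failing with clear messages."""
    module_name, sep, class_name = entry_point.partition(":")
    if not sep or not module_name or not class_name:
        raise ValueError(f"expected 'pkg.module:ClassName', got {entry_point!r}")
    try:
        module = importlib.import_module(module_name)
    except ImportError as exc:
        raise ValueError(f"cannot import {module_name!r}") from exc
    try:
        return getattr(module, class_name)
    except AttributeError as exc:
        raise ValueError(f"{module_name!r} has no attribute {class_name!r}") from exc
```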
sway/tests/unit/test_cli.pydeleted
@@ -1,92 +0,0 @@
1
-"""Smoke tests for the dlm-sway CLI.
2
-
3
-We avoid exercising backends (they need real models) and instead test
4
-arg parsing, error paths, and the read-only commands (``doctor``,
5
-``report``, and the help surface).
6
-"""
7
-
8
-from __future__ import annotations
9
-
10
-import json
11
-from pathlib import Path
12
-
13
-from typer.testing import CliRunner
14
-
15
-from dlm_sway.cli.app import app
16
-
17
-
18
-def test_version_exits_zero() -> None:
19
-    result = CliRunner().invoke(app, ["--version"])
20
-    assert result.exit_code == 0
21
-    assert "dlm-sway" in result.stdout
22
-
23
-
24
-def test_help_lists_all_commands() -> None:
25
-    result = CliRunner().invoke(app, ["--help"])
26
-    assert result.exit_code == 0
27
-    for cmd in ("run", "gate", "check", "diff", "autogen", "doctor", "report"):
28
-        assert cmd in result.stdout
29
-
30
-
31
-def test_doctor_runs() -> None:
32
-    result = CliRunner().invoke(app, ["doctor"])
33
-    assert result.exit_code == 0
34
-    # Rich applies color codes by default; assert the bare product name appears.
35
-    assert "dlm-sway" in result.stdout
36
-    assert "backends" in result.stdout
37
-
38
-
39
-def test_run_without_file_errors(tmp_path: Path) -> None:
40
-    missing = tmp_path / "nope.yaml"
41
-    result = CliRunner().invoke(app, ["run", str(missing)])
42
-    # Exit code 2 = SwayError bubble-up; 1 = typer missing-arg; accept either.
43
-    assert result.exit_code != 0
44
-
45
-
46
-def test_report_from_json(tmp_path: Path) -> None:
47
-    sample = {
48
-        "schema_version": 1,
49
-        "sway_version": "0.1.0.dev0",
50
-        "base_model_id": "base",
51
-        "adapter_id": "adp",
52
-        "score": {"overall": 0.7, "band": "healthy", "components": {}, "findings": []},
53
-        "probes": [
54
-            {
55
-                "name": "p1",
56
-                "kind": "delta_kl",
57
-                "verdict": "pass",
58
-                "score": 0.7,
59
-                "message": "ok",
60
-            },
61
-        ],
62
-    }
63
-    path = tmp_path / "result.json"
64
-    path.write_text(json.dumps(sample), encoding="utf-8")
65
-
66
-    terminal = CliRunner().invoke(app, ["report", str(path)])
67
-    assert terminal.exit_code == 0
68
-    assert "p1" in terminal.stdout
69
-
70
-    md = CliRunner().invoke(app, ["report", str(path), "--format", "md"])
71
-    assert md.exit_code == 0
72
-    assert "dlm-sway report" in md.stdout
73
-
74
-    junit = CliRunner().invoke(app, ["report", str(path), "--format", "junit"])
75
-    assert junit.exit_code == 0
76
-    assert "<testsuite" in junit.stdout
77
-
78
-
79
-def test_autogen_without_dlm_extra_exits_nonzero(tmp_path: Path, monkeypatch) -> None:  # type: ignore[no-untyped-def]
80
-    # Force the import path to fail so the CLI prints the extra hint.
81
-    import builtins
82
-
83
-    real_import = builtins.__import__
84
-
85
-    def fake_import(name: str, *args: object, **kwargs: object):  # type: ignore[no-untyped-def]
86
-        if name.startswith("dlm_sway.integrations.dlm"):
87
-            raise ImportError("simulated missing extra")
88
-        return real_import(name, *args, **kwargs)  # type: ignore[no-untyped-call]
89
-
90
-    monkeypatch.setattr(builtins, "__import__", fake_import)
91
-    result = CliRunner().invoke(app, ["autogen", "any.dlm"])
92
-    assert result.exit_code != 0
sway/tests/unit/test_determinism.pydeleted
@@ -1,47 +0,0 @@
1
-"""Tests for :mod:`dlm_sway.core.determinism`."""
2
-
3
-from __future__ import annotations
4
-
5
-import os
6
-import random
7
-
8
-import numpy as np
9
-
10
-from dlm_sway.core.determinism import DeterminismSummary, seed_everything
11
-
12
-
13
-class TestSeedEverything:
14
-    def test_returns_summary(self) -> None:
15
-        summary = seed_everything(0)
16
-        assert isinstance(summary, DeterminismSummary)
17
-        assert summary.seed == 0
18
-        assert summary.class_ in {"strict", "best_effort", "loose"}
19
-
20
-    def test_idempotent_for_stdlib_random(self) -> None:
21
-        seed_everything(42)
22
-        a = [random.random() for _ in range(5)]
23
-        seed_everything(42)
24
-        b = [random.random() for _ in range(5)]
25
-        assert a == b
26
-
27
-    def test_idempotent_for_numpy(self) -> None:
28
-        seed_everything(17)
29
-        a = np.random.rand(5)
30
-        seed_everything(17)
31
-        b = np.random.rand(5)
32
-        np.testing.assert_array_equal(a, b)
33
-
34
-    def test_cublas_workspace_set_under_strict(self) -> None:
35
-        os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
36
-        seed_everything(0, strict=True)
37
-        assert os.environ.get("CUBLAS_WORKSPACE_CONFIG") == ":4096:8"
38
-
39
-    def test_non_strict_does_not_set_cublas(self) -> None:
40
-        os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
41
-        seed_everything(0, strict=False)
42
-        # Non-strict mode must not leak the env var in either direction;
43
-        # the host environment's prior value wins.
44
-        assert (
45
-            "CUBLAS_WORKSPACE_CONFIG" not in os.environ
46
-            or os.environ["CUBLAS_WORKSPACE_CONFIG"] != ":4096:8"
47
-        )
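Editorial note: these tests pin down a conventional seeding routine; the recipe they imply looks like the following sketch (the real module additionally classifies the achieved determinism level):

```python
import os
import random

import numpy as np


def seed_everything_sketch(seed: int, *, strict: bool = False) -> None:
    if strict:
        # cuBLAS requires this before any CUDA context for determinism.
        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch

        torch.manual_seed(seed)
        if strict:
            torch.use_deterministic_algorithms(True)
    except ImportError:
        pass  # torch is optional; stdlib + numpy seeding still applies
```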
sway/tests/unit/test_divergence.pydeleted
@@ -1,73 +0,0 @@
1
-"""Tests for :mod:`dlm_sway.probes._divergence`."""
2
-
3
-from __future__ import annotations
4
-
5
-import math
6
-
7
-import numpy as np
8
-
9
-from dlm_sway.core.scoring import TokenDist
10
-from dlm_sway.probes._divergence import aligned_probs, divergence, js, kl
11
-
12
-
13
-def _dist(ids: list[int], probs: list[float], vocab: int = 100) -> TokenDist:
14
-    return TokenDist(
15
-        token_ids=np.asarray(ids, dtype=np.int64),
16
-        logprobs=np.log(np.asarray(probs, dtype=np.float32)),
17
-        vocab_size=vocab,
18
-    )
19
-
20
-
21
-class TestAligned:
22
-    def test_identical_distributions(self) -> None:
23
-        d = _dist([1, 2, 3], [0.5, 0.3, 0.2])
24
-        p, q = aligned_probs(d, d)
25
-        np.testing.assert_allclose(p, q)
26
-
27
-    def test_union_support_fills_missing(self) -> None:
28
-        base = _dist([1, 2, 3], [0.5, 0.3, 0.2])
29
-        ft = _dist([2, 3, 4], [0.4, 0.4, 0.2])
30
-        p, q = aligned_probs(base, ft)
31
-        assert p.shape == (4,)
32
-        assert abs(p.sum() - 1.0) < 1e-9
33
-        assert abs(q.sum() - 1.0) < 1e-9
34
-
35
-
36
-class TestKL:
37
-    def test_zero_when_equal(self) -> None:
38
-        p = np.array([0.5, 0.3, 0.2])
39
-        assert kl(p, p) == 0.0
40
-
41
-    def test_positive_when_different(self) -> None:
42
-        p = np.array([0.7, 0.2, 0.1])
43
-        q = np.array([0.2, 0.3, 0.5])
44
-        assert kl(p, q) > 0.0
45
-
46
-
47
-class TestJS:
48
-    def test_zero_when_equal(self) -> None:
49
-        p = np.array([0.5, 0.3, 0.2])
50
-        assert js(p, p) == 0.0
51
-
52
-    def test_symmetric(self) -> None:
53
-        p = np.array([0.7, 0.2, 0.1])
54
-        q = np.array([0.2, 0.3, 0.5])
55
-        assert math.isclose(js(p, q), js(q, p), rel_tol=1e-9)
56
-
57
-    def test_bounded_by_ln2(self) -> None:
58
-        p = np.array([1.0, 0.0])
59
-        q = np.array([0.0, 1.0])
60
-        # With zeros handled as 0·log0 = 0 this approaches ln(2).
61
-        assert js(p, q) <= math.log(2.0) + 1e-9
62
-
63
-
64
-class TestDivergenceDispatch:
65
-    def test_default_is_js(self) -> None:
66
-        d1 = _dist([1, 2], [0.6, 0.4])
67
-        d2 = _dist([1, 2], [0.3, 0.7])
68
-        assert divergence(d1, d2) == divergence(d1, d2, kind="js")
69
-
70
-    def test_kl_available(self) -> None:
71
-        d1 = _dist([1, 2], [0.6, 0.4])
72
-        d2 = _dist([1, 2], [0.3, 0.7])
73
-        assert divergence(d1, d2, kind="kl") >= 0.0
sway/tests/unit/test_dlm_bridge.pydeleted
@@ -1,252 +0,0 @@
1
-"""Tests for :mod:`dlm_sway.integrations.dlm`.
2
-
3
-The bridge imports ``dlm.*`` modules lazily. We mock those via
4
-``sys.modules`` injection so the tests run without the ``dlm-sway[dlm]``
5
-extra installed. A full end-to-end integration test against a real
6
-``.dlm`` lives under ``tests/integration/``.
7
-"""
8
-
9
-from __future__ import annotations
10
-
11
-import sys
12
-import types
13
-from dataclasses import dataclass
14
-from pathlib import Path
15
-
16
-import pytest
17
-import yaml
18
-
19
-
20
-@pytest.fixture
21
-def fake_dlm(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path:
22
-    """Install a fake ``dlm`` package so the resolver can import."""
23
-
24
-    # Build synthetic parsed .dlm structure.
25
-    @dataclass
26
-    class _Frontmatter:
27
-        dlm_id: str = "01TESTULID"
28
-        base_model: str = "smollm2-135m"
29
-
30
-    @dataclass
31
-    class _Section:
32
-        section_id: str
33
-        type: str
34
-        content: str
35
-        tag: str | None = None
36
-
37
-    @dataclass
38
-    class _Parsed:
39
-        frontmatter: _Frontmatter
40
-        sections: tuple[_Section, ...]
41
-
42
-    def _parse_file(_path: Path):  # type: ignore[no-untyped-def]
43
-        return _Parsed(
44
-            frontmatter=_Frontmatter(),
45
-            sections=(
46
-                _Section(
47
-                    section_id="prose-1",
48
-                    type="PROSE",
49
-                    content="This is a prose section with some information. Further detail follows.",
50
-                ),
51
-                _Section(
52
-                    section_id="instr-1",
53
-                    type="INSTRUCTION",
54
-                    content="### Q\nWhat is X?\n\n### A\nX is a concept\n",
55
-                ),
56
-                _Section(
57
-                    section_id="pref-1",
58
-                    type="PREFERENCE",
59
-                    content="chosen/rejected triple",
60
-                ),
61
-            ),
62
-        )
63
-
64
-    # Fake ``dlm.doc.parser`` module.
65
-    dlm_pkg = types.ModuleType("dlm")
66
-    dlm_doc = types.ModuleType("dlm.doc")
67
-    dlm_doc_parser = types.ModuleType("dlm.doc.parser")
68
-    dlm_doc_parser.parse_file = _parse_file  # type: ignore[attr-defined]
69
-
70
-    # Fake ``dlm.store.paths`` that returns a resolvable path.
71
-    dlm_store = types.ModuleType("dlm.store")
72
-    dlm_store_paths = types.ModuleType("dlm.store.paths")
73
-
74
-    adapter_dir = tmp_path / "adapter_v1"
75
-    adapter_dir.mkdir()
76
-    (adapter_dir / "adapter_config.json").write_text("{}", encoding="utf-8")
77
-
78
-    class _StorePath:
79
-        def __init__(self, path: Path) -> None:
80
-            self._p = path
81
-
82
-        def resolve_current_adapter(self) -> Path:
83
-            return self._p
84
-
85
-    def _for_dlm(_dlm_id: str) -> _StorePath:
86
-        return _StorePath(adapter_dir)
87
-
88
-    dlm_store_paths.StorePath = _StorePath  # type: ignore[attr-defined]
89
-    dlm_store_paths.for_dlm = _for_dlm  # type: ignore[attr-defined]
90
-
91
-    # Fake base-model resolver — returns a stub with an ``hf_id`` attribute.
92
-    dlm_base = types.ModuleType("dlm.base_models")
93
-
94
-    @dataclass
95
-    class _BaseSpec:
96
-        hf_id: str
97
-        key: str
98
-
99
-    def _resolve(key: str) -> _BaseSpec:
100
-        return _BaseSpec(hf_id="HuggingFaceTB/SmolLM2-135M-Instruct", key=key)
101
-
102
-    dlm_base.resolve = _resolve  # type: ignore[attr-defined]
103
-
104
-    # Fake instruction / preference parsers.
105
-    dlm_data = types.ModuleType("dlm.data")
106
-    dlm_data_instr = types.ModuleType("dlm.data.instruction_parser")
107
-    dlm_data_pref = types.ModuleType("dlm.data.preference_parser")
108
-
109
-    @dataclass
110
-    class _QAPair:
111
-        question: str
112
-        answer: str
113
-
114
-    @dataclass
115
-    class _Triple:
116
-        prompt: str
117
-        chosen: str
118
-        rejected: str
119
-
120
-    def _parse_instr(body: str, *, section_id: str) -> list[_QAPair]:
121
-        del section_id
122
-        out: list[_QAPair] = []
123
-        parts = body.split("### Q")
124
-        for part in parts[1:]:
125
-            q_block, _, a_block = part.partition("### A")
126
-            q = q_block.strip()
127
-            a = a_block.strip()
128
-            if q and a:
129
-                out.append(_QAPair(question=q, answer=a))
130
-        return out
131
-
132
-    def _parse_pref(body: str, *, section_id: str) -> list[_Triple]:
133
-        del body, section_id
134
-        return [_Triple(prompt="Which?", chosen="good answer", rejected="bad answer")]
135
-
136
-    dlm_data_instr.parse_instruction_body = _parse_instr  # type: ignore[attr-defined]
137
-    dlm_data_pref.parse_preference_body = _parse_pref  # type: ignore[attr-defined]
138
-
139
-    monkeypatch.setitem(sys.modules, "dlm", dlm_pkg)
140
-    monkeypatch.setitem(sys.modules, "dlm.doc", dlm_doc)
141
-    monkeypatch.setitem(sys.modules, "dlm.doc.parser", dlm_doc_parser)
142
-    monkeypatch.setitem(sys.modules, "dlm.store", dlm_store)
143
-    monkeypatch.setitem(sys.modules, "dlm.store.paths", dlm_store_paths)
144
-    monkeypatch.setitem(sys.modules, "dlm.base_models", dlm_base)
145
-    monkeypatch.setitem(sys.modules, "dlm.data", dlm_data)
146
-    monkeypatch.setitem(sys.modules, "dlm.data.instruction_parser", dlm_data_instr)
147
-    monkeypatch.setitem(sys.modules, "dlm.data.preference_parser", dlm_data_pref)
148
-
149
-    # Return a path to a fake .dlm file (the parser won't actually read it).
150
-    dlm_file = tmp_path / "doc.dlm"
151
-    dlm_file.write_text("---\ndlm_id: 01TEST\n---\n\nbody\n", encoding="utf-8")
152
-    return dlm_file
153
-
154
-
155
-def test_resolve_dlm_maps_sections(fake_dlm: Path) -> None:
156
-    from dlm_sway.integrations.dlm.resolver import resolve_dlm
157
-
158
-    handle = resolve_dlm(fake_dlm)
159
-    assert handle.dlm_id == "01TESTULID"
160
-    assert handle.base_model == "HuggingFaceTB/SmolLM2-135M-Instruct"
161
-    assert handle.adapter_path is not None
162
-    assert handle.adapter_path.exists()
163
-    assert len(handle.sections) == 3
164
-    # Kinds normalized from uppercase dlm enum values.
165
-    assert {s.kind for s in handle.sections} == {"prose", "instruction", "preference"}
166
-    # Instruction Q/A pair survived the translation.
167
-    instr = next(s for s in handle.sections if s.kind == "instruction")
168
-    assert instr.probes
169
-    assert instr.probes[0].prompt == "What is X?"
170
-    # Preference triple too.
171
-    pref = next(s for s in handle.sections if s.kind == "preference")
172
-    assert pref.preferences
173
-    assert pref.preferences[0].chosen == "good answer"
174
-
175
-
176
-def test_resolve_without_dlm_installed(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
177
-    """resolve_dlm surfaces a SwayError when the dlm package is missing."""
178
-    # Wipe any cached dlm modules so the lazy import fails.
179
-    for mod in list(sys.modules):
180
-        if mod == "dlm" or mod.startswith("dlm."):
181
-            monkeypatch.delitem(sys.modules, mod, raising=False)
182
-
183
-    import builtins
184
-
185
-    real_import = builtins.__import__
186
-
187
-    def fake_import(name: str, *args, **kwargs):  # type: ignore[no-untyped-def]
188
-        if name.startswith("dlm."):
189
-            raise ImportError("missing extra")
190
-        return real_import(name, *args, **kwargs)
191
-
192
-    monkeypatch.setattr(builtins, "__import__", fake_import)
193
-
194
-    from dlm_sway.core.errors import SwayError
195
-    from dlm_sway.integrations.dlm.resolver import resolve_dlm
196
-
197
-    with pytest.raises(SwayError, match="dlm package not installed"):
198
-        resolve_dlm(tmp_path / "doc.dlm")
199
-
200
-
201
-def test_autogen_writes_complete_suite(fake_dlm: Path, tmp_path: Path) -> None:
202
-    from dlm_sway.integrations.dlm.autogen import write_sway_yaml
203
-
204
-    out = tmp_path / "sway.yaml"
205
-    write_sway_yaml(fake_dlm, out)
206
-    data = yaml.safe_load(out.read_text(encoding="utf-8"))
207
-
208
-    assert data["version"] == 1
209
-    assert data["models"]["base"]["base"] == "HuggingFaceTB/SmolLM2-135M-Instruct"
210
-    assert data["models"]["ft"]["adapter"] is not None
211
-    assert data["dlm_source"] == str(fake_dlm.resolve())
212
-
213
-    kinds = {entry["kind"] for entry in data["suite"]}
214
-    # The full 11-primitive battery is present (some probes may be
215
-    # skipped when data is absent, but here we have one of every
216
-    # section type).
217
-    expected = {
218
-        "null_adapter",
219
-        "delta_kl",
220
-        "adapter_revert",
221
-        "prompt_collapse",
222
-        "section_internalization",
223
-        "paraphrase_invariance",
224
-        "preference_flip",
225
-        "style_fingerprint",
226
-        "calibration_drift",
227
-        "leakage",
228
-        "adapter_ablation",
229
-    }
230
-    assert expected <= kinds, f"missing: {expected - kinds}"
231
-
232
-
233
-def test_build_spec_dict_skips_preference_when_absent() -> None:
234
-    from dlm_sway.core.sections import Section
235
-    from dlm_sway.integrations.dlm.autogen import build_spec_dict
236
-    from dlm_sway.integrations.dlm.resolver import DlmHandle
237
-
238
-    sections = (
239
-        Section(id="a", kind="prose", content="A prose section. Second sentence."),
240
-        Section(id="b", kind="prose", content="Another prose section."),
241
-    )
242
-    handle = DlmHandle(
243
-        dlm_id="x",
244
-        base_model="base",
245
-        adapter_path=Path("/tmp/adapter"),
246
-        sections=sections,
247
-        doc_text="whole document",
248
-    )
249
-    spec = build_spec_dict(handle)
250
-    kinds = {entry["kind"] for entry in spec["suite"]}
251
-    assert "preference_flip" not in kinds
252
-    assert "section_internalization" in kinds
sway/tests/unit/test_errors.pydeleted
@@ -1,55 +0,0 @@
1
-"""Tests for the exception hierarchy."""
2
-
3
-from __future__ import annotations
4
-
5
-import pytest
6
-
7
-from dlm_sway.core.errors import (
8
-    BackendNotAvailableError,
9
-    ProbeError,
10
-    SpecValidationError,
11
-    SwayError,
12
-)
13
-
14
-
15
-class TestSwayError:
16
-    def test_is_root_exception(self) -> None:
17
-        assert issubclass(SpecValidationError, SwayError)
18
-        assert issubclass(BackendNotAvailableError, SwayError)
19
-        assert issubclass(ProbeError, SwayError)
20
-
21
-    def test_raised_and_caught_as_sway_error(self) -> None:
22
-        with pytest.raises(SwayError):
23
-            raise ProbeError("delta_kl", "shape mismatch")
24
-
25
-
26
-class TestSpecValidationError:
27
-    def test_format_without_source(self) -> None:
28
-        err = SpecValidationError("unknown key 'topp'")
29
-        assert str(err) == "unknown key 'topp'"
30
-        assert err.source is None
31
-
32
-    def test_format_with_source(self) -> None:
33
-        err = SpecValidationError("unknown key 'topp'", source="sway.yaml")
34
-        assert str(err) == "sway.yaml: unknown key 'topp'"
35
-        assert err.source == "sway.yaml"
36
-
37
-
38
-class TestBackendNotAvailableError:
39
-    def test_hint_rendered_in_message(self) -> None:
40
-        err = BackendNotAvailableError("hf", extra="hf")
41
-        assert "pip install 'dlm-sway[hf]'" in str(err)
42
-        assert err.backend == "hf"
43
-        assert err.extra == "hf"
44
-
45
-    def test_appends_optional_hint(self) -> None:
46
-        err = BackendNotAvailableError("mlx", extra="mlx", hint="Apple Silicon only.")
47
-        assert "Apple Silicon only." in str(err)
48
-
49
-
50
-class TestProbeError:
51
-    def test_includes_probe_name(self) -> None:
52
-        err = ProbeError("delta_kl", "NaN logits")
53
-        assert "delta_kl" in str(err)
54
-        assert "NaN logits" in str(err)
55
-        assert err.probe == "delta_kl"
sway/tests/unit/test_model.pydeleted
@@ -1,78 +0,0 @@
1
-"""Tests for :mod:`dlm_sway.core.model`."""
2
-
3
-from __future__ import annotations
4
-
5
-from pathlib import Path
6
-
7
-import pytest
8
-from pydantic import ValidationError
9
-
10
-from dlm_sway.core.model import LoadedModel, Model, ModelSpec
11
-
12
-
13
-class TestModelSpec:
14
-    def test_defaults(self) -> None:
15
-        spec = ModelSpec(base="HuggingFaceTB/SmolLM2-135M-Instruct")
16
-        assert spec.kind == "hf"
17
-        assert spec.adapter is None
18
-        assert spec.dtype == "auto"
19
-        assert spec.device == "auto"
20
-        assert spec.trust_remote_code is False
21
-        assert spec.entry_point is None
22
-
23
-    def test_frozen(self) -> None:
24
-        spec = ModelSpec(base="x")
25
-        with pytest.raises(ValidationError):
26
-            spec.base = "y"  # type: ignore[misc]
27
-
28
-    def test_extra_fields_forbidden(self) -> None:
29
-        with pytest.raises(ValidationError) as exc_info:
30
-            ModelSpec(base="x", bogus="y")  # type: ignore[call-arg]
31
-        assert "bogus" in str(exc_info.value).lower()
32
-
33
-    def test_kind_enum(self) -> None:
34
-        ModelSpec(base="x", kind="hf")
35
-        ModelSpec(base="x", kind="mlx")
36
-        ModelSpec(base="x", kind="dummy")
37
-        ModelSpec(base="x", kind="custom", entry_point="pkg.mod:Backend")
38
-        with pytest.raises(ValidationError):
39
-            ModelSpec(base="x", kind="ollama")  # type: ignore[arg-type]
40
-
41
-    def test_adapter_coerced_to_path(self) -> None:
42
-        spec = ModelSpec(base="x", adapter="/tmp/adapter")  # type: ignore[arg-type]
43
-        assert isinstance(spec.adapter, Path)
44
-
45
-
46
-class TestLoadedModel:
47
-    def test_frozen_dataclass(self) -> None:
48
-        loaded = LoadedModel(
49
-            id="base",
50
-            spec=ModelSpec(base="x"),
51
-            model=object(),
52
-            tokenizer=object(),
53
-            meta={"device": "cpu"},
54
-        )
55
-        assert loaded.id == "base"
56
-        assert loaded.meta["device"] == "cpu"
57
-
58
-
59
-class TestModelProtocol:
60
-    def test_runtime_checkable(self) -> None:
61
-        class FakeModel:
62
-            id = "x"
63
-
64
-            def generate(
65
-                self,
66
-                prompt: str,
67
-                *,
68
-                max_new_tokens: int,
69
-                temperature: float = 0.0,
70
-                top_p: float = 1.0,
71
-                seed: int = 0,
72
-            ) -> str:
73
-                return f"{prompt}|{max_new_tokens}"
74
-
75
-            def close(self) -> None:
76
-                return None
77
-
78
-        assert isinstance(FakeModel(), Model)
sway/tests/unit/test_null_calibration.py deleted
@@ -1,123 +0,0 @@
-"""Tests for null-adapter calibration.
-
-Covers: dummy backend ``as_null_adapter`` yields a plausibly noisy
-view; ``NullAdapterProbe`` populates ``ctx.null_stats`` in a way
-that downstream probes pick up end-to-end; missing-capability SKIP path.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import NullCalibratedBackend
-from dlm_sway.probes.base import RunContext, build_probe
-from dlm_sway.suite.runner import run as run_suite
-from dlm_sway.suite.spec import SwaySpec
-
-
-def _diverging_backend() -> DummyDifferentialBackend:
-    base = DummyResponses()
-    ft = DummyResponses()
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-class TestProtocolConformance:
-    def test_dummy_is_null_calibrated(self) -> None:
-        assert isinstance(_diverging_backend(), NullCalibratedBackend)
-
-
-class TestAsNullAdapter:
-    def test_yields_perturbed_view(self) -> None:
-        backend = _diverging_backend()
-        with backend.as_base() as base:
-            base_dist = base.next_token_dist("hello")
-        with backend.as_null_adapter(seed=0) as null:
-            null_dist = null.next_token_dist("hello")
-        # Some perturbation, but bounded.
-        assert not np.allclose(base_dist.logprobs, null_dist.logprobs)
-
-    def test_different_seeds_yield_different_views(self) -> None:
-        backend = _diverging_backend()
-        with backend.as_null_adapter(seed=1) as v1:
-            d1 = v1.next_token_dist("hello")
-        with backend.as_null_adapter(seed=2) as v2:
-            d2 = v2.next_token_dist("hello")
-        assert not np.allclose(d1.logprobs, d2.logprobs)
-
-    def test_view_exclusion_enforced(self) -> None:
-        import pytest
-
-        backend = _diverging_backend()
-        with backend.as_null_adapter(seed=0), pytest.raises(RuntimeError):
-            with backend.as_base():
-                pass
-
-
-class TestProbe:
-    def test_populates_null_stats(self) -> None:
-        backend = _diverging_backend()
-        probe, spec = build_probe(
-            {
-                "name": "null",
-                "kind": "null_adapter",
-                "runs": 3,
-                "prompts": ["q1", "q2"],
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        stats = result.evidence["null_stats"]
-        assert "delta_kl" in stats
-        assert stats["delta_kl"]["n"] == 3.0
-        assert stats["delta_kl"]["std"] > 0.0  # seeded perturbations produce variance
-
-    def test_runner_threads_null_stats_to_subsequent_probes(self) -> None:
-        """End-to-end: null_adapter first → delta_kl picks up z-score path."""
-        backend = _diverging_backend()
-        raw_spec = SwaySpec.model_validate(
-            {
-                "version": 1,
-                "models": {"base": {"base": "b"}, "ft": {"base": "b", "adapter": "/tmp/a"}},
-                "suite": [
-                    {
-                        "name": "null",
-                        "kind": "null_adapter",
-                        "runs": 3,
-                        "prompts": ["p1", "p2"],
-                    },
-                    {
-                        "name": "dk",
-                        "kind": "delta_kl",
-                        "prompts": ["p1", "p2"],
-                        "assert_z_gte": -10.0,  # permissive so we pass regardless
-                    },
-                ],
-            }
-        )
-        result = run_suite(raw_spec, backend)
-        assert len(result.probes) == 2
-        null_result = result.probes[0]
-        dk_result = result.probes[1]
-        assert null_result.verdict == Verdict.PASS
-        # The delta_kl probe should have computed a z_score because null_stats was present.
-        assert dk_result.z_score is not None, (
-            "delta_kl should have z-scored against null baseline, got "
-            f"evidence={dk_result.evidence}, message={dk_result.message}"
-        )
-
-    def test_skip_when_backend_not_null_calibrated(self) -> None:
-        class _Bare:
-            def as_base(self):  # noqa: ANN202
-                raise NotImplementedError
-
-            def as_finetuned(self):  # noqa: ANN202
-                raise NotImplementedError
-
-        probe, spec = build_probe({"name": "null", "kind": "null_adapter"})
-        ctx = RunContext(backend=_Bare())  # type: ignore[arg-type]
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-        assert "NullCalibratedBackend" in result.message
sway/tests/unit/test_probe_adapter_ablation.py deleted
@@ -1,135 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.adapter_ablation`.
-
-Uses the dummy backend's lam-interpolation implementation to exercise
-the full probe path without loading a real model.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import ScalableDifferentialBackend, TokenDist
-from dlm_sway.probes.adapter_ablation import (
-    _overshoot,
-    _r_squared,
-    _saturation_lambda,
-)
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-class TestShapeMetrics:
-    def test_r_squared_perfect_linear(self) -> None:
-        x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64)
-        y = 2 * x + 0.1
-        assert _r_squared(x, y) > 0.99
-
-    def test_r_squared_zero_slope_defined(self) -> None:
-        x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64)
-        y = np.zeros_like(x)
-        # Flat y → ss_tot = 0 → defined as 1.0 (perfect fit).
-        assert _r_squared(x, y) == 1.0
-
-    def test_saturation_lambda_expected(self) -> None:
-        lambdas = np.asarray([0.0, 0.25, 0.5, 0.75, 1.0], dtype=np.float64)
-        divs = np.asarray([0.0, 0.5, 0.8, 0.95, 1.0], dtype=np.float64)
-        sat = _saturation_lambda(lambdas, divs)
-        assert sat == 0.75  # 0.95 / 1.0 = 0.95 ≥ 0.9
-
-    def test_overshoot_recovered(self) -> None:
-        lambdas = np.asarray([0.0, 0.5, 1.0, 1.25], dtype=np.float64)
-        divs = np.asarray([0.0, 0.5, 1.0, 1.15], dtype=np.float64)
-        assert _overshoot(lambdas, divs) == 1.15
-
-
-def _diverging_backend() -> DummyDifferentialBackend:
-    """Backend where base ≠ ft at a few prompts; distributions interpolate
-    smoothly under lam-blending in DummyDifferentialBackend.as_scaled_adapter."""
-    base = DummyResponses(
-        token_dists={
-            "q1": TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)),
-                vocab_size=100,
-            ),
-            "q2": TokenDist(
-                token_ids=np.array([5, 6], dtype=np.int64),
-                logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)),
-                vocab_size=100,
-            ),
-        }
-    )
-    ft = DummyResponses(
-        token_dists={
-            "q1": TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(np.array([0.2, 0.4, 0.4], dtype=np.float32)),
-                vocab_size=100,
-            ),
-            "q2": TokenDist(
-                token_ids=np.array([5, 6], dtype=np.int64),
-                logprobs=np.log(np.array([0.3, 0.7], dtype=np.float32)),
-                vocab_size=100,
-            ),
-        }
-    )
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-class TestProbe:
-    def test_backend_implements_scalable_protocol(self) -> None:
-        backend = _diverging_backend()
-        assert isinstance(backend, ScalableDifferentialBackend)
-
-    def test_probe_runs_and_emits_shape_metrics(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "abl",
-                "kind": "adapter_ablation",
-                "prompts": ["q1", "q2"],
-                "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
-                # Very permissive to tolerate the log-space blend of a
-                # tiny synthetic fixture.
-                "assert_linearity_gte": 0.3,
-                "assert_overshoot_gte": 1.0,
-            }
-        )
-        ctx = RunContext(backend=_diverging_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict in (Verdict.PASS, Verdict.FAIL)
-        assert "lambdas" in result.evidence
-        assert "mean_divergence_per_lambda" in result.evidence
-        assert len(result.evidence["mean_divergence_per_lambda"]) == 6
-        # Divergence should increase as λ grows from 0 toward ft.
-        divs = result.evidence["mean_divergence_per_lambda"]
-        # λ=0 → 0 divergence from itself. λ>0 should be non-decreasing
-        # for the bulk of the curve.
-        assert divs[-2] >= divs[0]
-
-    def test_skip_when_backend_not_scalable(self) -> None:
-        class _NonScalable:
-            def as_base(self):  # noqa: ANN202
-                raise NotImplementedError
-
-            def as_finetuned(self):  # noqa: ANN202
-                raise NotImplementedError
-
-        probe, spec = build_probe(
-            {
-                "name": "abl",
-                "kind": "adapter_ablation",
-                "prompts": ["q1"],
-            }
-        )
-        ctx = RunContext(backend=_NonScalable())  # type: ignore[arg-type]
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-        assert "ScalableDifferentialBackend" in result.message
-
-    def test_error_on_empty_prompts(self) -> None:
-        backend = _diverging_backend()
-        probe, spec = build_probe({"name": "abl", "kind": "adapter_ablation", "prompts": []})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
sway/tests/unit/test_probe_adapter_revert.py deleted
@@ -1,170 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.adapter_revert`.
-
-We stub out the embedder so these tests don't need sentence-transformers
-installed. The probe's SKIP path for the missing-extra case is
-covered separately by monkeypatching the importer.
-"""
-
-from __future__ import annotations
-
-from typing import Any
-
-import numpy as np
-import pytest
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.probes.adapter_revert import AdapterRevertProbe
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _backend(*, ft_like_base: bool = False) -> DummyDifferentialBackend:
-    base = DummyResponses(
-        generations={
-            "pp1": "cats are mammals",
-            "pp2": "cats have fur",
-        }
-    )
-    if ft_like_base:
-        ft_gens = dict(base.generations)
-    else:
-        ft_gens = {
-            "pp1": "dolphins are mammals",
-            "pp2": "dolphins are smart",
-        }
-    ft = DummyResponses(generations=ft_gens)
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-def _stub_embedder(text_to_vec: dict[str, np.ndarray]):  # type: ignore[no-untyped-def]
-    def _encode(texts: list[str]):  # type: ignore[no-untyped-def]
-        return np.stack([text_to_vec[t] for t in texts])
-
-    return _encode
-
-
-@pytest.fixture
-def monkeyed_embed(monkeypatch: pytest.MonkeyPatch) -> dict[str, np.ndarray]:
-    """Install a stub embedder with a controllable text→vec mapping.
-
-    Tests populate the dict before calling ``probe.run()``.
-    """
-    table: dict[str, np.ndarray] = {}
-    monkeypatch.setattr(
-        "dlm_sway.probes.adapter_revert._load_embedder",
-        lambda _model_id: _stub_embedder(table),  # type: ignore[arg-type]
-    )
-    return table
-
-
-class TestAdapterRevert:
-    def test_healthy_adapter_passes(self, monkeyed_embed: dict[str, np.ndarray]) -> None:
-        # gold and ft-outputs cluster together, base outputs cluster elsewhere.
-        monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0])
-        monkeyed_embed["cats have fur"] = np.array([1.0, 0.0])
-        monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0])
-        monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0])
-        monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0])  # gold
-
-        probe, spec = build_probe(
-            {
-                "name": "rev",
-                "kind": "adapter_revert",
-                "cases": [
-                    {
-                        "prompt": "anything",
-                        "gold": "the answer is dolphins",
-                        "paraphrases": ["pp1", "pp2"],
-                    }
-                ],
-                "assert_revert_rate_lt": 0.25,
-            }
-        )
-        ctx = RunContext(backend=_backend(ft_like_base=False))
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        assert result.raw == 0.0
-
-    def test_reverting_adapter_fails(self, monkeyed_embed: dict[str, np.ndarray]) -> None:
-        # ft matches base (reverted), diverges from gold.
-        monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0])
-        monkeyed_embed["cats have fur"] = np.array([1.0, 0.0])
-        monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0])  # gold
-
-        probe, spec = build_probe(
-            {
-                "name": "rev",
-                "kind": "adapter_revert",
-                "cases": [
-                    {
-                        "prompt": "anything",
-                        "gold": "the answer is dolphins",
-                        "paraphrases": ["pp1", "pp2"],
-                    }
-                ],
-            }
-        )
-        ctx = RunContext(backend=_backend(ft_like_base=True))
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.FAIL
-        assert result.raw == 1.0  # 100% revert
-
-    def test_trivially_similar_cases_dropped(self, monkeyed_embed: dict[str, np.ndarray]) -> None:
-        # base and gold are identical → drop.
-        v = np.array([1.0, 0.0])
-        monkeyed_embed["cats are mammals"] = v
-        monkeyed_embed["cats have fur"] = v
-        monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0])
-        monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0])
-        monkeyed_embed["cats are mammals too"] = v  # gold — matches base
-
-        probe, spec = build_probe(
-            {
-                "name": "rev",
-                "kind": "adapter_revert",
-                "cases": [
-                    {
-                        "prompt": "anything",
-                        "gold": "cats are mammals too",
-                        "paraphrases": ["pp1", "pp2"],
-                    }
-                ],
-            }
-        )
-        ctx = RunContext(backend=_backend(ft_like_base=False))
-        result = probe.run(spec, ctx)
-        # Both paraphrase pairs trivially similar → WARN (no separable signal).
-        assert result.verdict == Verdict.WARN
-        assert result.evidence["dropped_trivial"] == 2
-
-    def test_no_cases_errors(self, monkeyed_embed: dict[str, np.ndarray]) -> None:
-        probe, spec = build_probe({"name": "rev", "kind": "adapter_revert", "cases": []})
-        ctx = RunContext(backend=_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
-
-
-class TestMissingSemsim:
-    def test_skip_when_sentence_transformers_missing(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        from dlm_sway.core.errors import BackendNotAvailableError
-
-        def raiser(_model_id: Any) -> Any:  # type: ignore[no-untyped-def]
-            raise BackendNotAvailableError(
-                "adapter_revert",
-                extra="semsim",
-                hint="adapter_revert relies on sentence embeddings.",
-            )
-
-        monkeypatch.setattr(
-            "dlm_sway.probes.adapter_revert._load_embedder",
-            raiser,  # type: ignore[arg-type]
-        )
-        probe = AdapterRevertProbe()
-        spec = probe.spec_cls(
-            name="rev",
-            cases=[{"prompt": "x", "gold": "y", "paraphrases": ["pp1"]}],  # type: ignore[list-item]
-        )
-        ctx = RunContext(backend=_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-        assert "semsim" in result.message
sway/tests/unit/test_probe_base.py deleted
@@ -1,69 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.base`."""
-
-from __future__ import annotations
-
-from typing import Literal
-
-import pytest
-
-from dlm_sway.core.errors import SpecValidationError
-from dlm_sway.core.result import ProbeResult, Verdict
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, build_probe, registry
-
-
-class _DummySpec(ProbeSpec):
-    kind: Literal["__test_dummy"] = "__test_dummy"
-    payload: str = "x"
-
-
-class _DummyProbe(Probe):
-    kind = "__test_dummy"
-    spec_cls = _DummySpec
-    category = "adherence"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        assert isinstance(spec, _DummySpec)
-        return ProbeResult(
-            name=spec.name,
-            kind=spec.kind,
-            verdict=Verdict.PASS,
-            score=1.0,
-            message=spec.payload,
-        )
-
-
-class TestRegistry:
-    def test_autoregister(self) -> None:
-        assert "__test_dummy" in registry()
-        assert registry()["__test_dummy"] is _DummyProbe
-
-    def test_duplicate_kind_rejected(self) -> None:
-        with pytest.raises(ValueError, match="duplicate probe kind"):
-
-            class _Clash(Probe):
-                kind = "__test_dummy"
-                spec_cls = _DummySpec
-
-                def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-                    raise NotImplementedError
-
-
-class TestBuildProbe:
-    def test_valid_entry(self) -> None:
-        probe, spec = build_probe({"name": "t", "kind": "__test_dummy", "payload": "hi"})
-        assert isinstance(probe, _DummyProbe)
-        assert isinstance(spec, _DummySpec)
-        assert spec.payload == "hi"
-
-    def test_unknown_kind(self) -> None:
-        with pytest.raises(SpecValidationError, match="unknown probe kind"):
-            build_probe({"name": "t", "kind": "no_such_kind"})
-
-    def test_missing_kind(self) -> None:
-        with pytest.raises(SpecValidationError, match="missing string 'kind'"):
-            build_probe({"name": "t"})
-
-    def test_extra_field_forbidden(self) -> None:
-        with pytest.raises(SpecValidationError) as exc_info:
-            build_probe({"name": "t", "kind": "__test_dummy", "bogus": "y"})
-        assert "bogus" in str(exc_info.value).lower()
sway/tests/unit/test_probe_calibration_drift.py deleted
@@ -1,57 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.calibration_drift`."""
-
-from __future__ import annotations
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.probes._calibration_pack import BUILT_IN_PACK
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _backend(delta_per_token: float) -> DummyDifferentialBackend:
-    """Apply a uniform per-token logprob delta across every item."""
-    base_lp: dict[tuple[str, str], float] = {}
-    ft_lp: dict[tuple[str, str], float] = {}
-    for prompt, gold in BUILT_IN_PACK:
-        base_lp[(prompt, gold)] = -5.0 * max(len(gold) // 4, 1)
-        ft_lp[(prompt, gold)] = base_lp[(prompt, gold)] + delta_per_token * max(len(gold) // 4, 1)
-    return DummyDifferentialBackend(
-        base=DummyResponses(logprobs=base_lp),
-        ft=DummyResponses(logprobs=ft_lp),
-    )
-
-
-class TestCalibrationDrift:
-    def test_healthy_when_no_regression(self) -> None:
-        backend = _backend(delta_per_token=0.0)  # no drift
-        probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        assert result.raw == 0.0  # zero fraction regressed
-
-    def test_fail_on_uniform_large_regression(self) -> None:
-        backend = _backend(delta_per_token=-2.0)  # every item regresses
-        probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.FAIL
-        assert result.raw == 1.0
-
-    def test_respects_items_limit(self) -> None:
-        backend = _backend(delta_per_token=0.0)
-        probe, spec = build_probe({"name": "c2", "kind": "calibration_drift", "items_limit": 5})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.evidence["total_items"] == 5
-
-    def test_worst_offenders_reported(self) -> None:
-        backend = _backend(delta_per_token=-2.0)
-        probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        worst = result.evidence["worst_offenders"]
-        assert len(worst) <= 5
-        # Each worst-offender record carries prompt/gold/delta fields.
-        if worst:
-            assert {"prompt", "gold", "delta"} <= set(worst[0].keys())
sway/tests/unit/test_probe_delta_kl.py deleted
@@ -1,124 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.delta_kl`."""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import TokenDist
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _diverging_backend() -> DummyDifferentialBackend:
-    """Base peaks tightly on token 1; ft is broad uniform. Real divergence."""
-    base = DummyResponses(
-        token_dists={
-            "q1": TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)),
-                vocab_size=100,
-            ),
-            "q2": TokenDist(
-                token_ids=np.array([5, 6], dtype=np.int64),
-                logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)),
-                vocab_size=100,
-            ),
-        }
-    )
-    ft = DummyResponses(
-        token_dists={
-            "q1": TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(np.array([0.3, 0.35, 0.35], dtype=np.float32)),
-                vocab_size=100,
-            ),
-            "q2": TokenDist(
-                token_ids=np.array([5, 6], dtype=np.int64),
-                logprobs=np.log(np.array([0.4, 0.6], dtype=np.float32)),
-                vocab_size=100,
-            ),
-        }
-    )
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-def _identical_backend() -> DummyDifferentialBackend:
-    dist = TokenDist(
-        token_ids=np.array([1, 2, 3], dtype=np.int64),
-        logprobs=np.log(np.array([0.5, 0.3, 0.2], dtype=np.float32)),
-        vocab_size=100,
-    )
-    base = DummyResponses(token_dists={"q1": dist})
-    ft = DummyResponses(token_dists={"q1": dist})
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-class TestDeltaKL:
-    def test_passes_when_distributions_diverge(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "dk",
-                "kind": "delta_kl",
-                "prompts": ["q1", "q2"],
-                "assert_mean_gte": 0.01,
-            }
-        )
-        ctx = RunContext(backend=_diverging_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        assert result.raw is not None
-        assert result.raw > 0.01
-        assert result.evidence["num_prompts"] == 2
-        assert len(result.evidence["per_prompt"]) == 2
-
-    def test_fails_when_distributions_identical(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "dk",
-                "kind": "delta_kl",
-                "prompts": ["q1"],
-                "assert_mean_gte": 0.01,
-            }
-        )
-        ctx = RunContext(backend=_identical_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.FAIL
-        assert result.raw == 0.0
-
-    def test_z_score_path_when_null_stats_present(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "dk",
-                "kind": "delta_kl",
-                "prompts": ["q1"],
-                "assert_z_gte": 2.0,
-            }
-        )
-        null_stats = {"delta_kl": {"mean": 0.01, "std": 0.01, "n": 3.0}}
-        ctx = RunContext(backend=_diverging_backend(), null_stats=null_stats)
-        result = probe.run(spec, ctx)
-        assert result.z_score is not None
-        # Our synthetic ft diverges ~0.1+, far above μ=0.01, σ=0.01 → huge z.
-        assert result.z_score > 2.0
-        assert result.verdict == Verdict.PASS
-
-    def test_error_on_empty_prompts(self) -> None:
-        probe, spec = build_probe({"name": "dk", "kind": "delta_kl", "prompts": []})
-        ctx = RunContext(backend=_identical_backend())
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
-
-    def test_kl_kind_available(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "dk",
-                "kind": "delta_kl",
-                "prompts": ["q1"],
-                "divergence": "kl",
-                "assert_mean_gte": 0.0,
-            }
-        )
-        ctx = RunContext(backend=_diverging_backend())
-        result = probe.run(spec, ctx)
-        assert result.evidence["divergence_kind"] == "kl"
sway/tests/unit/test_probe_leakage.py deleted
@@ -1,109 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.leakage`."""
-
-from __future__ import annotations
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.sections import Section
-from dlm_sway.probes.base import RunContext, build_probe
-from dlm_sway.probes.leakage import _fragility, _lcs_ratio, _perturb
-
-
-class TestLCS:
-    def test_identical_returns_one(self) -> None:
-        assert _lcs_ratio("abcdef", "abcdef") == 1.0
-
-    def test_disjoint_returns_low(self) -> None:
-        assert _lcs_ratio("abc", "xyz") < 0.3
-
-    def test_empty_returns_zero(self) -> None:
-        assert _lcs_ratio("", "abc") == 0.0
-
-
-class TestPerturb:
-    def test_typo_swaps_first_two(self) -> None:
-        assert _perturb("hello", "typo") == "ehllo"
-
-    def test_case_flip_inverts_first_alpha(self) -> None:
-        assert _perturb("abc", "case_flip") == "Abc"
-        assert _perturb("ABC", "case_flip") == "aBC"
-
-    def test_drop_punct_removes_punct(self) -> None:
-        assert _perturb("a, b. c!", "drop_punct") == "a b c"
-
-
-class TestFragility:
-    def test_zero_when_clean_zero(self) -> None:
-        assert _fragility(0.0, 0.0) == 0.0
-
-    def test_expected_when_perturbed_dropped(self) -> None:
-        import pytest
-
-        assert _fragility(0.8, 0.2) == pytest.approx(0.75)
-
-
-def _prose_section(sid: str, content: str) -> Section:
-    return Section(id=sid, kind="prose", content=content)
-
-
-def _backend(*, ft_recall: float, ft_perturbed_recall: float) -> tuple[DummyDifferentialBackend, str]:
-    """Build a backend whose ft generate() returns a controlled prefix of ``target``.
-
-    The target is a fixed 256-char slice of repeated prose, so we can
-    measure the LCS ratio against it deterministically.
-    """
-    content = ("The capital of France is Paris. " * 30).strip()
-    # Generate a fraction of the target to hit the desired recall.
-    target = content[128 : 128 + 256]
-    ft_full = target[: int(ft_recall * len(target))]
-    ft_pert = target[: int(ft_perturbed_recall * len(target))]
-
-    base = DummyResponses()
-    ft = DummyResponses(
-        generations={
-            content[:128]: ft_full,
-            # perturbations of the first 128 chars hit these three:
-            **{_perturb(content[:128], p): ft_pert for p in ("typo", "case_flip", "drop_punct")},
-        }
-    )
-    return DummyDifferentialBackend(base=base, ft=ft), content
-
-
-class TestProbe:
-    def test_skip_without_sections(self) -> None:
-        backend, _ = _backend(ft_recall=0.0, ft_perturbed_recall=0.0)
-        probe, spec = build_probe({"name": "c3", "kind": "leakage"})
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-
-    def test_pass_when_no_leak(self) -> None:
-        backend, content = _backend(ft_recall=0.0, ft_perturbed_recall=0.0)
-        probe, spec = build_probe(
-            {
-                "name": "c3",
-                "kind": "leakage",
-                "prefix_chars": 128,
-                "continuation_chars": 256,
-            }
-        )
-        ctx = RunContext(backend=backend, sections=(_prose_section("a", content),))
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-
-    def test_fail_when_strong_low_fragility_leak(self) -> None:
-        backend, content = _backend(ft_recall=0.95, ft_perturbed_recall=0.9)
-        probe, spec = build_probe(
-            {
-                "name": "c3",
-                "kind": "leakage",
-                "prefix_chars": 128,
-                "continuation_chars": 256,
-                "assert_recall_lt": 0.5,
-                "min_fragility": 0.3,
-            }
-        )
-        ctx = RunContext(backend=backend, sections=(_prose_section("a", content),))
-        result = probe.run(spec, ctx)
-        # High recall + low fragility → fail.
-        assert result.verdict == Verdict.FAIL
sway/tests/unit/test_probe_paraphrase_invariance.py deleted
@@ -1,91 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.paraphrase_invariance`."""
-
-from __future__ import annotations
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _backend(*, par_lift_fraction: float, verb_lift: float = 10.0) -> DummyDifferentialBackend:
-    """Return a backend with tunable verbatim/paraphrase lifts.
-
-    The ft view adds ``verb_lift`` nats to the verbatim (Q,A) logprob
-    and ``par_lift_fraction * verb_lift`` to paraphrase logprobs.
-    """
-    base = DummyResponses(
-        logprobs={
-            ("Q", "A"): -20.0,
-            ("Q_par1", "A"): -20.0,
-            ("Q_par2", "A"): -20.0,
-        }
-    )
-    ft = DummyResponses(
-        logprobs={
-            ("Q", "A"): -20.0 + verb_lift,
-            ("Q_par1", "A"): -20.0 + par_lift_fraction * verb_lift,
-            ("Q_par2", "A"): -20.0 + par_lift_fraction * verb_lift,
-        }
-    )
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-def test_pass_when_generalizing() -> None:
-    # High paraphrase lift + high verbatim → healthy generalization.
-    backend = _backend(par_lift_fraction=0.9)
-    probe, spec = build_probe(
-        {
-            "name": "pi",
-            "kind": "paraphrase_invariance",
-            "intent": "generalize",
-            "min_verbatim_lift": 0.05,
-            "min_generalization_ratio": 0.5,
-            "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1", "Q_par2"]}],
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.PASS
-    assert result.raw is not None
-    assert result.raw >= 0.5
-
-
-def test_fails_when_only_memorized_but_intent_generalize() -> None:
-    backend = _backend(par_lift_fraction=0.0)
-    probe, spec = build_probe(
-        {
-            "name": "pi",
-            "kind": "paraphrase_invariance",
-            "intent": "generalize",
-            "min_verbatim_lift": 0.05,
-            "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}],
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.FAIL
-
-
-def test_passes_memorize_intent_when_only_memorized() -> None:
-    backend = _backend(par_lift_fraction=0.0)
-    probe, spec = build_probe(
-        {
-            "name": "pi",
-            "kind": "paraphrase_invariance",
-            "intent": "memorize",
-            "min_verbatim_lift": 0.05,
-            "max_generalization_ratio_if_memorize": 0.3,
-            "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}],
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.PASS
-
-
-def test_error_on_empty_cases() -> None:
-    probe, spec = build_probe({"name": "pi", "kind": "paraphrase_invariance", "cases": []})
-    backend = _backend(par_lift_fraction=0.9)
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.ERROR
sway/tests/unit/test_probe_preference_flip.py deleted
@@ -1,161 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.preference_flip`."""
-
-from __future__ import annotations
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.sections import Section, SectionPreference
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _backend(pairs: list[tuple[str, str, str, float, float]]) -> DummyDifferentialBackend:
-    """``pairs`` = list of (prompt, chosen, rejected, base_margin, ft_margin).
-
-    We distribute the margin half to the chosen and half (negative) to
-    the rejected, which is enough to make logprob_of(chosen)-logprob_of(rejected)
-    equal the requested margin.
-    """
-    base_lp: dict[tuple[str, str], float] = {}
-    ft_lp: dict[tuple[str, str], float] = {}
-    for prompt, chosen, rejected, base_m, ft_m in pairs:
-        base_lp[(prompt, chosen)] = base_m / 2
-        base_lp[(prompt, rejected)] = -base_m / 2
-        ft_lp[(prompt, chosen)] = ft_m / 2
-        ft_lp[(prompt, rejected)] = -ft_m / 2
-    return DummyDifferentialBackend(
-        base=DummyResponses(logprobs=base_lp),
-        ft=DummyResponses(logprobs=ft_lp),
-    )
-
-
-def test_pass_when_base_wrong_flipped() -> None:
-    backend = _backend(
-        [
-            ("p1", "good1", "bad1", -2.0, 2.0),  # base wrong, ft flips
-            ("p2", "good2", "bad2", -1.5, 1.0),  # base wrong, ft flips
-            ("p3", "good3", "bad3", -0.5, 0.8),  # base wrong, ft flips
-            ("p4", "good4", "bad4", 1.0, 2.0),  # base already right (no contribution)
-        ]
-    )
-    triples = [
-        {"prompt": p, "chosen": c, "rejected": r}
-        for p, c, r in [
-            ("p1", "good1", "bad1"),
-            ("p2", "good2", "bad2"),
-            ("p3", "good3", "bad3"),
-            ("p4", "good4", "bad4"),
-        ]
-    ]
-    probe, spec = build_probe(
-        {
-            "name": "pf",
-            "kind": "preference_flip",
-            "triples": triples,
-            "assert_flip_rate_gte": 0.7,
-            "min_triples_for_decision": 3,
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.PASS
-    assert result.raw == 1.0  # 3/3 flipped
-
-
-def test_fail_when_base_wrong_not_flipped() -> None:
-    backend = _backend(
-        [
-            ("p1", "good1", "bad1", -2.0, -1.5),  # base wrong, ft still wrong
-            ("p2", "good2", "bad2", -1.5, -1.0),  # base wrong, ft still wrong
-            ("p3", "good3", "bad3", -0.5, 0.8),  # base wrong, ft flips
-        ]
-    )
-    triples = [
-        {"prompt": p, "chosen": c, "rejected": r}
-        for p, c, r in [
-            ("p1", "good1", "bad1"),
-            ("p2", "good2", "bad2"),
-            ("p3", "good3", "bad3"),
-        ]
-    ]
-    probe, spec = build_probe(
-        {
-            "name": "pf",
-            "kind": "preference_flip",
-            "triples": triples,
-            "assert_flip_rate_gte": 0.7,
-            "min_triples_for_decision": 3,
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.FAIL
-    assert result.raw is not None
-    assert result.raw < 0.7
-
-
-def test_skip_when_no_triples_anywhere() -> None:
-    probe, spec = build_probe({"name": "pf", "kind": "preference_flip"})
-    backend = _backend([])
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.SKIP
-
-
-def test_warn_when_too_few_base_wrong() -> None:
-    backend = _backend(
-        [
-            ("p1", "good1", "bad1", 1.0, 2.0),  # base right
-            ("p2", "good2", "bad2", 0.5, 1.0),  # base right
-            ("p3", "good3", "bad3", -0.5, 0.5),  # base wrong
-        ]
-    )
-    triples = [
-        {"prompt": p, "chosen": c, "rejected": r}
-        for p, c, r in [
-            ("p1", "good1", "bad1"),
-            ("p2", "good2", "bad2"),
-            ("p3", "good3", "bad3"),
-        ]
-    ]
-    probe, spec = build_probe(
-        {
-            "name": "pf",
-            "kind": "preference_flip",
-            "triples": triples,
-            "min_triples_for_decision": 3,
-        }
-    )
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.WARN
-
-
-def test_triples_pulled_from_sections() -> None:
-    pref_section = Section(
-        id="p1",
-        kind="preference",
-        content="...",
-        preferences=(
-            SectionPreference(prompt="q1", chosen="good", rejected="bad"),
-            SectionPreference(prompt="q2", chosen="good2", rejected="bad2"),
-            SectionPreference(prompt="q3", chosen="good3", rejected="bad3"),
-        ),
-    )
-    backend = _backend(
-        [
-            ("q1", "good", "bad", -1.0, 1.0),
-            ("q2", "good2", "bad2", -1.0, 1.0),
-            ("q3", "good3", "bad3", -1.0, 1.0),
-        ]
-    )
-    probe, spec = build_probe(
-        {
-            "name": "pf",
-            "kind": "preference_flip",
-            "assert_flip_rate_gte": 0.7,
-            "min_triples_for_decision": 3,
-        }
-    )
-    ctx = RunContext(backend=backend, sections=(pref_section,))
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.PASS
sway/tests/unit/test_probe_prompt_collapse.py deleted
@@ -1,137 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.prompt_collapse`.
-
-Uses a programmable dummy backend that serves different token dists
-depending on whether the prompt contains the stuffing prefix. That's the
-cleanest way to simulate "divergence decays with context length" without
-a real model.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import TokenDist
-from dlm_sway.probes.base import RunContext, build_probe
-from dlm_sway.probes.prompt_collapse import _fit_half_life
-
-
-class TestFitHalfLife:
-    def test_exponential_recovered(self) -> None:
-        lengths = np.array([0.0, 100.0, 200.0, 300.0])
-        # y = 1.0 * exp(-x / 100)
-        y = np.exp(-lengths / 100.0)
-        h = _fit_half_life(lengths, y)
-        assert h is not None
-        import math
-
-        # True half-life = ln(2) * 100 ≈ 69.3
-        assert abs(h - math.log(2.0) * 100.0) < 1e-6
-
-    def test_returns_none_for_flat(self) -> None:
-        lengths = np.array([0.0, 100.0, 200.0])
-        y = np.array([1e-10, 1e-10, 1e-10])
-        h = _fit_half_life(lengths, y)
-        assert h is None or h > 0.0  # either None or a huge half-life — both acceptable for flat input
-
-    def test_returns_none_for_increasing(self) -> None:
-        lengths = np.array([0.0, 100.0, 200.0])
-        y = np.array([0.1, 0.3, 0.5])
-        assert _fit_half_life(lengths, y) is None
-
-
-def _programmed_backend(stuffing_sensitivity: float) -> DummyDifferentialBackend:
-    """Return a backend whose divergence decays with prompt length.
-
-    ``stuffing_sensitivity`` controls how quickly the ft distribution
-    snaps back to base as prompt length grows; lower = healthier adapter.
-    """
-    import numpy as np
-
-    base_probs = np.array([0.5, 0.3, 0.2], dtype=np.float32)
-
-    class _StuffedResponses(DummyResponses):
-        def __init__(self, is_ft: bool):
-            super().__init__()
-            self._is_ft = is_ft
-
-        # Override retrieval by subclassing the view's lookup path.
-
-    # Simpler: use explicit prompts at each expected length to seed the dict.
-    # The probe prefixes stuffing so the dummy sees the exact final prompt.
-    # We pre-build dists for each prompt we expect to see.
-    base = DummyResponses()
-    ft = DummyResponses()
-
-    # Pre-generate prompts the probe will query. The probe uses default
-    # context_lengths=[0,256,512,1024] times _STUFFING ~4 chars/tok.
-    from dlm_sway.probes.prompt_collapse import _stuffing
-
-    for ctx_len in (0, 256, 512, 1024):
-        prefix = _stuffing(ctx_len)
-        for prompt in ("q1",):
-            key = prefix + prompt
-            # Base: always tight on token 1.
-            base.token_dists[key] = TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(base_probs),
-                vocab_size=100,
-            )
-            # FT: diverges at ctx=0, decays toward base with length.
-            decay = np.exp(-ctx_len * stuffing_sensitivity)
-            ft_probs = base_probs * (1.0 - decay) + np.array([0.1, 0.45, 0.45]) * decay
-            ft_probs = ft_probs / ft_probs.sum()
-            ft.token_dists[key] = TokenDist(
-                token_ids=np.array([1, 2, 3], dtype=np.int64),
-                logprobs=np.log(ft_probs.astype(np.float32)),
-                vocab_size=100,
-            )
-    return DummyDifferentialBackend(base=base, ft=ft)
-
-
-class TestPromptCollapse:
-    def test_healthy_adapter_passes(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "pc",
-                "kind": "prompt_collapse",
-                "prompts": ["q1"],
-                "context_lengths": [0, 256, 512, 1024],
-                "assert_half_life_tokens": 100,
-            }
-        )
-        ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.001))
-        result = probe.run(spec, ctx)
-        # Half-life should be well above 100 with slow decay.
-        assert result.verdict == Verdict.PASS
-        assert result.raw is not None
-        assert result.raw > 100
-
-    def test_collapsing_adapter_fails(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "pc",
-                "kind": "prompt_collapse",
-                "prompts": ["q1"],
-                "context_lengths": [0, 256, 512, 1024],
-                "assert_half_life_tokens": 500,
-            }
-        )
-        ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.02))
-        result = probe.run(spec, ctx)
-        # Fast decay → short half-life → fail against 500-token threshold.
-        assert result.verdict == Verdict.FAIL
-
-    def test_error_on_empty_prompts(self) -> None:
-        probe, spec = build_probe(
-            {
-                "name": "pc",
-                "kind": "prompt_collapse",
-                "prompts": [],
-                "context_lengths": [0, 256],
-            }
-        )
-        ctx = RunContext(backend=_programmed_backend(0.001))
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
sway/tests/unit/test_probe_section_internalization.py deleted
@@ -1,94 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.section_internalization` (the flagship B1)."""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.core.scoring import RollingLogprob
-from dlm_sway.core.sections import Section, SectionProbe
-from dlm_sway.probes.base import RunContext, build_probe
-
-
-def _rolling(mean_lp: float, n: int = 10) -> RollingLogprob:
-    lp = np.full(n - 1, mean_lp, dtype=np.float32)
-    return RollingLogprob(
-        token_ids=np.arange(n, dtype=np.int64),
-        logprobs=lp,
-        num_tokens=n,
-        total_logprob=float(lp.sum()),
-    )
-
-
-def _section(sid: str, kind: str = "prose", content: str = "content", probes=()) -> Section:
-    return Section(id=sid, kind=kind, content=content, probes=tuple(probes))  # type: ignore[arg-type]
-
-
-def test_skip_without_sections() -> None:
-    probe, spec = build_probe({"name": "sis", "kind": "section_internalization"})
-    backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
-    ctx = RunContext(backend=backend)
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.SKIP
-
-
-def test_skip_with_single_section() -> None:
-    probe, spec = build_probe({"name": "sis", "kind": "section_internalization"})
-    backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
-    ctx = RunContext(backend=backend, sections=(_section("a"),))
-    result = probe.run(spec, ctx)
-    assert result.verdict == Verdict.SKIP
-
-
-def test_pass_when_each_section_gets_distinct_lift() -> None:
-    # Build a dummy backend where the ft is much lower-PPL than base on
-    # every section's content — uniform lift, but leak-check math
-    # yields ~zero differential leak so all sections pass.
-    content_a = "aaa " * 10
-    content_b = "bbb " * 10
-
-    base = DummyResponses(rolling={content_a: _rolling(-3.0), content_b: _rolling(-3.0)})
-    ft = DummyResponses(rolling={content_a: _rolling(-1.0), content_b: _rolling(-2.5)})
-    backend = DummyDifferentialBackend(base=base, ft=ft)
-
-    sections = (
-        _section("a", content=content_a),
-        _section("b", content=content_b),
-    )
-    probe, spec = build_probe(
-        {
-            "name": "sis",
-            "kind": "section_internalization",
-            "per_section_threshold": 0.05,
-        }
-    )
-    ctx = RunContext(backend=backend, sections=sections)
-    result = probe.run(spec, ctx)
-    assert result.verdict in (Verdict.PASS, Verdict.FAIL)
-    assert "per_section" in result.evidence
-    assert len(result.evidence["per_section"]) == 2
-
-
-def test_instruction_uses_logprob_of() -> None:
-    # Instruction sections contribute their probe Q/A pairs; feed
-    # logprobs so the ft view comes out cheaper than base.
-    probes_a = (SectionProbe(prompt="Qa", gold="Aa"),)
-    probes_b = (SectionProbe(prompt="Qb", gold="Ab"),)
-    base = DummyResponses(logprobs={("Qa", "Aa"): -10.0, ("Qb", "Ab"): -10.0})
-    ft = DummyResponses(logprobs={("Qa", "Aa"): -3.0, ("Qb", "Ab"): -8.0})
-    backend = DummyDifferentialBackend(base=base, ft=ft)
-
-    sections = (
-        _section("a", kind="instruction", content="...", probes=probes_a),
-        _section("b", kind="instruction", content="...", probes=probes_b),
-    )
-    probe, spec = build_probe(
-        {"name": "sis", "kind": "section_internalization", "per_section_threshold": 0.05}
-    )
-    ctx = RunContext(backend=backend, sections=sections)
-    result = probe.run(spec, ctx)
-    per = result.evidence["per_section"]
-    # Section A got much more lift than B, so effective_sis(a) > effective_sis(b).
-    sis_by_id = {row["section_id"]: row["effective_sis"] for row in per}
-    assert sis_by_id["a"] > sis_by_id["b"]
sway/tests/unit/test_probe_style_fingerprint.py deleted
@@ -1,115 +0,0 @@
-"""Tests for :mod:`dlm_sway.probes.style_fingerprint`."""
-
-from __future__ import annotations
-
-import numpy as np
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.result import Verdict
-from dlm_sway.probes.base import RunContext, build_probe
-from dlm_sway.probes.style_fingerprint import fingerprint
-
-
-class TestFingerprint:
-    def test_zero_vector_for_empty(self) -> None:
-        fp = fingerprint("")
-        assert fp.shape == (6,)
-        assert np.allclose(fp, 0.0)
-
-    def test_non_zero_for_normal_text(self) -> None:
-        fp = fingerprint("This is a sentence. This is another one. A third.")
-        assert fp.shape == (6,)
-        assert fp[0] > 0  # mean sentence length
-        assert fp[2] > 0  # TTR
-        assert fp[3] > 0  # avg word length
-
-    def test_distinct_styles_distinct_fingerprints(self) -> None:
-        terse = "Go. Now. Quick."
-        verbose = (
-            "We must, with all deliberate speed and measured consideration, "
-            "proceed expeditiously towards the elaborated and carefully "
-            "constructed resolution of the foregoing matter."
-        )
-        assert not np.allclose(fingerprint(terse), fingerprint(verbose))
-
-
-def _backend_with_samples(base: list[str], ft: list[str]) -> DummyDifferentialBackend:
-    return DummyDifferentialBackend(
-        base=DummyResponses(generations={f"p{i}": s for i, s in enumerate(base)}),
-        ft=DummyResponses(generations={f"p{i}": s for i, s in enumerate(ft)}),
-    )
-
-
-class TestProbe:
-    def test_pass_when_ft_drifts_toward_doc(self) -> None:
-        base_samples = ["Short. Plain. Words."] * 2
-        ft_samples = [
-            "Wherein many clauses conjoin themselves, through extended "
-            "ruminations, unto a meandering whole of considerable length."
-        ] * 2
-        doc = (
-            "Wherein many clauses conjoin themselves, through extended "
-            "ruminations, unto a meandering whole of considerable length. "
-            "Further elaboration, no less copious, follows apace."
-        )
-        backend = _backend_with_samples(base_samples, ft_samples)
-        probe, spec = build_probe(
-            {
-                "name": "c1",
-                "kind": "style_fingerprint",
-                "prompts": ["p0", "p1"],
-                "doc_reference": doc,
-                "max_new_tokens": 32,
-                "assert_shift_gte": 0.2,
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.PASS
-        assert result.raw is not None
-        assert result.raw > 0.2
-
-    def test_fail_when_no_stylistic_shift(self) -> None:
-        base_samples = ["Short. Plain. Words."] * 2
-        ft_samples = ["Short. Plain. Words."] * 2
-        doc = "Wherein clauses conjoin into meandering wholes of length."
-        backend = _backend_with_samples(base_samples, ft_samples)
-        probe, spec = build_probe(
-            {
-                "name": "c1",
-                "kind": "style_fingerprint",
-                "prompts": ["p0", "p1"],
-                "doc_reference": doc,
-                "assert_shift_gte": 0.25,
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.FAIL
-
-    def test_skip_without_doc_reference(self) -> None:
-        backend = _backend_with_samples(["x"], ["y"])
-        probe, spec = build_probe(
-            {
-                "name": "c1",
-                "kind": "style_fingerprint",
-                "prompts": ["p0"],
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.SKIP
-
-    def test_error_on_empty_prompts(self) -> None:
-        backend = _backend_with_samples([], [])
-        probe, spec = build_probe(
-            {
-                "name": "c1",
-                "kind": "style_fingerprint",
-                "prompts": [],
-                "doc_reference": "doc",
-            }
-        )
-        ctx = RunContext(backend=backend)
-        result = probe.run(spec, ctx)
-        assert result.verdict == Verdict.ERROR
sway/tests/unit/test_result.py deleted
@@ -1,82 +0,0 @@
-"""Tests for :mod:`dlm_sway.core.result`."""
-
-from __future__ import annotations
-
-from dataclasses import FrozenInstanceError
-
-import pytest
-
-from dlm_sway.core.result import (
-    DEFAULT_COMPONENT_WEIGHTS,
-    ProbeResult,
-    SuiteResult,
-    SwayScore,
-    Verdict,
-    utcnow,
-)
-
-
-class TestVerdict:
-    def test_is_str_enum(self) -> None:
-        assert Verdict.PASS.value == "pass"
-        assert str(Verdict.WARN.value) == "warn"
-
-    def test_all_expected_members(self) -> None:
-        assert {v.value for v in Verdict} == {
-            "pass",
-            "fail",
-            "warn",
-            "skip",
-            "error",
-        }
-
-
-class TestProbeResult:
-    def test_minimum_construction(self) -> None:
-        r = ProbeResult(name="t", kind="delta_kl", verdict=Verdict.PASS, score=0.82)
-        assert r.raw is None
-        assert r.evidence == {}
-        assert r.message == ""
-        assert r.duration_s == 0.0
-
-    def test_frozen(self) -> None:
-        r = ProbeResult(name="t", kind="t", verdict=Verdict.PASS, score=0.5)
-        with pytest.raises(FrozenInstanceError):
-            r.score = 0.6  # type: ignore[misc]
-
-
-class TestSuiteResult:
-    def test_wall_seconds(self) -> None:
-        from datetime import timedelta
-
-        started = utcnow()
-        finished = started + timedelta(seconds=2, milliseconds=500)
-        result = SuiteResult(
-            spec_path="sway.yaml",
-            started_at=started,
-            finished_at=finished,
-            base_model_id="b",
-            adapter_id="a",
-            sway_version="0.1.0.dev0",
-        )
-        assert result.wall_seconds == pytest.approx(2.5, abs=1e-6)
-
-
-class TestSwayScore:
-    def test_default_weights_sum_to_one(self) -> None:
-        assert abs(sum(DEFAULT_COMPONENT_WEIGHTS.values()) - 1.0) < 1e-9
-
-    def test_band_boundaries(self) -> None:
-        assert SwayScore.band_for(0.0) == "noise"
-        assert SwayScore.band_for(0.29) == "noise"
-        assert SwayScore.band_for(0.30) == "partial"
-        assert SwayScore.band_for(0.59) == "partial"
-        assert SwayScore.band_for(0.60) == "healthy"
-        assert SwayScore.band_for(0.85) == "healthy"
-        assert SwayScore.band_for(0.851) == "suspicious"
-        assert SwayScore.band_for(0.99) == "suspicious"
-
-
-def test_utcnow_is_tz_aware() -> None:
-    now = utcnow()
-    assert now.tzinfo is not None
sway/tests/unit/test_scoring.py (deleted)
@@ -1,84 +0,0 @@
-"""Tests for :mod:`dlm_sway.core.scoring`."""
-
-from __future__ import annotations
-
-import math
-
-import numpy as np
-
-from dlm_sway.core.scoring import (
-    DifferentialBackend,
-    RollingLogprob,
-    ScoringBackend,
-    TokenDist,
-)
-
-
-class TestRollingLogprob:
-    def test_empty_sequence(self) -> None:
-        r = RollingLogprob(
-            token_ids=np.array([42], dtype=np.int64),
-            logprobs=np.array([], dtype=np.float32),
-            num_tokens=1,
-            total_logprob=0.0,
-        )
-        assert r.mean_logprob == 0.0
-        assert r.perplexity == 1.0
-
-    def test_mean_and_perplexity(self) -> None:
-        # Three tokens, two transition logprobs summing to -4.0 → mean -2.0.
-        r = RollingLogprob(
-            token_ids=np.array([1, 2, 3], dtype=np.int64),
-            logprobs=np.array([-1.5, -2.5], dtype=np.float32),
-            num_tokens=3,
-            total_logprob=-4.0,
-        )
-        assert math.isclose(r.mean_logprob, -2.0, rel_tol=1e-6)
-        assert math.isclose(r.perplexity, math.exp(2.0), rel_tol=1e-6)
-
-
-class TestTokenDist:
-    def test_construction_and_defaults(self) -> None:
-        dist = TokenDist(
-            token_ids=np.array([1, 2, 3], dtype=np.int64),
-            logprobs=np.array([-0.1, -1.0, -3.0], dtype=np.float32),
-            vocab_size=50_257,
-        )
-        assert dist.tail_logprob == 0.0
-        assert dist.token_ids.shape == (3,)
-
-
-class TestProtocols:
-    def test_scoring_backend_runtime_checkable(self) -> None:
-        class FakeScoring:
-            def logprob_of(self, prompt: str, completion: str) -> float:
-                return 0.0
-
-            def rolling_logprob(self, text: str) -> RollingLogprob:
-                return RollingLogprob(
-                    token_ids=np.array([0], dtype=np.int64),
-                    logprobs=np.array([], dtype=np.float32),
-                    num_tokens=1,
-                    total_logprob=0.0,
-                )
-
-            def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
-                return TokenDist(
-                    token_ids=np.array([0], dtype=np.int64),
-                    logprobs=np.array([0.0], dtype=np.float32),
-                    vocab_size=1,
-                )
-
-        assert isinstance(FakeScoring(), ScoringBackend)
-
-    def test_differential_backend_runtime_checkable(self) -> None:
-        from contextlib import nullcontext
-
-        class FakeDiff:
-            def as_base(self):  # type: ignore[no-untyped-def]
-                return nullcontext(object())
-
-            def as_finetuned(self):  # type: ignore[no-untyped-def]
-                return nullcontext(object())
-
-        assert isinstance(FakeDiff(), DifferentialBackend)
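The two RollingLogprob tests above imply the arithmetic: the mean is taken over the transition logprobs (one fewer than the token count), and perplexity is exp(-mean), so an empty transition array yields mean 0.0 and perplexity 1.0. A sketch consistent with that contract; the field handling is an assumption and the real dlm_sway.core.scoring class may differ:

import math
from dataclasses import dataclass

import numpy as np

# Sketch inferred from the deleted tests; mirrors RollingLogprob's observed
# behaviour but the implementation details are assumptions.
@dataclass(frozen=True)
class RollingLogprobSketch:
    token_ids: np.ndarray
    logprobs: np.ndarray      # one logprob per token transition
    num_tokens: int
    total_logprob: float

    @property
    def mean_logprob(self) -> float:
        n = len(self.logprobs)
        return self.total_logprob / n if n else 0.0  # no transitions -> 0.0

    @property
    def perplexity(self) -> float:
        return math.exp(-self.mean_logprob)  # exp(0) == 1.0 for empty input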
sway/tests/unit/test_sections.py (deleted)
@@ -1,35 +0,0 @@
-"""Tests for :mod:`dlm_sway.core.sections`."""
-
-from __future__ import annotations
-
-from dlm_sway.core.sections import (
-    Section,
-    SectionPreference,
-    SectionProbe,
-    filter_kinds,
-)
-
-
-def test_default_field_types() -> None:
-    s = Section(id="abc", kind="prose", content="hello world")
-    assert s.probes == ()
-    assert s.preferences == ()
-    assert s.tag is None
-
-
-def test_filter_kinds() -> None:
-    sections = (
-        Section(id="a", kind="prose", content="x"),
-        Section(id="b", kind="instruction", content="y"),
-        Section(id="c", kind="preference", content="z"),
-    )
-    only_prose = filter_kinds(sections, ("prose",))
-    assert len(only_prose) == 1
-    assert only_prose[0].id == "a"
-
-
-def test_section_probe_and_preference() -> None:
-    p = SectionProbe(prompt="Q", gold="A")
-    assert p.prompt == "Q"
-    pref = SectionPreference(prompt="P", chosen="good", rejected="bad")
-    assert pref.chosen == "good"
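test_filter_kinds pins down only the filtering behaviour, and a one-liner satisfies it. A hypothetical sketch; the deleted dlm_sway.core.sections version may have validated kinds or preserved other invariants:

# Hypothetical filter_kinds consistent with the test above: keep only
# sections whose kind appears in the given tuple, preserving order.
def filter_kinds(sections, kinds):
    return tuple(s for s in sections if s.kind in kinds)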
sway/tests/unit/test_suite_runner.py (deleted)
@@ -1,134 +0,0 @@
-"""Tests for :mod:`dlm_sway.suite.runner`.
-
-Uses the dummy backend + ad-hoc probe classes so nothing real is loaded.
-"""
-
-from __future__ import annotations
-
-from typing import Literal
-
-import pytest
-
-from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
-from dlm_sway.core.errors import ProbeError
-from dlm_sway.core.result import ProbeResult, Verdict
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
-from dlm_sway.suite.runner import run
-from dlm_sway.suite.spec import SwaySpec
-
-
-class _PassSpec(ProbeSpec):
-    kind: Literal["__runner_pass"] = "__runner_pass"
-
-
-class _PassProbe(Probe):
-    kind = "__runner_pass"
-    spec_cls = _PassSpec
-    category = "adherence"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.PASS, score=0.9)
-
-
-class _FailSpec(ProbeSpec):
-    kind: Literal["__runner_fail"] = "__runner_fail"
-
-
-class _FailProbe(Probe):
-    kind = "__runner_fail"
-    spec_cls = _FailSpec
-    category = "attribution"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.FAIL, score=0.1)
-
-
-class _RaiseSpec(ProbeSpec):
-    kind: Literal["__runner_raise"] = "__runner_raise"
-
-
-class _RaiseProbe(Probe):
-    kind = "__runner_raise"
-    spec_cls = _RaiseSpec
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        raise ProbeError(spec.kind, "kaboom")
-
-
-class _UnexpectedSpec(ProbeSpec):
-    kind: Literal["__runner_unexpected"] = "__runner_unexpected"
-
-
-class _UnexpectedProbe(Probe):
-    kind = "__runner_unexpected"
-    spec_cls = _UnexpectedSpec
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        raise ValueError("surprise")
-
-
-@pytest.fixture
-def backend() -> DummyDifferentialBackend:
-    return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
-
-
-def _spec(*entries: dict) -> SwaySpec:
-    return SwaySpec.model_validate(
-        {
-            "version": 1,
-            "models": {
-                "base": {"base": "b"},
-                "ft": {"base": "b", "adapter": "/tmp/a"},
-            },
-            "suite": list(entries),
-        }
-    )
-
-
-class TestRunner:
-    def test_runs_each_probe_in_order(self, backend: DummyDifferentialBackend) -> None:
-        spec = _spec(
-            {"name": "p1", "kind": "__runner_pass"},
-            {"name": "p2", "kind": "__runner_fail"},
-        )
-        result = run(spec, backend)
-        assert [r.name for r in result.probes] == ["p1", "p2"]
-        assert result.probes[0].verdict == Verdict.PASS
-        assert result.probes[1].verdict == Verdict.FAIL
-
-    def test_disabled_probe_records_skip(self, backend: DummyDifferentialBackend) -> None:
-        spec = _spec({"name": "p1", "kind": "__runner_pass", "enabled": False})
-        result = run(spec, backend)
-        assert result.probes[0].verdict == Verdict.SKIP
-        assert "disabled" in result.probes[0].message
-
-    def test_probeerror_becomes_error_verdict(self, backend: DummyDifferentialBackend) -> None:
-        spec = _spec({"name": "oops", "kind": "__runner_raise"})
-        result = run(spec, backend)
-        assert result.probes[0].verdict == Verdict.ERROR
-        assert "kaboom" in result.probes[0].message
-
-    def test_unexpected_exception_becomes_error_verdict(
-        self, backend: DummyDifferentialBackend
-    ) -> None:
-        spec = _spec({"name": "oops", "kind": "__runner_unexpected"})
-        result = run(spec, backend)
-        assert result.probes[0].verdict == Verdict.ERROR
-        assert "ValueError" in result.probes[0].message
-
-    def test_wall_seconds_populated(self, backend: DummyDifferentialBackend) -> None:
-        spec = _spec({"name": "p1", "kind": "__runner_pass"})
-        result = run(spec, backend)
-        assert result.wall_seconds >= 0
-        assert result.probes[0].duration_s >= 0
-
-    def test_null_adapter_passes_on_null_calibrated_backend(
-        self, backend: DummyDifferentialBackend
-    ) -> None:
-        # Dummy backend implements NullCalibratedBackend, so calibration runs.
-        spec = _spec({"name": "null", "kind": "null_adapter", "runs": 2, "prompts": ["q1"]})
-        result = run(spec, backend)
-        assert result.probes[0].kind == "null_adapter"
-        assert result.probes[0].verdict == Verdict.PASS
-        # And the suite's null_stats bubbles up onto the result.
-        assert "delta_kl" in result.null_stats
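These runner tests pin down a dispatch contract: disabled probes record SKIP with "disabled" in the message, a ProbeError becomes an ERROR result carrying the probe's own message, and an unexpected exception becomes an ERROR result carrying the exception type name, all without aborting the suite. A sketch of a per-probe wrapper satisfying those assertions, using the package names imported in the test; the deleted dlm_sway.suite.runner may have been structured differently:

from dlm_sway.core.errors import ProbeError
from dlm_sway.core.result import ProbeResult, Verdict

# Hypothetical per-probe wrapper; the function name is illustrative.
def run_one(probe, spec, ctx) -> ProbeResult:
    if not getattr(spec, "enabled", True):
        return ProbeResult(name=spec.name, kind=spec.kind,
                           verdict=Verdict.SKIP, score=None,
                           message="probe disabled in spec")
    try:
        return probe.run(spec, ctx)
    except ProbeError as exc:
        # Expected failure mode: keep the probe's message ("kaboom").
        return ProbeResult(name=spec.name, kind=spec.kind,
                           verdict=Verdict.ERROR, score=None, message=str(exc))
    except Exception as exc:
        # Unexpected bugs surface with their type name ("ValueError: surprise").
        return ProbeResult(name=spec.name, kind=spec.kind,
                           verdict=Verdict.ERROR, score=None,
                           message=f"{type(exc).__name__}: {exc}")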
sway/tests/unit/test_suite_score_report.py (deleted)
@@ -1,217 +0,0 @@
-"""Tests for :mod:`dlm_sway.suite.score` + :mod:`dlm_sway.suite.report`."""
-
-from __future__ import annotations
-
-import json
-from datetime import timedelta
-from typing import Literal
-
-import pytest
-
-from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow
-from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
-from dlm_sway.suite import report, score
-from dlm_sway.suite.spec import SwaySpec
-
-
-class _AdherenceSpec(ProbeSpec):
-    kind: Literal["__score_adherence"] = "__score_adherence"
-
-
-class _AdherenceProbe(Probe):
-    kind = "__score_adherence"
-    spec_cls = _AdherenceSpec
-    category = "adherence"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        raise NotImplementedError  # never executed; registered for category lookup
-
-
-class _AttributionSpec(ProbeSpec):
-    kind: Literal["__score_attribution"] = "__score_attribution"
-
-
-class _AttributionProbe(Probe):
-    kind = "__score_attribution"
-    spec_cls = _AttributionSpec
-    category = "attribution"
-
-    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
-        raise NotImplementedError
-
-
-def _synth_suite(*probes: ProbeResult) -> SuiteResult:
-    started = utcnow()
-    return SuiteResult(
-        spec_path="sway.yaml",
-        started_at=started,
-        finished_at=started + timedelta(seconds=1),
-        base_model_id="base",
-        adapter_id="adapter",
-        sway_version="0.1.0.dev0",
-        probes=probes,
-    )
-
-
-class TestCompute:
-    def test_single_passing_probe(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
-        )
-        s = score.compute(suite)
-        assert s.overall == pytest.approx(0.8)
-        assert s.components["adherence"] == pytest.approx(0.8)
-        assert s.band == "healthy"
-
-    def test_mixed_categories_weighted(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
-            ProbeResult(name="b", kind="__score_attribution", verdict=Verdict.PASS, score=0.3),
-        )
-        s = score.compute(suite)
-        # Active categories: adherence (0.30) + attribution (0.35). Normalized.
-        expected = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35)
-        assert s.overall == pytest.approx(expected)
-
-    def test_errors_and_skips_excluded(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
-            ProbeResult(name="b", kind="__score_adherence", verdict=Verdict.SKIP, score=None),
-            ProbeResult(name="c", kind="__score_adherence", verdict=Verdict.ERROR, score=None),
-        )
-        s = score.compute(suite)
-        assert s.components["adherence"] == pytest.approx(0.9)
-
-    def test_per_probe_weights_override_uniform(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(
-                name="a",
-                kind="__score_adherence",
-                verdict=Verdict.PASS,
-                score=1.0,
-                evidence={"weight": 3.0},
-            ),
-            ProbeResult(
-                name="b",
-                kind="__score_adherence",
-                verdict=Verdict.PASS,
-                score=0.0,
-                evidence={"weight": 1.0},
-            ),
-        )
-        s = score.compute(suite)
-        # Weighted mean: (3·1 + 1·0) / 4 = 0.75
-        assert s.components["adherence"] == pytest.approx(0.75)
-
-    def test_failed_probe_surfaces_in_findings(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(
-                name="bad",
-                kind="__score_adherence",
-                verdict=Verdict.FAIL,
-                score=0.1,
-                message="nope",
-            )
-        )
-        s = score.compute(suite)
-        assert any("bad" in f for f in s.findings)
-
-
-class TestJsonReport:
-    def test_schema_fields(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(
-                name="p1",
-                kind="__score_adherence",
-                verdict=Verdict.PASS,
-                score=0.75,
-                raw=0.12,
-                z_score=3.1,
-            )
-        )
-        s = score.compute(suite)
-        out = json.loads(report.to_json(suite, s))
-        assert out["schema_version"] == 1
-        assert out["score"]["overall"] == pytest.approx(0.75)
-        assert out["probes"][0]["verdict"] == "pass"
-        assert out["probes"][0]["z_score"] == pytest.approx(3.1)
-
-
-class TestJunit:
-    def test_counts_populated(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=1.0),
-            ProbeResult(name="p2", kind="__score_adherence", verdict=Verdict.FAIL, score=0.0),
-            ProbeResult(
-                name="p3",
-                kind="__score_adherence",
-                verdict=Verdict.ERROR,
-                score=None,
-            ),
-        )
-        s = score.compute(suite)
-        xml = report.to_junit(suite, s)
-        assert 'tests="3"' in xml
-        assert 'failures="1"' in xml
-        assert 'errors="1"' in xml
-        assert "<failure" in xml
-        assert "<error" in xml
-
-
-class TestMarkdown:
-    def test_contains_probe_table(self) -> None:
-        suite = _synth_suite(
-            ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
-        )
-        s = score.compute(suite)
-        md = report.to_markdown(suite, s)
-        assert "dlm-sway report" in md
-        assert "| p1 | `__score_adherence`" in md
-
-
-class TestTerminal:
-    def test_renders_without_error(self) -> None:
-        import io
-
-        from rich.console import Console
-
-        suite = _synth_suite(
-            ProbeResult(
-                name="p1",
-                kind="__score_adherence",
-                verdict=Verdict.PASS,
-                score=0.8,
-                raw=0.12,
-                z_score=3.1,
-                message="looks fine",
-            ),
-            ProbeResult(
-                name="p2",
-                kind="__score_attribution",
-                verdict=Verdict.FAIL,
-                score=0.1,
-                message="a very long message that will be truncated — " * 5,
-            ),
-            ProbeResult(
-                name="p3",
-                kind="__score_adherence",
-                verdict=Verdict.SKIP,
-                score=None,
-            ),
-        )
-        s = score.compute(suite)
-        buf = io.StringIO()
-        console = Console(file=buf, force_terminal=False, width=120)
-        report.to_terminal(suite, s, console=console)
-        out = buf.getvalue()
-        assert "dlm-sway report" in out
-        assert "overall:" in out
-        assert "p1" in out
-        assert "p2" in out
-        # Top findings section kicks in because p2 failed.
-        assert "top findings" in out
-
-
-# Force the SwaySpec model to stay reachable from tests (keeps mypy happy
-# on the eventual CLI path that calls into both).
-assert SwaySpec is not None
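The comments in TestCompute give the whole scoring rule: within a category, probe scores are averaged with per-probe evidence weights (uniform by default, with errored and skipped probes excluded); across categories, components are combined with the category weights and renormalised over the categories that actually ran. Worked numbers for the two weighted tests, assuming the 0.30/0.35 category weights stated in the test comment:

# Mixed categories: adherence 0.9 (weight 0.30), attribution 0.3 (weight 0.35),
# renormalised over the two active categories only.
overall = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35)
print(round(overall, 6))  # 0.576923 — lands in the "partial" band (< 0.60)

# Per-probe weights inside one category: plain weighted mean.
component = (3.0 * 1.0 + 1.0 * 0.0) / (3.0 + 1.0)
print(component)  # 0.75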
sway/tests/unit/test_suite_spec.py (deleted)
@@ -1,85 +0,0 @@
-"""Tests for :mod:`dlm_sway.suite.spec` + :mod:`dlm_sway.suite.loader`."""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-import pytest
-
-from dlm_sway.core.errors import SpecValidationError
-from dlm_sway.suite.loader import from_dict, load_spec
-from dlm_sway.suite.spec import SwaySpec
-
-
-def _minimum_valid() -> dict:
-    return {
-        "version": 1,
-        "models": {
-            "base": {"kind": "hf", "base": "HuggingFaceTB/SmolLM2-135M-Instruct"},
-            "ft": {
-                "kind": "hf",
-                "base": "HuggingFaceTB/SmolLM2-135M-Instruct",
-                "adapter": "/tmp/adapter",
-            },
-        },
-        "suite": [],
-    }
-
-
-class TestSwaySpec:
-    def test_minimum_valid(self) -> None:
-        spec = from_dict(_minimum_valid())
-        assert isinstance(spec, SwaySpec)
-        assert spec.version == 1
-        assert spec.defaults.seed == 0
-        assert spec.defaults.differential is True
-        assert spec.suite == []
-
-    def test_rejects_unknown_top_level_keys(self) -> None:
-        data = _minimum_valid()
-        data["bogus"] = True
-        with pytest.raises(SpecValidationError) as exc_info:
-            from_dict(data)
-        assert "bogus" in str(exc_info.value).lower()
-
-    def test_rejects_future_version(self) -> None:
-        data = _minimum_valid()
-        data["version"] = 9
-        with pytest.raises(SpecValidationError, match="unsupported sway spec version"):
-            from_dict(data)
-
-    def test_defaults_frozen(self) -> None:
-        spec = from_dict(_minimum_valid())
-        from pydantic import ValidationError
-
-        with pytest.raises(ValidationError):
-            spec.defaults.seed = 99  # type: ignore[misc]
-
-
-class TestLoader:
-    def test_missing_file(self, tmp_path: Path) -> None:
-        missing = tmp_path / "nope.yaml"
-        with pytest.raises(SpecValidationError, match="not found"):
-            load_spec(missing)
-
-    def test_invalid_yaml(self, tmp_path: Path) -> None:
-        bad = tmp_path / "bad.yaml"
-        # An unmatched { triggers yaml.scanner; a structurally ambiguous
-        # indent parses as a string value, which isn't a YAML error.
-        bad.write_text("{ unmatched: [", encoding="utf-8")
-        with pytest.raises(SpecValidationError, match="invalid YAML"):
-            load_spec(bad)
-
-    def test_non_mapping_top_level(self, tmp_path: Path) -> None:
-        bad = tmp_path / "list.yaml"
-        bad.write_text("- 1\n- 2\n", encoding="utf-8")
-        with pytest.raises(SpecValidationError, match="must be a mapping"):
-            load_spec(bad)
-
-    def test_roundtrip_via_yaml(self, tmp_path: Path) -> None:
-        import yaml
-
-        path = tmp_path / "sway.yaml"
-        path.write_text(yaml.safe_dump(_minimum_valid()), encoding="utf-8")
-        spec = load_spec(path)
-        assert spec.models.ft.adapter == Path("/tmp/adapter")
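TestLoader pins down three failure modes, each normalised to SpecValidationError with a matchable message ("not found", "invalid YAML", "must be a mapping"). A sketch of a loader that satisfies them, assuming SpecValidationError accepts a plain message and that from_dict performs the pydantic validation; the deleted dlm_sway.suite.loader may have worded things differently:

from pathlib import Path

import yaml

from dlm_sway.core.errors import SpecValidationError
from dlm_sway.suite.loader import from_dict

# Hypothetical loader sketch; error wording chosen to satisfy the tests above.
def load_spec_sketch(path: Path):
    if not path.is_file():
        raise SpecValidationError(f"spec not found: {path}")
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
    except yaml.YAMLError as exc:
        raise SpecValidationError(f"invalid YAML in {path}: {exc}") from exc
    if not isinstance(data, dict):
        raise SpecValidationError("sway spec top level must be a mapping")
    return from_dict(data)  # delegates schema validation to the spec model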
sway/tests/unit/test_visualize.py (deleted)
@@ -1,202 +0,0 @@
-"""Tests for :mod:`dlm_sway.visualize`.
-
-Exercises the error path (matplotlib missing) and the happy path when
-the module is present by stubbing ``matplotlib.pyplot`` via sys.modules.
-"""
-
-from __future__ import annotations
-
-import sys
-import types
-from datetime import timedelta
-
-import pytest
-
-from dlm_sway.core.errors import BackendNotAvailableError
-from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow
-
-
-def _suite_with(*probes: ProbeResult) -> SuiteResult:
-    started = utcnow()
-    return SuiteResult(
-        spec_path="sway.yaml",
-        started_at=started,
-        finished_at=started + timedelta(seconds=1),
-        base_model_id="b",
-        adapter_id="a",
-        sway_version="0.1.0.dev0",
-        probes=probes,
-    )
-
-
-class _FakeFig:
-    def tight_layout(self) -> None:  # pragma: no cover — trivial
-        return None
-
-
-class _FakeAx:
-    def __init__(self) -> None:
-        self.calls: list[str] = []
-
-    def bar(self, *a, **k):  # type: ignore[no-untyped-def]
-        self.calls.append("bar")
-
-    def plot(self, *a, **k):  # type: ignore[no-untyped-def]
-        self.calls.append("plot")
-
-    def hist(self, *a, **k):  # type: ignore[no-untyped-def]
-        self.calls.append("hist")
-
-    def axhline(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def axvline(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_xticks(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_xticklabels(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_xlabel(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_ylabel(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def set_title(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-    def legend(self, *a, **k):  # type: ignore[no-untyped-def]
-        return None
-
-
-@pytest.fixture
-def fake_mpl(monkeypatch: pytest.MonkeyPatch) -> _FakeAx:
-    ax = _FakeAx()
-
-    def _subplots(*a, **k):  # type: ignore[no-untyped-def]
-        return _FakeFig(), ax
-
-    plt = types.ModuleType("matplotlib.pyplot")
-    plt.subplots = _subplots  # type: ignore[attr-defined]
-    mpl_pkg = types.ModuleType("matplotlib")
-    monkeypatch.setitem(sys.modules, "matplotlib", mpl_pkg)
-    monkeypatch.setitem(sys.modules, "matplotlib.pyplot", plt)
-    return ax
-
-
-def test_section_sis_plot_uses_per_section_evidence(fake_mpl: _FakeAx) -> None:
-    from dlm_sway.visualize import plot_section_sis
-
-    suite = _suite_with(
-        ProbeResult(
-            name="sis",
-            kind="section_internalization",
-            verdict=Verdict.PASS,
-            score=0.75,
-            raw=0.1,
-            evidence={
-                "per_section": [
-                    {
-                        "section_id": "a",
-                        "kind": "prose",
-                        "tag": None,
-                        "base_nll": 3.0,
-                        "ft_nll": 2.5,
-                        "own_lift": 0.17,
-                        "leak_lift": 0.02,
-                        "effective_sis": 0.15,
-                        "passed": True,
-                    },
-                    {
-                        "section_id": "b",
-                        "kind": "instruction",
-                        "tag": "intro",
-                        "base_nll": 4.0,
-                        "ft_nll": 3.9,
-                        "own_lift": 0.025,
-                        "leak_lift": 0.03,
-                        "effective_sis": -0.005,
-                        "passed": False,
-                    },
-                ],
-                "per_section_threshold": 0.05,
-            },
-        )
-    )
-    plot_section_sis(suite)
-    assert "bar" in fake_mpl.calls
-
-
-def test_adapter_ablation_plot(fake_mpl: _FakeAx) -> None:
-    from dlm_sway.visualize import plot_adapter_ablation
-
-    suite = _suite_with(
-        ProbeResult(
-            name="abl",
-            kind="adapter_ablation",
-            verdict=Verdict.PASS,
-            score=0.8,
-            raw=0.9,
-            evidence={
-                "lambdas": [0.0, 0.5, 1.0, 1.25],
-                "mean_divergence_per_lambda": [0.0, 0.5, 1.0, 1.1],
-                "linearity": 0.91,
-                "saturation_lambda": 0.75,
-                "overshoot": 1.1,
-            },
-        )
-    )
-    plot_adapter_ablation(suite)
-    assert "plot" in fake_mpl.calls
-
-
-def test_kl_histogram_plot(fake_mpl: _FakeAx) -> None:
-    from dlm_sway.visualize import plot_kl_histogram
-
-    suite = _suite_with(
-        ProbeResult(
-            name="dk",
-            kind="delta_kl",
-            verdict=Verdict.PASS,
-            score=0.7,
-            raw=0.1,
-            evidence={"per_prompt": [0.05, 0.1, 0.12, 0.09, 0.15], "divergence_kind": "js"},
-        )
-    )
-    plot_kl_histogram(suite)
-    assert "hist" in fake_mpl.calls
-
-
-def test_raises_when_matplotlib_missing(monkeypatch: pytest.MonkeyPatch) -> None:
-    # Purge matplotlib modules and block imports.
-    for mod in list(sys.modules):
-        if mod == "matplotlib" or mod.startswith("matplotlib."):
-            monkeypatch.delitem(sys.modules, mod, raising=False)
-
-    import builtins
-
-    real_import = builtins.__import__
-
-    def fake_import(name: str, *a, **k):  # type: ignore[no-untyped-def]
-        if name == "matplotlib" or name.startswith("matplotlib."):
-            raise ImportError("matplotlib missing in this venv")
-        return real_import(name, *a, **k)
-
-    monkeypatch.setattr(builtins, "__import__", fake_import)
-
-    from dlm_sway.visualize import plot_section_sis
-
-    suite = _suite_with()
-    with pytest.raises(BackendNotAvailableError):
-        plot_section_sis(suite)
-
-
-def test_raises_when_no_matching_probe(fake_mpl: _FakeAx) -> None:
-    from dlm_sway.visualize import plot_section_sis
-
-    suite = _suite_with()  # empty — no section_internalization probe
-    with pytest.raises(ValueError, match="section_internalization"):
-        plot_section_sis(suite)
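The error-path test above implies that dlm_sway.visualize imports matplotlib lazily, inside the plot functions, and converts ImportError into BackendNotAvailableError (which is why the sys.modules stubbing in fake_mpl works at all). A sketch of that guard, assuming the exception accepts a plain message; the real module, now in the submodule, may phrase it differently:

from dlm_sway.core.errors import BackendNotAvailableError

# Hypothetical lazy-import guard matching the behaviour the test asserts:
# matplotlib is only touched when a plot function runs, so the dependency
# stays optional.
def _require_pyplot():
    try:
        import matplotlib.pyplot as plt
    except ImportError as exc:
        raise BackendNotAvailableError(
            "matplotlib is required for dlm_sway.visualize plots"
        ) from exc
    return plt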