sway: convert in-tree subproject to git submodule pointing at tenseleyFlow/sway
- SHA
72bb0030b72321dea3c66a2e6d7ce26e52c74550- Parents
-
9da4019 - Tree
e628ba5
72bb003
72bb0030b72321dea3c66a2e6d7ce26e52c745509da4019
e628ba5.gitmodulesmodified@@ -5,3 +5,6 @@ | |||
| 5 | # `scripts/bump-llama-cpp.sh build` writes under vendor/llama.cpp/build/ | 5 | # `scripts/bump-llama-cpp.sh build` writes under vendor/llama.cpp/build/ |
| 6 | # which the submodule's own .gitignore covers. | 6 | # which the submodule's own .gitignore covers. |
| 7 | ignore = untracked | 7 | ignore = untracked |
| 8 | +[submodule "sway"] | ||
| 9 | + path = sway | ||
| 10 | + url = https://github.com/tenseleyFlow/sway.git | ||
swayadded@@ -0,0 +1,1 @@ | |||
| 1 | +Subproject commit 98ad9417c94e1bbeb97cf5e553878d7953513f69 | ||
sway/CHANGELOG.mddeleted@@ -1,41 +0,0 @@ | |||
| 1 | -# Changelog | ||
| 2 | - | ||
| 3 | -## 0.1.0.dev0 — 2026-04-20 | ||
| 4 | - | ||
| 5 | -Initial pre-alpha. Full 11-primitive battery shipped. | ||
| 6 | - | ||
| 7 | -### Primitives | ||
| 8 | - | ||
| 9 | -- **Adherence** | ||
| 10 | - - `delta_kl` — mean JS/KL divergence between base and fine-tuned next-token distributions | ||
| 11 | - - `adapter_revert` — reversion under adversarial paraphrase (needs `sway-eval[semsim]`) | ||
| 12 | - - `prompt_collapse` — exponential-decay fit of divergence over context length | ||
| 13 | -- **Attribution** | ||
| 14 | - - `section_internalization` *(flagship)* — per-section `effective_sis` with leak check | ||
| 15 | - - `paraphrase_invariance` — memorization vs. generalization, intent-aware | ||
| 16 | - - `preference_flip` — DPO/ORPO chosen/rejected margin inversion | ||
| 17 | -- **Calibration** | ||
| 18 | - - `style_fingerprint` — 6-dim numpy-only stylistic shift vs. document | ||
| 19 | - - `calibration_drift` — general-knowledge regression on a packaged 30-item pack | ||
| 20 | - - `leakage` — greedy LCS recall + perturbation fragility | ||
| 21 | -- **Ablation** | ||
| 22 | - - `adapter_ablation` *(signature primitive)* — λ-scaled divergence curve with linearity, saturation, overshoot metrics | ||
| 23 | -- **Baseline** | ||
| 24 | - - `null_adapter` — stats scaffolding for z-score calibration (implementation pending) | ||
| 25 | - | ||
| 26 | -### Infrastructure | ||
| 27 | - | ||
| 28 | -- `DifferentialBackend` + `ScalableDifferentialBackend` protocols | ||
| 29 | -- HuggingFace + PEFT backend with `disable_adapter` / `set_adapter` toggling and LoRA-scale mutation | ||
| 30 | -- Dummy backend for unit tests (canned responses + linear-blend scalable mode) | ||
| 31 | -- YAML spec loader, composite score (four-category weighted), rich terminal + JSON + JUnit + Markdown reports | ||
| 32 | -- Typer CLI: `run`, `gate`, `check`, `diff`, `autogen`, `doctor`, `report` | ||
| 33 | -- `.dlm` bridge (`dlm-sway[dlm]`): resolver + full-battery autogen | ||
| 34 | -- Matplotlib visualizations (`dlm-sway[viz]`): SIS bar chart, ablation curve, KL histogram | ||
| 35 | - | ||
| 36 | -### Known gaps | ||
| 37 | - | ||
| 38 | -- Null-adapter baseline is scaffolded but its HF-level materialization (building random-init LoRAs at matched rank) is not yet wired — probes fall back to fixed thresholds until the next milestone. | ||
| 39 | -- Custom backend entry-point dispatch (`kind: custom`) is stubbed but not implemented. | ||
| 40 | -- MLX backend is registered as a future-milestone target; all MLX paths raise `BackendNotAvailableError`. | ||
| 41 | -- PyPI publication of the `dlm-sway` wheel is pending a clean CI release workflow. | ||
sway/LICENSEdeleted@@ -1,21 +0,0 @@ | |||
| 1 | -MIT License | ||
| 2 | - | ||
| 3 | -Copyright (c) 2026 Matt Wolffe | ||
| 4 | - | ||
| 5 | -Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 6 | -of this software and associated documentation files (the "Software"), to deal | ||
| 7 | -in the Software without restriction, including without limitation the rights | ||
| 8 | -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 9 | -copies of the Software, and to permit persons to whom the Software is | ||
| 10 | -furnished to do so, subject to the following conditions: | ||
| 11 | - | ||
| 12 | -The above copyright notice and this permission notice shall be included in all | ||
| 13 | -copies or substantial portions of the Software. | ||
| 14 | - | ||
| 15 | -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 16 | -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 17 | -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 18 | -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 19 | -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 20 | -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 21 | -SOFTWARE. | ||
sway/README.mddeleted@@ -1,101 +0,0 @@ | |||
| 1 | -# dlm-sway | ||
| 2 | - | ||
| 3 | -Differential testing for fine-tuned causal language models. | ||
| 4 | - | ||
| 5 | -**One question:** *did LoRA/QLoRA training actually change model behavior | ||
| 6 | -in a meaningful way, or is the model just defaulting to the pretrained | ||
| 7 | -base?* | ||
| 8 | - | ||
| 9 | -`dlm-sway` gives you a trustworthy, reproducible answer with eleven | ||
| 10 | -purpose-built primitives, each z-scored against a null-adapter baseline. | ||
| 11 | -No LLM judges. No external APIs. Deterministic on CPU where possible. | ||
| 12 | - | ||
| 13 | -## Install | ||
| 14 | - | ||
| 15 | -```bash | ||
| 16 | -pip install "dlm-sway[hf]" # HuggingFace + PEFT backend | ||
| 17 | -pip install "dlm-sway[hf,style,semsim]" # full primitive battery | ||
| 18 | -pip install "dlm-sway[all]" # everything including optional viz | ||
| 19 | -pip install "dlm-sway[dlm]" # auto-generate tests from a .dlm file | ||
| 20 | -``` | ||
| 21 | - | ||
| 22 | -## 90-second smoke test | ||
| 23 | - | ||
| 24 | -```bash | ||
| 25 | -dlm-sway check path/to/adapter --base HuggingFaceTB/SmolLM2-135M-Instruct | ||
| 26 | -``` | ||
| 27 | - | ||
| 28 | -Outputs a verdict in under a minute on CPU for small models: *your | ||
| 29 | -adapter is 4.2σ above noise* ✅ or *indistinguishable from a null | ||
| 30 | -adapter* ❌. | ||
| 31 | - | ||
| 32 | -## Full suite | ||
| 33 | - | ||
| 34 | -```yaml | ||
| 35 | -# sway.yaml | ||
| 36 | -version: 1 | ||
| 37 | -models: | ||
| 38 | - base: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct"} | ||
| 39 | - ft: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct", | ||
| 40 | - adapter: "./runs/adapter/v0003"} | ||
| 41 | -suite: | ||
| 42 | - - {name: knows_concept, kind: dir, | ||
| 43 | - prompt: "The Dunning-Kruger effect describes", | ||
| 44 | - target: " a cognitive bias where", | ||
| 45 | - distractor: " a programming language"} | ||
| 46 | - - {name: no_reversion, kind: adapter_revert, paraphrases: 4} | ||
| 47 | - - {name: section_attribution, kind: section_internalization} | ||
| 48 | -``` | ||
| 49 | - | ||
| 50 | -```bash | ||
| 51 | -dlm-sway run sway.yaml # full report to terminal + JSON | ||
| 52 | -dlm-sway gate sway.yaml --junit # CI-friendly; non-zero on fail | ||
| 53 | -``` | ||
| 54 | - | ||
| 55 | -## Why it exists | ||
| 56 | - | ||
| 57 | -Standard benchmarks (MMLU, HellaSwag) ask *"how good is this model?"* | ||
| 58 | -That's the wrong question after a targeted LoRA fine-tune on a small | ||
| 59 | -user-authored document. The right question is *"did the adapter actually | ||
| 60 | -move the model toward what I wrote?"* — and existing tools answer this | ||
| 61 | -poorly. | ||
| 62 | - | ||
| 63 | -`dlm-sway` answers it directly via eleven primitives across four | ||
| 64 | -categories: | ||
| 65 | - | ||
| 66 | -| Category | Primitives | | ||
| 67 | -|---------------|-------------------------------------------------------| | ||
| 68 | -| Adherence | `delta_kl`, `adapter_revert`, `prompt_collapse` | | ||
| 69 | -| Attribution | `section_internalization`, `paraphrase_invariance`, `preference_flip` | | ||
| 70 | -| Calibration | `style_fingerprint`, `calibration_drift`, `leakage` | | ||
| 71 | -| Ablation | `adapter_ablation` ← the signature primitive | | ||
| 72 | - | ||
| 73 | -**The signature primitive.** `adapter_ablation` scales the LoRA additive | ||
| 74 | -term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} and measures the divergence | ||
| 75 | -curve. A healthy fine-tune shows a smooth, monotonic, non-saturated | ||
| 76 | -response. A degenerate one shows a step function or an overshoot-then- | ||
| 77 | -crash. Nobody else does this because nobody else gets this close to the | ||
| 78 | -adapter math. | ||
| 79 | - | ||
| 80 | -## The `.dlm` integration | ||
| 81 | - | ||
| 82 | -If you trained your adapter via the [DocumentLanguageModel | ||
| 83 | -project](https://github.com/tenseleyFlow/DocumentLanguageModel), sway | ||
| 84 | -can auto-generate a test suite from your document's sections: | ||
| 85 | - | ||
| 86 | -```bash | ||
| 87 | -pip install "dlm-sway[hf,dlm]" | ||
| 88 | -dlm-sway autogen path/to/doc.dlm -o sway.yaml | ||
| 89 | -dlm-sway run sway.yaml | ||
| 90 | -``` | ||
| 91 | - | ||
| 92 | -Per-section attribution tells you *which* parts of your document | ||
| 93 | -actually moved the model — a kind of signal no other tool provides. | ||
| 94 | - | ||
| 95 | -## Status | ||
| 96 | - | ||
| 97 | -Pre-alpha. API will break. Version `0.1.0` is the first tag. | ||
| 98 | - | ||
| 99 | -## License | ||
| 100 | - | ||
| 101 | -MIT | ||
sway/pyproject.tomldeleted@@ -1,210 +0,0 @@ | |||
| 1 | -[project] | ||
| 2 | -name = "dlm-sway" | ||
| 3 | -version = "0.1.0.dev0" | ||
| 4 | -description = "Differential testing for fine-tuned causal LMs: did LoRA/QLoRA training actually change behavior, or is the model defaulting to the pretrained base?" | ||
| 5 | -readme = "README.md" | ||
| 6 | -requires-python = ">=3.11" | ||
| 7 | -license = { text = "MIT" } | ||
| 8 | -authors = [{ name = "Matt Wolffe", email = "mfwolffe@outlook.com" }] | ||
| 9 | -keywords = [ | ||
| 10 | - "lora", | ||
| 11 | - "qlora", | ||
| 12 | - "peft", | ||
| 13 | - "fine-tuning", | ||
| 14 | - "evaluation", | ||
| 15 | - "llm", | ||
| 16 | - "differential-testing", | ||
| 17 | -] | ||
| 18 | -classifiers = [ | ||
| 19 | - "Development Status :: 3 - Alpha", | ||
| 20 | - "Intended Audience :: Developers", | ||
| 21 | - "Intended Audience :: Science/Research", | ||
| 22 | - "License :: OSI Approved :: MIT License", | ||
| 23 | - "Programming Language :: Python :: 3", | ||
| 24 | - "Programming Language :: Python :: 3.11", | ||
| 25 | - "Programming Language :: Python :: 3.12", | ||
| 26 | - "Topic :: Scientific/Engineering :: Artificial Intelligence", | ||
| 27 | -] | ||
| 28 | - | ||
| 29 | -# Core deps: spec loading, orchestration, reporting. No torch — a user | ||
| 30 | -# who only defines specs or writes a custom backend shouldn't pull 3 GB | ||
| 31 | -# of CUDA wheels. | ||
| 32 | -dependencies = [ | ||
| 33 | - "pydantic>=2.9", | ||
| 34 | - "pyyaml>=6.0", | ||
| 35 | - "typer>=0.12", | ||
| 36 | - "rich>=13.7", | ||
| 37 | - "numpy>=1.26", | ||
| 38 | - "packaging>=24.0", | ||
| 39 | -] | ||
| 40 | - | ||
| 41 | -[project.optional-dependencies] | ||
| 42 | -# HuggingFace + PEFT scoring backend. The canonical path. | ||
| 43 | -hf = [ | ||
| 44 | - "torch>=2.4", | ||
| 45 | - "transformers>=4.45", | ||
| 46 | - "peft>=0.13", | ||
| 47 | - "safetensors>=0.4", | ||
| 48 | -] | ||
| 49 | -# Apple Silicon inference. Env markers keep `uv sync --extra mlx` a no-op | ||
| 50 | -# on non-Apple hosts so Linux/CUDA contributors' wheel resolution stays | ||
| 51 | -# sane. | ||
| 52 | -mlx = [ | ||
| 53 | - "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 54 | - "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 55 | -] | ||
| 56 | -# Stylistic fingerprinting (C1). spaCy models pull at runtime via | ||
| 57 | -# `python -m spacy download`. | ||
| 58 | -style = [ | ||
| 59 | - "spacy>=3.7", | ||
| 60 | - "textstat>=0.7", | ||
| 61 | - "nlpaug>=1.1", | ||
| 62 | -] | ||
| 63 | -# Semantic similarity (A2). MiniLM ~80 MB, CPU-friendly. | ||
| 64 | -semsim = [ | ||
| 65 | - "sentence-transformers>=3.0", | ||
| 66 | -] | ||
| 67 | -# Optional .dlm integration. Only imported inside dlm_sway.integrations.dlm. | ||
| 68 | -dlm = [ | ||
| 69 | - "dlm>=0.9", | ||
| 70 | -] | ||
| 71 | -# Visualization (P9). | ||
| 72 | -viz = [ | ||
| 73 | - "matplotlib>=3.8", | ||
| 74 | -] | ||
| 75 | -all = [ | ||
| 76 | - "torch>=2.4", | ||
| 77 | - "transformers>=4.45", | ||
| 78 | - "peft>=0.13", | ||
| 79 | - "safetensors>=0.4", | ||
| 80 | - "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 81 | - "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 82 | - "spacy>=3.7", | ||
| 83 | - "textstat>=0.7", | ||
| 84 | - "nlpaug>=1.1", | ||
| 85 | - "sentence-transformers>=3.0", | ||
| 86 | - "matplotlib>=3.8", | ||
| 87 | -] | ||
| 88 | - | ||
| 89 | -[project.scripts] | ||
| 90 | -dlm-sway = "dlm_sway.cli.app:main" | ||
| 91 | - | ||
| 92 | -[project.urls] | ||
| 93 | -Homepage = "https://github.com/tenseleyFlow/DocumentLanguageModel" | ||
| 94 | -Issues = "https://github.com/tenseleyFlow/DocumentLanguageModel/issues" | ||
| 95 | - | ||
| 96 | -[dependency-groups] | ||
| 97 | -dev = [ | ||
| 98 | - "pytest>=8.0", | ||
| 99 | - "pytest-cov>=5.0", | ||
| 100 | - "mypy>=1.11", | ||
| 101 | - "ruff>=0.6", | ||
| 102 | - "types-pyyaml>=6.0", | ||
| 103 | - "hypothesis>=6.152.1", | ||
| 104 | -] | ||
| 105 | - | ||
| 106 | -[build-system] | ||
| 107 | -requires = ["hatchling"] | ||
| 108 | -build-backend = "hatchling.build" | ||
| 109 | - | ||
| 110 | -[tool.hatch.build.targets.wheel] | ||
| 111 | -packages = ["src/dlm_sway"] | ||
| 112 | - | ||
| 113 | -# -------- ruff -------- | ||
| 114 | -[tool.ruff] | ||
| 115 | -line-length = 100 | ||
| 116 | -target-version = "py311" | ||
| 117 | -src = ["src", "tests"] | ||
| 118 | - | ||
| 119 | -[tool.ruff.lint] | ||
| 120 | -select = [ | ||
| 121 | - "E", # pycodestyle errors | ||
| 122 | - "F", # pyflakes | ||
| 123 | - "W", # pycodestyle warnings | ||
| 124 | - "I", # isort | ||
| 125 | - "UP", # pyupgrade | ||
| 126 | - "B", # bugbear | ||
| 127 | - "N", # pep8-naming | ||
| 128 | - "C4", # comprehensions | ||
| 129 | - "SIM", # simplify | ||
| 130 | - "PT", # pytest | ||
| 131 | - "RET", # return | ||
| 132 | - "ARG", # unused args | ||
| 133 | - "PTH", # use pathlib | ||
| 134 | - "TID", # tidy imports | ||
| 135 | -] | ||
| 136 | -ignore = [ | ||
| 137 | - "E501", # handled by formatter | ||
| 138 | -] | ||
| 139 | - | ||
| 140 | -[tool.ruff.lint.per-file-ignores] | ||
| 141 | -"tests/**/*.py" = ["ARG", "PT011", "SIM117"] | ||
| 142 | -# PyTorch's canonical `import torch.nn.functional as F` is universally | ||
| 143 | -# read, so we allow the naming exception in the HF backend only. | ||
| 144 | -"src/dlm_sway/backends/hf.py" = ["N812"] | ||
| 145 | -# The .dlm bridge is the one place allowed to import the ``dlm`` package. | ||
| 146 | -"src/dlm_sway/integrations/dlm/*.py" = ["TID251"] | ||
| 147 | - | ||
| 148 | -[tool.ruff.lint.flake8-tidy-imports.banned-api] | ||
| 149 | -# Hard architectural boundary: the `dlm` package is only importable | ||
| 150 | -# from inside the optional integration shim. This keeps dlm-sway | ||
| 151 | -# usable for anyone with just a HuggingFace base + PEFT adapter. | ||
| 152 | -"dlm".msg = "Import `dlm` only from dlm_sway.integrations.dlm (the optional extra)." | ||
| 153 | - | ||
| 154 | -[tool.ruff.format] | ||
| 155 | -quote-style = "double" | ||
| 156 | -indent-style = "space" | ||
| 157 | - | ||
| 158 | -# -------- mypy -------- | ||
| 159 | -[tool.mypy] | ||
| 160 | -strict = true | ||
| 161 | -python_version = "3.11" | ||
| 162 | -packages = ["dlm_sway"] | ||
| 163 | -mypy_path = "src" | ||
| 164 | -warn_return_any = true | ||
| 165 | -warn_unused_ignores = true | ||
| 166 | -warn_redundant_casts = true | ||
| 167 | -no_implicit_optional = true | ||
| 168 | -disallow_untyped_decorators = true | ||
| 169 | -plugins = ["pydantic.mypy"] | ||
| 170 | - | ||
| 171 | -[tool.pydantic-mypy] | ||
| 172 | -init_forbid_extra = true | ||
| 173 | -init_typed = true | ||
| 174 | -warn_required_dynamic_aliases = true | ||
| 175 | - | ||
| 176 | -# Stubless ML ecosystem packages. Narrow boundaries in backends/* import | ||
| 177 | -# them explicitly; the rest of the codebase stays strict. | ||
| 178 | -[[tool.mypy.overrides]] | ||
| 179 | -module = [ | ||
| 180 | - "torch", | ||
| 181 | - "torch.*", | ||
| 182 | - "transformers.*", | ||
| 183 | - "peft.*", | ||
| 184 | - "safetensors.*", | ||
| 185 | - "mlx.*", | ||
| 186 | - "mlx_lm.*", | ||
| 187 | - "sentence_transformers.*", | ||
| 188 | - "spacy.*", | ||
| 189 | - "textstat.*", | ||
| 190 | - "nlpaug.*", | ||
| 191 | - "matplotlib", | ||
| 192 | - "matplotlib.*", | ||
| 193 | - "huggingface_hub.*", | ||
| 194 | - "dlm.*", | ||
| 195 | -] | ||
| 196 | -ignore_missing_imports = true | ||
| 197 | -disable_error_code = ["no-untyped-call"] | ||
| 198 | - | ||
| 199 | -# -------- pytest -------- | ||
| 200 | -[tool.pytest.ini_options] | ||
| 201 | -testpaths = ["tests"] | ||
| 202 | -addopts = [ | ||
| 203 | - "-ra", | ||
| 204 | - "-m", "not slow and not gpu and not online", | ||
| 205 | -] | ||
| 206 | -markers = [ | ||
| 207 | - "slow: expensive; deselected by default", | ||
| 208 | - "gpu: requires CUDA; skipped on CPU/MPS runners", | ||
| 209 | - "online: touches the network; skipped in offline CI", | ||
| 210 | -] | ||
sway/src/dlm_sway/__init__.pydeleted@@ -1,42 +0,0 @@ | |||
| 1 | -"""dlm-sway — differential testing for fine-tuned causal language models.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.core.errors import ( | ||
| 6 | - BackendNotAvailableError, | ||
| 7 | - ProbeError, | ||
| 8 | - SpecValidationError, | ||
| 9 | - SwayError, | ||
| 10 | -) | ||
| 11 | -from dlm_sway.core.model import LoadedModel, Model, ModelSpec | ||
| 12 | -from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict | ||
| 13 | -from dlm_sway.core.scoring import ( | ||
| 14 | - DifferentialBackend, | ||
| 15 | - NullCalibratedBackend, | ||
| 16 | - RollingLogprob, | ||
| 17 | - ScalableDifferentialBackend, | ||
| 18 | - ScoringBackend, | ||
| 19 | - TokenDist, | ||
| 20 | -) | ||
| 21 | - | ||
| 22 | -__all__ = [ | ||
| 23 | - "BackendNotAvailableError", | ||
| 24 | - "DifferentialBackend", | ||
| 25 | - "LoadedModel", | ||
| 26 | - "Model", | ||
| 27 | - "ModelSpec", | ||
| 28 | - "NullCalibratedBackend", | ||
| 29 | - "ProbeError", | ||
| 30 | - "ProbeResult", | ||
| 31 | - "RollingLogprob", | ||
| 32 | - "ScalableDifferentialBackend", | ||
| 33 | - "ScoringBackend", | ||
| 34 | - "SpecValidationError", | ||
| 35 | - "SuiteResult", | ||
| 36 | - "SwayError", | ||
| 37 | - "SwayScore", | ||
| 38 | - "TokenDist", | ||
| 39 | - "Verdict", | ||
| 40 | -] | ||
| 41 | - | ||
| 42 | -__version__ = "0.1.0.dev0" | ||
sway/src/dlm_sway/backends/__init__.pydeleted@@ -1,118 +0,0 @@ | |||
| 1 | -"""Scoring backends: HuggingFace (``hf``), MLX (``mlx``), dummy, custom. | ||
| 2 | - | ||
| 3 | -Backends are constructed from a :class:`~dlm_sway.core.model.ModelSpec` | ||
| 4 | -via :func:`build`. Heavy backends (HF, MLX) import their framework only | ||
| 5 | -on construction so ``import dlm_sway`` stays cheap for users who only | ||
| 6 | -touch the dummy backend or the spec loader. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -from pathlib import Path | ||
| 12 | -from typing import TYPE_CHECKING | ||
| 13 | - | ||
| 14 | -from dlm_sway.core.errors import SpecValidationError | ||
| 15 | -from dlm_sway.core.model import ModelSpec | ||
| 16 | - | ||
| 17 | -if TYPE_CHECKING: | ||
| 18 | - from dlm_sway.core.scoring import DifferentialBackend | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -def build(base_spec: ModelSpec, *, adapter_path: Path | None = None) -> DifferentialBackend: | ||
| 22 | - """Materialize a differential backend from a model spec. | ||
| 23 | - | ||
| 24 | - The adapter path typically comes from ``ft.adapter`` in the spec — | ||
| 25 | - it's lifted to a keyword here so the same function can be used for | ||
| 26 | - "differential" (base + adapter on one loaded model) or future | ||
| 27 | - split-load paths. | ||
| 28 | - """ | ||
| 29 | - effective_adapter = adapter_path if adapter_path is not None else base_spec.adapter | ||
| 30 | - | ||
| 31 | - if base_spec.kind == "dummy": | ||
| 32 | - # Dummy backend isn't really about the spec — it's for tests | ||
| 33 | - # that pre-populate responses. Surface a loud error if someone | ||
| 34 | - # tries to build it through the normal path. | ||
| 35 | - raise SpecValidationError( | ||
| 36 | - "kind='dummy' backends must be constructed directly via " | ||
| 37 | - "DummyDifferentialBackend(base=..., ft=...); they cannot be " | ||
| 38 | - "materialized from a ModelSpec." | ||
| 39 | - ) | ||
| 40 | - | ||
| 41 | - if base_spec.kind == "hf": | ||
| 42 | - if effective_adapter is None: | ||
| 43 | - raise SpecValidationError( | ||
| 44 | - "hf backend requires an adapter path (set `adapter:` on the ft model)" | ||
| 45 | - ) | ||
| 46 | - from dlm_sway.backends.hf import HuggingFaceDifferentialBackend | ||
| 47 | - | ||
| 48 | - return HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter) | ||
| 49 | - | ||
| 50 | - if base_spec.kind == "mlx": | ||
| 51 | - if effective_adapter is None: | ||
| 52 | - raise SpecValidationError( | ||
| 53 | - "mlx backend requires an adapter path (set `adapter:` on the ft model; " | ||
| 54 | - "must be an MLX .npz adapter — use dlm's peft→mlx converter if needed)" | ||
| 55 | - ) | ||
| 56 | - from dlm_sway.backends.mlx import MLXDifferentialBackend | ||
| 57 | - | ||
| 58 | - return MLXDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter) | ||
| 59 | - | ||
| 60 | - if base_spec.kind == "custom": | ||
| 61 | - return _load_custom(base_spec, effective_adapter) | ||
| 62 | - | ||
| 63 | - raise SpecValidationError(f"unknown backend kind: {base_spec.kind!r}") | ||
| 64 | - | ||
| 65 | - | ||
| 66 | -def _load_custom(base_spec: ModelSpec, adapter: Path | None) -> DifferentialBackend: | ||
| 67 | - """Dispatch to a user-supplied backend via ``entry_point='pkg.mod:Name'``. | ||
| 68 | - | ||
| 69 | - The imported class is instantiated as ``Cls(base_spec=..., adapter_path=...)`` | ||
| 70 | - — the same signature as :class:`dlm_sway.backends.hf.HuggingFaceDifferentialBackend` | ||
| 71 | - so authors can model their implementation on the built-in. The | ||
| 72 | - result is runtime-checked against :class:`DifferentialBackend` so | ||
| 73 | - protocol violations fail at construction, not deep inside a probe. | ||
| 74 | - """ | ||
| 75 | - from dlm_sway.core.scoring import DifferentialBackend as DiffBackend | ||
| 76 | - | ||
| 77 | - entry = base_spec.entry_point | ||
| 78 | - if not entry: | ||
| 79 | - raise SpecValidationError( | ||
| 80 | - "kind='custom' requires an entry_point of the form 'pkg.module:ClassName'" | ||
| 81 | - ) | ||
| 82 | - if ":" not in entry: | ||
| 83 | - raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}") | ||
| 84 | - module_path, _, class_name = entry.partition(":") | ||
| 85 | - if not module_path or not class_name: | ||
| 86 | - raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}") | ||
| 87 | - | ||
| 88 | - import importlib | ||
| 89 | - | ||
| 90 | - try: | ||
| 91 | - module = importlib.import_module(module_path) | ||
| 92 | - except ImportError as exc: | ||
| 93 | - raise SpecValidationError( | ||
| 94 | - f"custom backend: cannot import module {module_path!r}: {exc}" | ||
| 95 | - ) from exc | ||
| 96 | - cls = getattr(module, class_name, None) | ||
| 97 | - if cls is None: | ||
| 98 | - raise SpecValidationError( | ||
| 99 | - f"custom backend: module {module_path!r} has no attribute {class_name!r}" | ||
| 100 | - ) | ||
| 101 | - | ||
| 102 | - try: | ||
| 103 | - instance = cls(base_spec=base_spec, adapter_path=adapter) | ||
| 104 | - except TypeError as exc: | ||
| 105 | - raise SpecValidationError( | ||
| 106 | - f"custom backend {entry!r} constructor signature mismatch: {exc}. " | ||
| 107 | - "Expected Cls(base_spec: ModelSpec, adapter_path: Path | None)" | ||
| 108 | - ) from exc | ||
| 109 | - | ||
| 110 | - if not isinstance(instance, DiffBackend): | ||
| 111 | - raise SpecValidationError( | ||
| 112 | - f"custom backend {entry!r} does not satisfy DifferentialBackend " | ||
| 113 | - "(needs as_base() and as_finetuned() context managers)" | ||
| 114 | - ) | ||
| 115 | - return instance | ||
| 116 | - | ||
| 117 | - | ||
| 118 | -__all__ = ["build"] | ||
sway/src/dlm_sway/backends/dummy.pydeleted@@ -1,257 +0,0 @@ | |||
| 1 | -"""In-memory backend for unit tests. | ||
| 2 | - | ||
| 3 | -Deterministic, torchless, and trivially fast. Tests pass canned responses | ||
| 4 | -and canned score tables keyed by ``(mode, prompt, completion)``. The same | ||
| 5 | -backend instance serves as both ``as_base`` and ``as_finetuned`` — it | ||
| 6 | -switches an internal mode flag. | ||
| 7 | - | ||
| 8 | -Use it to drive every probe's unit test without loading a real model. | ||
| 9 | -For integration tests against a real PEFT adapter, see | ||
| 10 | -:class:`~dlm_sway.backends.hf.HuggingFaceDifferentialBackend`. | ||
| 11 | -""" | ||
| 12 | - | ||
| 13 | -from __future__ import annotations | ||
| 14 | - | ||
| 15 | -import math | ||
| 16 | -from collections.abc import Iterator | ||
| 17 | -from contextlib import contextmanager | ||
| 18 | -from dataclasses import dataclass, field | ||
| 19 | -from typing import Literal | ||
| 20 | - | ||
| 21 | -import numpy as np | ||
| 22 | - | ||
| 23 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | ||
| 24 | - | ||
| 25 | -Mode = Literal["base", "ft"] | ||
| 26 | - | ||
| 27 | - | ||
| 28 | -@dataclass(slots=True) | ||
| 29 | -class DummyResponses: | ||
| 30 | - """Canned data for one mode (base or ft). | ||
| 31 | - | ||
| 32 | - Callers populate one of these per mode and hand both to | ||
| 33 | - :class:`DummyDifferentialBackend`. | ||
| 34 | - """ | ||
| 35 | - | ||
| 36 | - generations: dict[str, str] = field(default_factory=dict) | ||
| 37 | - """Prompt → canned completion. Lookup is exact-match.""" | ||
| 38 | - logprobs: dict[tuple[str, str], float] = field(default_factory=dict) | ||
| 39 | - """``(prompt, completion) → sum logprob``. Default ``-10.0`` if missing.""" | ||
| 40 | - rolling: dict[str, RollingLogprob] = field(default_factory=dict) | ||
| 41 | - """Text → canned :class:`RollingLogprob`.""" | ||
| 42 | - token_dists: dict[str, TokenDist] = field(default_factory=dict) | ||
| 43 | - """Prompt → canned :class:`TokenDist`.""" | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -class _DummyView: | ||
| 47 | - """The per-mode view yielded by ``as_base`` / ``as_finetuned``. | ||
| 48 | - | ||
| 49 | - Implements :class:`~dlm_sway.core.model.Model` *and* | ||
| 50 | - :class:`~dlm_sway.core.scoring.ScoringBackend` — i.e. the | ||
| 51 | - ``ScoringModel`` intersection. | ||
| 52 | - """ | ||
| 53 | - | ||
| 54 | - def __init__(self, mode: Mode, responses: DummyResponses) -> None: | ||
| 55 | - self.id = mode | ||
| 56 | - self._mode: Mode = mode | ||
| 57 | - self._r = responses | ||
| 58 | - | ||
| 59 | - # -- Model --------------------------------------------------------- | ||
| 60 | - def generate( | ||
| 61 | - self, | ||
| 62 | - prompt: str, | ||
| 63 | - *, | ||
| 64 | - max_new_tokens: int, | ||
| 65 | - temperature: float = 0.0, | ||
| 66 | - top_p: float = 1.0, | ||
| 67 | - seed: int = 0, | ||
| 68 | - ) -> str: | ||
| 69 | - del max_new_tokens, temperature, top_p, seed # canned; decoding is trivial. | ||
| 70 | - try: | ||
| 71 | - return self._r.generations[prompt] | ||
| 72 | - except KeyError as exc: | ||
| 73 | - raise KeyError( | ||
| 74 | - f"dummy backend ({self._mode}): no canned generation for prompt {prompt!r}" | ||
| 75 | - ) from exc | ||
| 76 | - | ||
| 77 | - def close(self) -> None: | ||
| 78 | - return None | ||
| 79 | - | ||
| 80 | - # -- ScoringBackend ------------------------------------------------ | ||
| 81 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 82 | - return self._r.logprobs.get((prompt, completion), -10.0) | ||
| 83 | - | ||
| 84 | - def rolling_logprob(self, text: str) -> RollingLogprob: | ||
| 85 | - if text in self._r.rolling: | ||
| 86 | - return self._r.rolling[text] | ||
| 87 | - # Synthesize a plausible rolling logprob so probes that just | ||
| 88 | - # want a non-trivial value work without per-text configuration. | ||
| 89 | - tokens = text.split() | ||
| 90 | - n = max(len(tokens), 1) | ||
| 91 | - per_tok = -2.0 if self._mode == "base" else -1.5 | ||
| 92 | - return RollingLogprob( | ||
| 93 | - token_ids=np.arange(n, dtype=np.int64), | ||
| 94 | - logprobs=np.full(max(n - 1, 0), per_tok, dtype=np.float32), | ||
| 95 | - num_tokens=n, | ||
| 96 | - total_logprob=per_tok * max(n - 1, 0), | ||
| 97 | - ) | ||
| 98 | - | ||
| 99 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 100 | - del top_k | ||
| 101 | - if prompt in self._r.token_dists: | ||
| 102 | - return self._r.token_dists[prompt] | ||
| 103 | - # Synthesize a sharp base / broad ft distribution so divergence | ||
| 104 | - # probes see a non-zero signal without hand-rolled data. | ||
| 105 | - vocab = 1000 | ||
| 106 | - k = 8 | ||
| 107 | - if self._mode == "base": | ||
| 108 | - lp = np.array([-0.1] + [-5.0] * (k - 1), dtype=np.float32) | ||
| 109 | - else: | ||
| 110 | - # More uniform mass across the top-k tokens. | ||
| 111 | - lp = np.full(k, -math.log(k), dtype=np.float32) | ||
| 112 | - return TokenDist( | ||
| 113 | - token_ids=np.arange(k, dtype=np.int64), | ||
| 114 | - logprobs=lp, | ||
| 115 | - vocab_size=vocab, | ||
| 116 | - tail_logprob=math.log1p(-float(np.exp(lp).sum())) if np.exp(lp).sum() < 1 else 0.0, | ||
| 117 | - ) | ||
| 118 | - | ||
| 119 | - | ||
| 120 | -class _NullView(_DummyView): | ||
| 121 | - """A dummy view that perturbs the base distribution with seeded noise. | ||
| 122 | - | ||
| 123 | - Used by :meth:`DummyDifferentialBackend.as_null_adapter`. The | ||
| 124 | - perturbation is small (matches an ``init_scale=0.02`` adapter) so | ||
| 125 | - the null-vs-base divergence stays well below real-adapter territory | ||
| 126 | - in probe tests. | ||
| 127 | - """ | ||
| 128 | - | ||
| 129 | - def __init__(self, base_responses: DummyResponses, seed: int, init_scale: float) -> None: | ||
| 130 | - super().__init__("base", base_responses) | ||
| 131 | - self._seed = seed | ||
| 132 | - self._init_scale = init_scale | ||
| 133 | - | ||
| 134 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 135 | - base_dist = super().next_token_dist(prompt, top_k=top_k) | ||
| 136 | - rng = np.random.default_rng(self._seed + hash(prompt) % 1_000_003) | ||
| 137 | - noise = rng.normal(0.0, self._init_scale, size=base_dist.logprobs.shape).astype(np.float32) | ||
| 138 | - new_lp = base_dist.logprobs + noise | ||
| 139 | - # Re-normalize (within the top-k slice) so a valid distribution comes back. | ||
| 140 | - max_lp = new_lp.max() | ||
| 141 | - new_probs = np.exp(new_lp - max_lp) | ||
| 142 | - new_probs /= new_probs.sum() | ||
| 143 | - return TokenDist( | ||
| 144 | - token_ids=base_dist.token_ids, | ||
| 145 | - logprobs=np.log(new_probs).astype(np.float32), | ||
| 146 | - vocab_size=base_dist.vocab_size, | ||
| 147 | - tail_logprob=base_dist.tail_logprob, | ||
| 148 | - ) | ||
| 149 | - | ||
| 150 | - | ||
| 151 | -class _InterpolatedView(_DummyView): | ||
| 152 | - """A dummy view where logits/dists are a lam-blend of base and ft. | ||
| 153 | - | ||
| 154 | - Used by :meth:`DummyDifferentialBackend.as_scaled_adapter`. | ||
| 155 | - Generation falls back to the ft view at lam>=0.5, base otherwise — | ||
| 156 | - rounded because the dummy backend's generations are canned strings | ||
| 157 | - with no notion of "how much". | ||
| 158 | - """ | ||
| 159 | - | ||
| 160 | - def __init__( | ||
| 161 | - self, | ||
| 162 | - base_responses: DummyResponses, | ||
| 163 | - ft_responses: DummyResponses, | ||
| 164 | - lam: float, | ||
| 165 | - ) -> None: | ||
| 166 | - super().__init__( | ||
| 167 | - "ft" if lam >= 0.5 else "base", ft_responses if lam >= 0.5 else base_responses | ||
| 168 | - ) | ||
| 169 | - self._base_r = base_responses | ||
| 170 | - self._ft_r = ft_responses | ||
| 171 | - self._lam = lam | ||
| 172 | - | ||
| 173 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 174 | - base_v = self._base_r.logprobs.get((prompt, completion), -10.0) | ||
| 175 | - ft_v = self._ft_r.logprobs.get((prompt, completion), -10.0) | ||
| 176 | - return (1 - self._lam) * base_v + self._lam * ft_v | ||
| 177 | - | ||
| 178 | - def next_token_dist(self, prompt: str, *, top_k: int = 256): # type: ignore[no-untyped-def] | ||
| 179 | - base_dist = _DummyView("base", self._base_r).next_token_dist(prompt, top_k=top_k) | ||
| 180 | - ft_dist = _DummyView("ft", self._ft_r).next_token_dist(prompt, top_k=top_k) | ||
| 181 | - # Both dists are on the same synthetic support when unseeded; blend | ||
| 182 | - # their logprobs via log-space linear interpolation, which is a | ||
| 183 | - # log-linear "tempered" mix and keeps normalization close enough. | ||
| 184 | - lam = self._lam | ||
| 185 | - blended_lp = (1 - lam) * base_dist.logprobs + lam * ft_dist.logprobs | ||
| 186 | - return type(base_dist)( | ||
| 187 | - token_ids=base_dist.token_ids, | ||
| 188 | - logprobs=blended_lp, | ||
| 189 | - vocab_size=base_dist.vocab_size, | ||
| 190 | - tail_logprob=base_dist.tail_logprob, | ||
| 191 | - ) | ||
| 192 | - | ||
| 193 | - | ||
| 194 | -class DummyDifferentialBackend: | ||
| 195 | - """Dummy implementation of | ||
| 196 | - :class:`~dlm_sway.core.scoring.DifferentialBackend`. | ||
| 197 | - | ||
| 198 | - Construction takes one :class:`DummyResponses` per mode. The two | ||
| 199 | - modes are mutually exclusive — the backend enforces that callers | ||
| 200 | - exit one view before entering the other, catching bugs in probes | ||
| 201 | - that hold a stale view across a toggle. | ||
| 202 | - | ||
| 203 | - Also implements | ||
| 204 | - :class:`~dlm_sway.core.scoring.ScalableDifferentialBackend` with a | ||
| 205 | - linear-blend between base and ft responses, so probes that need | ||
| 206 | - ``as_scaled_adapter`` (N2 AdapterAblation) are unit-testable. | ||
| 207 | - """ | ||
| 208 | - | ||
| 209 | - def __init__(self, *, base: DummyResponses, ft: DummyResponses) -> None: | ||
| 210 | - self._base_r = base | ||
| 211 | - self._ft_r = ft | ||
| 212 | - self._base = _DummyView("base", base) | ||
| 213 | - self._ft = _DummyView("ft", ft) | ||
| 214 | - self._active: str | None = None | ||
| 215 | - | ||
| 216 | - @contextmanager | ||
| 217 | - def as_base(self) -> Iterator[_DummyView]: | ||
| 218 | - self._enter("base") | ||
| 219 | - try: | ||
| 220 | - yield self._base | ||
| 221 | - finally: | ||
| 222 | - self._exit() | ||
| 223 | - | ||
| 224 | - @contextmanager | ||
| 225 | - def as_finetuned(self) -> Iterator[_DummyView]: | ||
| 226 | - self._enter("ft") | ||
| 227 | - try: | ||
| 228 | - yield self._ft | ||
| 229 | - finally: | ||
| 230 | - self._exit() | ||
| 231 | - | ||
| 232 | - @contextmanager | ||
| 233 | - def as_scaled_adapter(self, lam: float) -> Iterator[_DummyView]: | ||
| 234 | - self._enter(f"scaled({lam})") | ||
| 235 | - try: | ||
| 236 | - yield _InterpolatedView(self._base_r, self._ft_r, lam) | ||
| 237 | - finally: | ||
| 238 | - self._exit() | ||
| 239 | - | ||
| 240 | - @contextmanager | ||
| 241 | - def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_DummyView]: | ||
| 242 | - self._enter(f"null({seed})") | ||
| 243 | - try: | ||
| 244 | - yield _NullView(self._base_r, seed=seed, init_scale=init_scale) | ||
| 245 | - finally: | ||
| 246 | - self._exit() | ||
| 247 | - | ||
| 248 | - def _enter(self, mode: str) -> None: | ||
| 249 | - if self._active is not None: | ||
| 250 | - raise RuntimeError( | ||
| 251 | - f"DifferentialBackend view already active ({self._active!r}); " | ||
| 252 | - f"exit the current view before entering {mode!r}." | ||
| 253 | - ) | ||
| 254 | - self._active = mode | ||
| 255 | - | ||
| 256 | - def _exit(self) -> None: | ||
| 257 | - self._active = None | ||
sway/src/dlm_sway/backends/hf.pydeleted@@ -1,375 +0,0 @@ | |||
| 1 | -"""HuggingFace + PEFT differential backend. | ||
| 2 | - | ||
| 3 | -Loads the base once, attaches the LoRA adapter once, and toggles between | ||
| 4 | -"base" and "fine-tuned" views on the same module via PEFT's | ||
| 5 | -:meth:`~peft.PeftModel.disable_adapter` / :meth:`~peft.PeftModel.set_adapter`. | ||
| 6 | - | ||
| 7 | -This is the single most important backend in sway. Every numeric probe | ||
| 8 | -benefits from the shared-weights toggle — memory is halved compared to | ||
| 9 | -loading two copies, and KV-cache layouts stay aligned so pairwise KL math | ||
| 10 | -is straight-forward. | ||
| 11 | - | ||
| 12 | -Heavy imports (``torch``, ``transformers``, ``peft``) are deferred until | ||
| 13 | -``HuggingFaceDifferentialBackend`` is actually instantiated so | ||
| 14 | -``import dlm_sway`` stays light for users of the dummy backend or spec | ||
| 15 | -validation. | ||
| 16 | -""" | ||
| 17 | - | ||
| 18 | -from __future__ import annotations | ||
| 19 | - | ||
| 20 | -from collections.abc import Iterator | ||
| 21 | -from contextlib import contextmanager | ||
| 22 | -from dataclasses import dataclass | ||
| 23 | -from pathlib import Path | ||
| 24 | -from typing import TYPE_CHECKING, Any, Literal | ||
| 25 | - | ||
| 26 | -import numpy as np | ||
| 27 | - | ||
| 28 | -from dlm_sway.core.errors import BackendNotAvailableError, ProbeError | ||
| 29 | -from dlm_sway.core.model import ModelSpec | ||
| 30 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | ||
| 31 | - | ||
| 32 | -if TYPE_CHECKING: | ||
| 33 | - from transformers import PreTrainedModel, PreTrainedTokenizerBase | ||
| 34 | - | ||
| 35 | - | ||
| 36 | -Device = Literal["cuda", "mps", "cpu"] | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def _detect_device() -> Device: | ||
| 40 | - try: | ||
| 41 | - import torch | ||
| 42 | - except ImportError as exc: | ||
| 43 | - raise BackendNotAvailableError("hf", extra="hf") from exc | ||
| 44 | - if torch.cuda.is_available(): | ||
| 45 | - return "cuda" | ||
| 46 | - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): | ||
| 47 | - return "mps" | ||
| 48 | - return "cpu" | ||
| 49 | - | ||
| 50 | - | ||
| 51 | -def _resolve_dtype(requested: str, device: Device) -> Any: | ||
| 52 | - """Map the user's ``dtype`` preference to a torch dtype.""" | ||
| 53 | - import torch # noqa: PLC0415 — lazy | ||
| 54 | - | ||
| 55 | - if requested == "fp16": | ||
| 56 | - return torch.float16 | ||
| 57 | - if requested == "bf16": | ||
| 58 | - return torch.bfloat16 | ||
| 59 | - if requested == "fp32": | ||
| 60 | - return torch.float32 | ||
| 61 | - # auto: bf16 on CUDA (Ampere+) / MPS; fp32 on CPU for numerical stability. | ||
| 62 | - if device == "cuda" and torch.cuda.is_bf16_supported(): | ||
| 63 | - return torch.bfloat16 | ||
| 64 | - if device == "mps": | ||
| 65 | - return torch.float16 | ||
| 66 | - return torch.float32 | ||
| 67 | - | ||
| 68 | - | ||
| 69 | -def _require_hf() -> tuple[Any, Any, Any]: | ||
| 70 | - """Import torch + transformers + peft, raising a friendly error if missing.""" | ||
| 71 | - try: | ||
| 72 | - import torch | ||
| 73 | - import transformers | ||
| 74 | - except ImportError as exc: | ||
| 75 | - raise BackendNotAvailableError("hf", extra="hf") from exc | ||
| 76 | - try: | ||
| 77 | - import peft | ||
| 78 | - except ImportError as exc: | ||
| 79 | - raise BackendNotAvailableError( | ||
| 80 | - "hf", extra="hf", hint="peft is required for the adapter toggle." | ||
| 81 | - ) from exc | ||
| 82 | - return torch, transformers, peft | ||
| 83 | - | ||
| 84 | - | ||
| 85 | -# --- the view object ------------------------------------------------------ | ||
| 86 | - | ||
| 87 | - | ||
| 88 | -@dataclass(slots=True) | ||
| 89 | -class _HFView: | ||
| 90 | - """One side (base or ft) of a :class:`HuggingFaceDifferentialBackend`. | ||
| 91 | - | ||
| 92 | - Both sides reuse the same underlying module; the difference is | ||
| 93 | - whether the adapter is active. | ||
| 94 | - """ | ||
| 95 | - | ||
| 96 | - id: str | ||
| 97 | - _model: Any | ||
| 98 | - _tokenizer: Any | ||
| 99 | - _device: str | ||
| 100 | - _pad_token_id: int | ||
| 101 | - | ||
| 102 | - # -- Model --------------------------------------------------------- | ||
| 103 | - def generate( | ||
| 104 | - self, | ||
| 105 | - prompt: str, | ||
| 106 | - *, | ||
| 107 | - max_new_tokens: int, | ||
| 108 | - temperature: float = 0.0, | ||
| 109 | - top_p: float = 1.0, | ||
| 110 | - seed: int = 0, | ||
| 111 | - ) -> str: | ||
| 112 | - import torch | ||
| 113 | - | ||
| 114 | - torch.manual_seed(seed) | ||
| 115 | - inputs = self._tokenizer(prompt, return_tensors="pt").to(self._device) | ||
| 116 | - do_sample = temperature > 0.0 | ||
| 117 | - gen_kwargs: dict[str, Any] = { | ||
| 118 | - "max_new_tokens": max_new_tokens, | ||
| 119 | - "do_sample": do_sample, | ||
| 120 | - "pad_token_id": self._pad_token_id, | ||
| 121 | - } | ||
| 122 | - if do_sample: | ||
| 123 | - gen_kwargs["temperature"] = temperature | ||
| 124 | - gen_kwargs["top_p"] = top_p | ||
| 125 | - with torch.inference_mode(): | ||
| 126 | - out_ids = self._model.generate(**inputs, **gen_kwargs) | ||
| 127 | - new_tokens = out_ids[0, inputs["input_ids"].shape[1] :] | ||
| 128 | - return str(self._tokenizer.decode(new_tokens, skip_special_tokens=True)) | ||
| 129 | - | ||
| 130 | - def close(self) -> None: | ||
| 131 | - return None | ||
| 132 | - | ||
| 133 | - # -- ScoringBackend ------------------------------------------------ | ||
| 134 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 135 | - import torch | ||
| 136 | - import torch.nn.functional as F | ||
| 137 | - | ||
| 138 | - prompt_ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device) | ||
| 139 | - full_ids = self._tokenizer(prompt + completion, return_tensors="pt").input_ids.to( | ||
| 140 | - self._device | ||
| 141 | - ) | ||
| 142 | - if full_ids.shape[1] <= prompt_ids.shape[1]: | ||
| 143 | - raise ProbeError( | ||
| 144 | - "logprob_of", | ||
| 145 | - f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})", | ||
| 146 | - ) | ||
| 147 | - target_ids = full_ids[:, prompt_ids.shape[1] :] | ||
| 148 | - with torch.inference_mode(): | ||
| 149 | - logits = self._model(full_ids).logits # (1, T, V) | ||
| 150 | - # Align: logit at position t predicts token at t+1. We want | ||
| 151 | - # predictions for the completion slice. | ||
| 152 | - shift_logits = logits[:, prompt_ids.shape[1] - 1 : -1, :] # (1, C, V) | ||
| 153 | - log_probs = F.log_softmax(shift_logits.float(), dim=-1) | ||
| 154 | - gathered = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1) | ||
| 155 | - return float(gathered.sum().item()) | ||
| 156 | - | ||
| 157 | - def rolling_logprob(self, text: str) -> RollingLogprob: | ||
| 158 | - import torch | ||
| 159 | - import torch.nn.functional as F | ||
| 160 | - | ||
| 161 | - ids = self._tokenizer(text, return_tensors="pt").input_ids.to(self._device) | ||
| 162 | - if ids.shape[1] < 2: | ||
| 163 | - return RollingLogprob( | ||
| 164 | - token_ids=ids[0].cpu().numpy().astype(np.int64), | ||
| 165 | - logprobs=np.array([], dtype=np.float32), | ||
| 166 | - num_tokens=int(ids.shape[1]), | ||
| 167 | - total_logprob=0.0, | ||
| 168 | - ) | ||
| 169 | - with torch.inference_mode(): | ||
| 170 | - logits = self._model(ids).logits # (1, T, V) | ||
| 171 | - log_probs = F.log_softmax(logits[:, :-1].float(), dim=-1) # predicts tokens 1..T | ||
| 172 | - gathered = log_probs.gather(-1, ids[:, 1:].unsqueeze(-1)).squeeze(-1).squeeze(0) | ||
| 173 | - return RollingLogprob( | ||
| 174 | - token_ids=ids[0].cpu().numpy().astype(np.int64), | ||
| 175 | - logprobs=gathered.cpu().numpy().astype(np.float32), | ||
| 176 | - num_tokens=int(ids.shape[1]), | ||
| 177 | - total_logprob=float(gathered.sum().item()), | ||
| 178 | - ) | ||
| 179 | - | ||
| 180 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 181 | - import torch | ||
| 182 | - import torch.nn.functional as F | ||
| 183 | - | ||
| 184 | - ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device) | ||
| 185 | - with torch.inference_mode(): | ||
| 186 | - logits = self._model(ids).logits[:, -1, :] # (1, V) | ||
| 187 | - log_probs = F.log_softmax(logits.float(), dim=-1).squeeze(0) | ||
| 188 | - k = min(top_k, int(log_probs.shape[0])) | ||
| 189 | - top = torch.topk(log_probs, k=k) | ||
| 190 | - tail_mass = float(1.0 - torch.exp(top.values).sum().item()) | ||
| 191 | - tail_logprob = float(np.log(max(tail_mass, 1e-12))) if tail_mass > 1e-12 else 0.0 | ||
| 192 | - return TokenDist( | ||
| 193 | - token_ids=top.indices.cpu().numpy().astype(np.int64), | ||
| 194 | - logprobs=top.values.cpu().numpy().astype(np.float32), | ||
| 195 | - vocab_size=int(log_probs.shape[0]), | ||
| 196 | - tail_logprob=tail_logprob, | ||
| 197 | - ) | ||
| 198 | - | ||
| 199 | - | ||
| 200 | -# --- the backend ----------------------------------------------------------- | ||
| 201 | - | ||
| 202 | - | ||
| 203 | -class HuggingFaceDifferentialBackend: | ||
| 204 | - """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for HF+PEFT. | ||
| 205 | - | ||
| 206 | - The adapter toggle relies on | ||
| 207 | - :meth:`peft.PeftModel.disable_adapter` producing a context where the | ||
| 208 | - forward pass skips the LoRA deltas, and | ||
| 209 | - :meth:`peft.PeftModel.set_adapter` (or just exiting the disable | ||
| 210 | - context) re-enabling them. A dedicated sanity test asserts that | ||
| 211 | - these actually change logits on a fixture. | ||
| 212 | - """ | ||
| 213 | - | ||
| 214 | - def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None: | ||
| 215 | - torch, transformers, peft = _require_hf() | ||
| 216 | - self._torch = torch | ||
| 217 | - self._spec = base_spec | ||
| 218 | - self._adapter_path = Path(adapter_path).expanduser().resolve() | ||
| 219 | - | ||
| 220 | - device_str: Device = ( | ||
| 221 | - _detect_device() if base_spec.device == "auto" else base_spec.device # type: ignore[assignment] | ||
| 222 | - ) | ||
| 223 | - self._device: str = device_str | ||
| 224 | - dtype = _resolve_dtype(base_spec.dtype, device_str) | ||
| 225 | - | ||
| 226 | - tokenizer = transformers.AutoTokenizer.from_pretrained( | ||
| 227 | - str(self._adapter_path) | ||
| 228 | - if (self._adapter_path / "tokenizer_config.json").exists() | ||
| 229 | - else base_spec.base, | ||
| 230 | - trust_remote_code=base_spec.trust_remote_code, | ||
| 231 | - ) | ||
| 232 | - if tokenizer.pad_token_id is None: | ||
| 233 | - tokenizer.pad_token = tokenizer.eos_token | ||
| 234 | - | ||
| 235 | - base_model = transformers.AutoModelForCausalLM.from_pretrained( | ||
| 236 | - base_spec.base, | ||
| 237 | - torch_dtype=dtype, | ||
| 238 | - trust_remote_code=base_spec.trust_remote_code, | ||
| 239 | - ) | ||
| 240 | - base_model.to(self._device) | ||
| 241 | - peft_model = peft.PeftModel.from_pretrained( | ||
| 242 | - base_model, | ||
| 243 | - str(self._adapter_path), | ||
| 244 | - is_trainable=False, | ||
| 245 | - ) | ||
| 246 | - peft_model.eval() | ||
| 247 | - | ||
| 248 | - self._tokenizer: PreTrainedTokenizerBase = tokenizer | ||
| 249 | - self._peft_model: PreTrainedModel = peft_model | ||
| 250 | - self._pad_token_id: int = int(tokenizer.pad_token_id) | ||
| 251 | - self._active: str | None = None | ||
| 252 | - | ||
| 253 | - # -- DifferentialBackend ------------------------------------------- | ||
| 254 | - | ||
| 255 | - @contextmanager | ||
| 256 | - def as_base(self) -> Iterator[_HFView]: | ||
| 257 | - self._enter("base") | ||
| 258 | - try: | ||
| 259 | - # peft.PeftModel.disable_adapter is a context manager; mypy | ||
| 260 | - # mis-reads it as a Tensor on this transformers version. | ||
| 261 | - with self._peft_model.disable_adapter(): # type: ignore[operator] | ||
| 262 | - yield self._make_view("base") | ||
| 263 | - finally: | ||
| 264 | - self._exit() | ||
| 265 | - | ||
| 266 | - @contextmanager | ||
| 267 | - def as_finetuned(self) -> Iterator[_HFView]: | ||
| 268 | - self._enter("ft") | ||
| 269 | - try: | ||
| 270 | - yield self._make_view("ft") | ||
| 271 | - finally: | ||
| 272 | - self._exit() | ||
| 273 | - | ||
| 274 | - @contextmanager | ||
| 275 | - def as_scaled_adapter(self, lam: float) -> Iterator[_HFView]: | ||
| 276 | - """Temporarily multiply every LoRA layer's scaling factor by ``lam``. | ||
| 277 | - | ||
| 278 | - Works by walking the PEFT module tree and mutating each | ||
| 279 | - ``LoraLayer.scaling[adapter_name]`` in place. The original | ||
| 280 | - scalings are restored when the context exits — or when an | ||
| 281 | - exception propagates, to keep the model in a sane state. | ||
| 282 | - """ | ||
| 283 | - self._enter(f"scaled({lam})") | ||
| 284 | - # ``module`` is dynamic (peft LoraLayer subclass) — Any avoids | ||
| 285 | - # mypy treating its ``.scaling`` as a Tensor when peft is loaded. | ||
| 286 | - saved: list[tuple[Any, str, float]] = [] | ||
| 287 | - try: | ||
| 288 | - import peft # noqa: PLC0415 — already a hard dep of this backend | ||
| 289 | - | ||
| 290 | - lora_cls = getattr(peft.tuners.lora, "LoraLayer", None) | ||
| 291 | - if lora_cls is None: | ||
| 292 | - raise RuntimeError("peft.tuners.lora.LoraLayer not found; check peft>=0.13 pin") | ||
| 293 | - for module in self._peft_model.modules(): | ||
| 294 | - if not isinstance(module, lora_cls): | ||
| 295 | - continue | ||
| 296 | - scaling = getattr(module, "scaling", None) | ||
| 297 | - if not isinstance(scaling, dict): | ||
| 298 | - continue | ||
| 299 | - for key, original in scaling.items(): | ||
| 300 | - saved.append((module, key, float(original))) | ||
| 301 | - scaling[key] = float(original) * lam | ||
| 302 | - yield self._make_view(f"scaled_{lam:.2f}") | ||
| 303 | - finally: | ||
| 304 | - for module, key, original in saved: | ||
| 305 | - module.scaling[key] = original | ||
| 306 | - self._exit() | ||
| 307 | - | ||
| 308 | - @contextmanager | ||
| 309 | - def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_HFView]: | ||
| 310 | - """Temporarily replace every LoRA ``A``/``B`` tensor with random noise. | ||
| 311 | - | ||
| 312 | - Same rank, alpha, and target modules as the real adapter — only | ||
| 313 | - the weights differ. This is the denominator in every z-score | ||
| 314 | - path: "how much signal does structural noise produce?" | ||
| 315 | - | ||
| 316 | - Implementation walks the PEFT module tree for ``lora_A``/``lora_B`` | ||
| 317 | - parameters, saves a clone of each current value, overwrites in | ||
| 318 | - place with a zero-mean Gaussian at ``init_scale``, and restores | ||
| 319 | - on exit (including on exception). | ||
| 320 | - """ | ||
| 321 | - import torch | ||
| 322 | - | ||
| 323 | - self._enter(f"null({seed})") | ||
| 324 | - gen = torch.Generator(device="cpu").manual_seed(int(seed)) | ||
| 325 | - saved: list[tuple[torch.nn.Parameter, torch.Tensor]] = [] | ||
| 326 | - try: | ||
| 327 | - for pname, param in self._peft_model.named_parameters(): | ||
| 328 | - if not any(key in pname for key in ("lora_A", "lora_B")): | ||
| 329 | - continue | ||
| 330 | - saved.append((param, param.detach().clone())) | ||
| 331 | - with torch.no_grad(): | ||
| 332 | - noise = torch.randn( | ||
| 333 | - *param.shape, | ||
| 334 | - generator=gen, | ||
| 335 | - dtype=torch.float32, | ||
| 336 | - ).to(dtype=param.dtype, device=param.device) | ||
| 337 | - param.copy_(noise * init_scale) | ||
| 338 | - yield self._make_view(f"null_{seed}") | ||
| 339 | - finally: | ||
| 340 | - with torch.no_grad(): | ||
| 341 | - for param, original in saved: | ||
| 342 | - param.copy_(original) | ||
| 343 | - self._exit() | ||
| 344 | - | ||
| 345 | - def close(self) -> None: | ||
| 346 | - """Release GPU memory. Safe to call more than once.""" | ||
| 347 | - if getattr(self, "_peft_model", None) is not None: | ||
| 348 | - del self._peft_model | ||
| 349 | - if self._torch.cuda.is_available(): | ||
| 350 | - self._torch.cuda.empty_cache() | ||
| 351 | - | ||
| 352 | - # -- internals ----------------------------------------------------- | ||
| 353 | - | ||
| 354 | - def _make_view(self, mode: str) -> _HFView: | ||
| 355 | - return _HFView( | ||
| 356 | - id=mode, | ||
| 357 | - _model=self._peft_model, | ||
| 358 | - _tokenizer=self._tokenizer, | ||
| 359 | - _device=self._device, | ||
| 360 | - _pad_token_id=self._pad_token_id, | ||
| 361 | - ) | ||
| 362 | - | ||
| 363 | - def _enter(self, mode: str) -> None: | ||
| 364 | - if self._active is not None: | ||
| 365 | - raise RuntimeError( | ||
| 366 | - f"HuggingFaceDifferentialBackend view {self._active!r} already active; " | ||
| 367 | - f"exit it before entering {mode!r}." | ||
| 368 | - ) | ||
| 369 | - self._active = mode | ||
| 370 | - | ||
| 371 | - def _exit(self) -> None: | ||
| 372 | - self._active = None | ||
| 373 | - | ||
| 374 | - | ||
| 375 | -__all__ = ["HuggingFaceDifferentialBackend"] | ||
sway/src/dlm_sway/backends/mlx.pydeleted@@ -1,205 +0,0 @@ | |||
| 1 | -"""MLX backend for Apple Silicon (darwin-arm64). | ||
| 2 | - | ||
| 3 | -Partial implementation covering the common case: a PEFT adapter that's | ||
| 4 | -already been converted to MLX's ``.npz`` format. Unlike the HF backend, | ||
| 5 | -MLX has no runtime ``disable_adapter`` context — adapters get fused into | ||
| 6 | -the linear layers at load time — so this backend keeps **both** a base | ||
| 7 | -model and an adapted model in memory. Fine for the small (<3B) models | ||
| 8 | -MLX is typically used with on Apple Silicon; document the cost clearly. | ||
| 9 | - | ||
| 10 | -If users point this backend at raw PEFT safetensors, ``mlx_lm.load`` | ||
| 11 | -will refuse them with its own error. A future milestone can wire a | ||
| 12 | -PEFT-→-MLX converter; for now the contract is "bring your own .npz". | ||
| 13 | -""" | ||
| 14 | - | ||
| 15 | -from __future__ import annotations | ||
| 16 | - | ||
| 17 | -from collections.abc import Iterator | ||
| 18 | -from contextlib import contextmanager | ||
| 19 | -from dataclasses import dataclass | ||
| 20 | -from pathlib import Path | ||
| 21 | -from typing import TYPE_CHECKING, Any | ||
| 22 | - | ||
| 23 | -import numpy as np | ||
| 24 | - | ||
| 25 | -from dlm_sway.core.errors import BackendNotAvailableError, ProbeError | ||
| 26 | -from dlm_sway.core.model import ModelSpec | ||
| 27 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | ||
| 28 | - | ||
| 29 | -if TYPE_CHECKING: | ||
| 30 | - pass | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -def _require_mlx() -> tuple[Any, Any]: | ||
| 34 | - try: | ||
| 35 | - import mlx.core as mx | ||
| 36 | - import mlx_lm | ||
| 37 | - except ImportError as exc: | ||
| 38 | - raise BackendNotAvailableError( | ||
| 39 | - "mlx", | ||
| 40 | - extra="mlx", | ||
| 41 | - hint="MLX backend needs mlx + mlx-lm on darwin-arm64.", | ||
| 42 | - ) from exc | ||
| 43 | - return mx, mlx_lm | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -@dataclass(slots=True) | ||
| 47 | -class _MLXView: | ||
| 48 | - """One side (base or ft) of the MLX backend. | ||
| 49 | - | ||
| 50 | - Both sides carry the same tokenizer (MLX stores it alongside the | ||
| 51 | - converted model files, so sharing avoids double-loading). | ||
| 52 | - """ | ||
| 53 | - | ||
| 54 | - id: str | ||
| 55 | - _model: Any | ||
| 56 | - _tokenizer: Any | ||
| 57 | - | ||
| 58 | - def generate( | ||
| 59 | - self, | ||
| 60 | - prompt: str, | ||
| 61 | - *, | ||
| 62 | - max_new_tokens: int, | ||
| 63 | - temperature: float = 0.0, | ||
| 64 | - top_p: float = 1.0, | ||
| 65 | - seed: int = 0, | ||
| 66 | - ) -> str: | ||
| 67 | - del seed # mlx_lm.generate seeds via its own global state | ||
| 68 | - _, mlx_lm = _require_mlx() | ||
| 69 | - kwargs: dict[str, Any] = {"max_tokens": max_new_tokens, "verbose": False} | ||
| 70 | - if temperature > 0.0: | ||
| 71 | - kwargs["temp"] = temperature | ||
| 72 | - kwargs["top_p"] = top_p | ||
| 73 | - out = mlx_lm.generate(self._model, self._tokenizer, prompt=prompt, **kwargs) | ||
| 74 | - return str(out) | ||
| 75 | - | ||
| 76 | - def close(self) -> None: | ||
| 77 | - return None | ||
| 78 | - | ||
| 79 | - # -- ScoringBackend ------------------------------------------------ | ||
| 80 | - | ||
| 81 | - def _forward_logits(self, prompt: str) -> np.ndarray: | ||
| 82 | - """Run the model once and return ``(seq_len, vocab)`` logits.""" | ||
| 83 | - mx, _ = _require_mlx() | ||
| 84 | - input_ids = self._tokenizer.encode(prompt) | ||
| 85 | - tokens = mx.array(input_ids)[None, :] # (1, T) | ||
| 86 | - out = self._model(tokens) | ||
| 87 | - # mlx_lm models return an mx.array; convert to numpy for downstream math. | ||
| 88 | - return np.asarray(out[0]) | ||
| 89 | - | ||
| 90 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 91 | - input_ids = self._tokenizer.encode(prompt) | ||
| 92 | - full_ids = self._tokenizer.encode(prompt + completion) | ||
| 93 | - if len(full_ids) <= len(input_ids): | ||
| 94 | - raise ProbeError( | ||
| 95 | - "logprob_of", | ||
| 96 | - f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})", | ||
| 97 | - ) | ||
| 98 | - logits = self._forward_logits(prompt + completion) # (T, V) | ||
| 99 | - # Position t predicts token t+1 — slice off the last row and the prompt span. | ||
| 100 | - shift = logits[len(input_ids) - 1 : -1, :] | ||
| 101 | - target_ids = np.asarray(full_ids[len(input_ids) :], dtype=np.int64) | ||
| 102 | - log_probs = _log_softmax(shift.astype(np.float64), axis=-1) | ||
| 103 | - gathered = log_probs[np.arange(len(target_ids)), target_ids] | ||
| 104 | - return float(gathered.sum()) | ||
| 105 | - | ||
| 106 | - def rolling_logprob(self, text: str) -> RollingLogprob: | ||
| 107 | - ids = self._tokenizer.encode(text) | ||
| 108 | - if len(ids) < 2: | ||
| 109 | - return RollingLogprob( | ||
| 110 | - token_ids=np.asarray(ids, dtype=np.int64), | ||
| 111 | - logprobs=np.array([], dtype=np.float32), | ||
| 112 | - num_tokens=len(ids), | ||
| 113 | - total_logprob=0.0, | ||
| 114 | - ) | ||
| 115 | - logits = self._forward_logits(text) | ||
| 116 | - log_probs = _log_softmax(logits[:-1].astype(np.float64), axis=-1) | ||
| 117 | - ids_arr = np.asarray(ids, dtype=np.int64) | ||
| 118 | - gathered = log_probs[np.arange(len(ids) - 1), ids_arr[1:]] | ||
| 119 | - return RollingLogprob( | ||
| 120 | - token_ids=ids_arr, | ||
| 121 | - logprobs=gathered.astype(np.float32), | ||
| 122 | - num_tokens=len(ids), | ||
| 123 | - total_logprob=float(gathered.sum()), | ||
| 124 | - ) | ||
| 125 | - | ||
| 126 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 127 | - logits = self._forward_logits(prompt) | ||
| 128 | - last_logits = logits[-1].astype(np.float64) | ||
| 129 | - log_probs = _log_softmax(last_logits, axis=-1) | ||
| 130 | - k = min(top_k, log_probs.shape[0]) | ||
| 131 | - # np.argpartition for top-k then sort the partition. | ||
| 132 | - part = np.argpartition(log_probs, -k)[-k:] | ||
| 133 | - top_ids = part[np.argsort(log_probs[part])[::-1]] | ||
| 134 | - top_lp = log_probs[top_ids] | ||
| 135 | - tail_mass = float(1.0 - np.exp(top_lp).sum()) | ||
| 136 | - tail_logprob = float(np.log(max(tail_mass, 1e-12))) if tail_mass > 1e-12 else 0.0 | ||
| 137 | - return TokenDist( | ||
| 138 | - token_ids=top_ids.astype(np.int64), | ||
| 139 | - logprobs=top_lp.astype(np.float32), | ||
| 140 | - vocab_size=int(log_probs.shape[0]), | ||
| 141 | - tail_logprob=tail_logprob, | ||
| 142 | - ) | ||
| 143 | - | ||
| 144 | - | ||
| 145 | -class MLXDifferentialBackend: | ||
| 146 | - """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for MLX models. | ||
| 147 | - | ||
| 148 | - Loads two copies of the same base model — one bare, one with the | ||
| 149 | - adapter fused — because MLX has no runtime toggle. Memory cost: 2× | ||
| 150 | - base weights. On typical Apple Silicon workloads with ≤3B models | ||
| 151 | - this is acceptable. | ||
| 152 | - """ | ||
| 153 | - | ||
| 154 | - def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None: | ||
| 155 | - mx, mlx_lm = _require_mlx() | ||
| 156 | - self._mx = mx | ||
| 157 | - self._spec = base_spec | ||
| 158 | - self._adapter_path = Path(adapter_path).expanduser().resolve() | ||
| 159 | - | ||
| 160 | - # Load bare base (no adapter). | ||
| 161 | - self._base_model, self._tokenizer = mlx_lm.load(base_spec.base) | ||
| 162 | - # Load ft with adapter attached. ``adapter_path`` is mlx_lm's kwarg. | ||
| 163 | - self._ft_model, _ = mlx_lm.load(base_spec.base, adapter_path=str(self._adapter_path)) | ||
| 164 | - self._active: str | None = None | ||
| 165 | - | ||
| 166 | - @contextmanager | ||
| 167 | - def as_base(self) -> Iterator[_MLXView]: | ||
| 168 | - self._enter("base") | ||
| 169 | - try: | ||
| 170 | - yield _MLXView(id="base", _model=self._base_model, _tokenizer=self._tokenizer) | ||
| 171 | - finally: | ||
| 172 | - self._exit() | ||
| 173 | - | ||
| 174 | - @contextmanager | ||
| 175 | - def as_finetuned(self) -> Iterator[_MLXView]: | ||
| 176 | - self._enter("ft") | ||
| 177 | - try: | ||
| 178 | - yield _MLXView(id="ft", _model=self._ft_model, _tokenizer=self._tokenizer) | ||
| 179 | - finally: | ||
| 180 | - self._exit() | ||
| 181 | - | ||
| 182 | - def close(self) -> None: | ||
| 183 | - """MLX reclaims memory when references drop; nothing to do here.""" | ||
| 184 | - return | ||
| 185 | - | ||
| 186 | - def _enter(self, mode: str) -> None: | ||
| 187 | - if self._active is not None: | ||
| 188 | - raise RuntimeError( | ||
| 189 | - f"MLXDifferentialBackend view {self._active!r} already active; " | ||
| 190 | - f"exit it before entering {mode!r}." | ||
| 191 | - ) | ||
| 192 | - self._active = mode | ||
| 193 | - | ||
| 194 | - def _exit(self) -> None: | ||
| 195 | - self._active = None | ||
| 196 | - | ||
| 197 | - | ||
| 198 | -def _log_softmax(x: np.ndarray, *, axis: int) -> np.ndarray: | ||
| 199 | - x_max = np.max(x, axis=axis, keepdims=True) | ||
| 200 | - y = x - x_max | ||
| 201 | - log_sum = np.log(np.sum(np.exp(y), axis=axis, keepdims=True)) | ||
| 202 | - return np.asarray(y - log_sum, dtype=np.float64) | ||
| 203 | - | ||
| 204 | - | ||
| 205 | -__all__ = ["MLXDifferentialBackend"] | ||
sway/src/dlm_sway/cli/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""Command-line interface (entry point: ``dlm-sway``).""" | ||
sway/src/dlm_sway/cli/app.pydeleted@@ -1,59 +0,0 @@ | |||
| 1 | -"""dlm-sway CLI entry point. | ||
| 2 | - | ||
| 3 | -``pip install dlm-sway`` installs this module's :func:`main` as the | ||
| 4 | -``dlm-sway`` console script. Every subcommand is a thin wrapper around a | ||
| 5 | -library-level function so the CLI surface mirrors what programmatic | ||
| 6 | -callers get. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import typer | ||
| 12 | - | ||
| 13 | -from dlm_sway import __version__ | ||
| 14 | -from dlm_sway.cli import commands | ||
| 15 | - | ||
| 16 | -app = typer.Typer( | ||
| 17 | - name="dlm-sway", | ||
| 18 | - no_args_is_help=True, | ||
| 19 | - add_completion=False, | ||
| 20 | - help="Differential testing for fine-tuned causal language models.", | ||
| 21 | -) | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def _version_callback(value: bool) -> None: | ||
| 25 | - if value: | ||
| 26 | - typer.echo(f"dlm-sway {__version__}") | ||
| 27 | - raise typer.Exit() | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -@app.callback() | ||
| 31 | -def _root( | ||
| 32 | - version: bool = typer.Option( # noqa: B008 — typer pattern | ||
| 33 | - False, | ||
| 34 | - "--version", | ||
| 35 | - callback=_version_callback, | ||
| 36 | - is_eager=True, | ||
| 37 | - help="Print version and exit.", | ||
| 38 | - ), | ||
| 39 | -) -> None: | ||
| 40 | - """Root callback; accepts ``--version``.""" | ||
| 41 | - del version | ||
| 42 | - | ||
| 43 | - | ||
| 44 | -app.command("run")(commands.run_cmd) | ||
| 45 | -app.command("gate")(commands.gate_cmd) | ||
| 46 | -app.command("check")(commands.check_cmd) | ||
| 47 | -app.command("diff")(commands.diff_cmd) | ||
| 48 | -app.command("autogen")(commands.autogen_cmd) | ||
| 49 | -app.command("doctor")(commands.doctor_cmd) | ||
| 50 | -app.command("report")(commands.report_cmd) | ||
| 51 | - | ||
| 52 | - | ||
| 53 | -def main() -> None: | ||
| 54 | - """Script entry point registered in :file:`pyproject.toml`.""" | ||
| 55 | - app() | ||
| 56 | - | ||
| 57 | - | ||
| 58 | -if __name__ == "__main__": | ||
| 59 | - main() | ||
sway/src/dlm_sway/cli/commands.pydeleted@@ -1,396 +0,0 @@ | |||
| 1 | -"""Command implementations for the ``dlm-sway`` CLI. | ||
| 2 | - | ||
| 3 | -Each function here is wired to a subcommand in :mod:`dlm_sway.cli.app`. | ||
| 4 | -Commands deliberately do as little as possible themselves — the real | ||
| 5 | -work lives in :mod:`dlm_sway.suite`, :mod:`dlm_sway.backends`, and the | ||
| 6 | -probes package. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import json | ||
| 12 | -import sys | ||
| 13 | -from pathlib import Path | ||
| 14 | -from typing import Annotated, Any | ||
| 15 | - | ||
| 16 | -import typer | ||
| 17 | -from rich.console import Console | ||
| 18 | - | ||
| 19 | -from dlm_sway import __version__ | ||
| 20 | -from dlm_sway.core.errors import SwayError | ||
| 21 | -from dlm_sway.core.result import SuiteResult, SwayScore, Verdict | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def run_cmd( | ||
| 25 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | ||
| 26 | - json_out: Annotated[ | ||
| 27 | - Path | None, | ||
| 28 | - typer.Option( | ||
| 29 | - "--json", | ||
| 30 | - "-j", | ||
| 31 | - help="Write the JSON report to this path in addition to the terminal render.", | ||
| 32 | - ), | ||
| 33 | - ] = None, | ||
| 34 | - markdown_out: Annotated[ | ||
| 35 | - Path | None, | ||
| 36 | - typer.Option("--markdown", "-m", help="Write a markdown report to this path."), | ||
| 37 | - ] = None, | ||
| 38 | -) -> None: | ||
| 39 | - """Execute a suite and render a terminal report.""" | ||
| 40 | - try: | ||
| 41 | - result, score_obj = _execute_spec(spec) | ||
| 42 | - except SwayError as exc: | ||
| 43 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 44 | - raise typer.Exit(code=2) from exc | ||
| 45 | - | ||
| 46 | - from dlm_sway.suite import report | ||
| 47 | - | ||
| 48 | - console = Console() | ||
| 49 | - report.to_terminal(result, score_obj, console=console) | ||
| 50 | - | ||
| 51 | - if json_out is not None: | ||
| 52 | - json_out.write_text(report.to_json(result, score_obj), encoding="utf-8") | ||
| 53 | - console.print(f"\n[dim]wrote JSON → {json_out}[/dim]") | ||
| 54 | - if markdown_out is not None: | ||
| 55 | - markdown_out.write_text(report.to_markdown(result, score_obj), encoding="utf-8") | ||
| 56 | - console.print(f"[dim]wrote markdown → {markdown_out}[/dim]") | ||
| 57 | - | ||
| 58 | - | ||
| 59 | -def gate_cmd( | ||
| 60 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | ||
| 61 | - junit_out: Annotated[ | ||
| 62 | - Path | None, typer.Option("--junit", help="Write JUnit XML for CI ingestion.") | ||
| 63 | - ] = None, | ||
| 64 | - coverage_threshold: Annotated[ | ||
| 65 | - float | None, | ||
| 66 | - typer.Option( | ||
| 67 | - "--threshold", | ||
| 68 | - help="Override the spec's coverage_threshold. Exit non-zero below it.", | ||
| 69 | - ), | ||
| 70 | - ] = None, | ||
| 71 | -) -> None: | ||
| 72 | - """Execute a suite and exit non-zero on failure (CI gate).""" | ||
| 73 | - try: | ||
| 74 | - result, score_obj = _execute_spec(spec) | ||
| 75 | - except SwayError as exc: | ||
| 76 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 77 | - raise typer.Exit(code=2) from exc | ||
| 78 | - | ||
| 79 | - from dlm_sway.suite import report | ||
| 80 | - from dlm_sway.suite.loader import load_spec as _load_spec | ||
| 81 | - | ||
| 82 | - console = Console() | ||
| 83 | - report.to_terminal(result, score_obj, console=console) | ||
| 84 | - | ||
| 85 | - if junit_out is not None: | ||
| 86 | - junit_out.write_text(report.to_junit(result, score_obj), encoding="utf-8") | ||
| 87 | - console.print(f"[dim]wrote JUnit → {junit_out}[/dim]") | ||
| 88 | - | ||
| 89 | - threshold = ( | ||
| 90 | - coverage_threshold | ||
| 91 | - if coverage_threshold is not None | ||
| 92 | - else _load_spec(spec).defaults.coverage_threshold | ||
| 93 | - ) | ||
| 94 | - has_failures = any(p.verdict == Verdict.FAIL for p in result.probes) | ||
| 95 | - below_threshold = score_obj.overall < threshold | ||
| 96 | - if has_failures or below_threshold: | ||
| 97 | - console.print( | ||
| 98 | - f"\n[red]gate FAILED[/red] — overall={score_obj.overall:.2f} < {threshold:.2f}" | ||
| 99 | - if below_threshold | ||
| 100 | - else "\n[red]gate FAILED[/red] — at least one probe reported FAIL" | ||
| 101 | - ) | ||
| 102 | - raise typer.Exit(code=1) | ||
| 103 | - console.print(f"\n[green]gate passed[/green] — overall={score_obj.overall:.2f}") | ||
| 104 | - | ||
| 105 | - | ||
| 106 | -def check_cmd( | ||
| 107 | - adapter: Annotated[Path, typer.Argument(help="Path to a PEFT adapter directory.")], | ||
| 108 | - base: Annotated[str, typer.Option("--base", help="HuggingFace base model id or local path.")], | ||
| 109 | - prompts: Annotated[ | ||
| 110 | - Path | None, | ||
| 111 | - typer.Option( | ||
| 112 | - "--prompts", | ||
| 113 | - help="File with one prompt per line. Defaults to sway's built-in quick set.", | ||
| 114 | - ), | ||
| 115 | - ] = None, | ||
| 116 | -) -> None: | ||
| 117 | - """<60s smoke test: "is this adapter doing anything at all?". | ||
| 118 | - | ||
| 119 | - Runs A1 DeltaKL + C2 CalibrationDrift on a small prompt set. No | ||
| 120 | - spec file required. | ||
| 121 | - """ | ||
| 122 | - from dlm_sway.backends import build as build_backend | ||
| 123 | - from dlm_sway.core.model import ModelSpec | ||
| 124 | - from dlm_sway.suite import report | ||
| 125 | - from dlm_sway.suite.runner import run as run_suite | ||
| 126 | - from dlm_sway.suite.score import compute as compute_score | ||
| 127 | - from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec | ||
| 128 | - | ||
| 129 | - quick_prompts = _load_prompts(prompts) if prompts else _BUILTIN_QUICK_PROMPTS | ||
| 130 | - | ||
| 131 | - base_spec = ModelSpec(base=base, kind="hf") | ||
| 132 | - ft_spec = ModelSpec(base=base, kind="hf", adapter=adapter) | ||
| 133 | - spec = SwaySpec( | ||
| 134 | - version=1, | ||
| 135 | - models=SuiteModels(base=base_spec, ft=ft_spec), | ||
| 136 | - defaults=SuiteDefaults(seed=0), | ||
| 137 | - suite=[ | ||
| 138 | - { | ||
| 139 | - "name": "quick_delta_kl", | ||
| 140 | - "kind": "delta_kl", | ||
| 141 | - "prompts": list(quick_prompts), | ||
| 142 | - "assert_mean_gte": 0.01, | ||
| 143 | - }, | ||
| 144 | - { | ||
| 145 | - "name": "quick_calibration", | ||
| 146 | - "kind": "calibration_drift", | ||
| 147 | - "items_limit": 10, | ||
| 148 | - }, | ||
| 149 | - ], | ||
| 150 | - ) | ||
| 151 | - try: | ||
| 152 | - backend = build_backend(ft_spec) | ||
| 153 | - except SwayError as exc: | ||
| 154 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 155 | - raise typer.Exit(code=2) from exc | ||
| 156 | - | ||
| 157 | - try: | ||
| 158 | - result = run_suite(spec, backend, spec_path="<check>") | ||
| 159 | - finally: | ||
| 160 | - _close_if_possible(backend) | ||
| 161 | - score_obj = compute_score(result) | ||
| 162 | - report.to_terminal(result, score_obj, console=Console()) | ||
| 163 | - | ||
| 164 | - | ||
| 165 | -def diff_cmd( | ||
| 166 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | ||
| 167 | - adapter_a: Annotated[Path, typer.Option("--a", help="First adapter path.")], | ||
| 168 | - adapter_b: Annotated[Path, typer.Option("--b", help="Second adapter path.")], | ||
| 169 | -) -> None: | ||
| 170 | - """Run the same suite against two adapters and show per-probe deltas.""" | ||
| 171 | - from dlm_sway.backends import build as build_backend | ||
| 172 | - from dlm_sway.suite.loader import load_spec | ||
| 173 | - from dlm_sway.suite.runner import run as run_suite | ||
| 174 | - from dlm_sway.suite.score import compute as compute_score | ||
| 175 | - | ||
| 176 | - sway_spec = load_spec(spec) | ||
| 177 | - console = Console() | ||
| 178 | - | ||
| 179 | - def _score_for(adapter_path: Path) -> tuple[float, dict[str, float]]: | ||
| 180 | - ft_spec = sway_spec.models.ft.model_copy(update={"adapter": adapter_path}) | ||
| 181 | - backend = build_backend(ft_spec) | ||
| 182 | - try: | ||
| 183 | - result = run_suite(sway_spec, backend, spec_path=str(spec)) | ||
| 184 | - finally: | ||
| 185 | - _close_if_possible(backend) | ||
| 186 | - scored = compute_score(result) | ||
| 187 | - per_probe = {p.name: (p.score or 0.0) for p in result.probes} | ||
| 188 | - return scored.overall, per_probe | ||
| 189 | - | ||
| 190 | - try: | ||
| 191 | - overall_a, per_a = _score_for(adapter_a) | ||
| 192 | - overall_b, per_b = _score_for(adapter_b) | ||
| 193 | - except SwayError as exc: | ||
| 194 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 195 | - raise typer.Exit(code=2) from exc | ||
| 196 | - | ||
| 197 | - console.print(f"[bold]overall[/bold] A: {overall_a:.2f} B: {overall_b:.2f}") | ||
| 198 | - console.print() | ||
| 199 | - console.print("[bold]per-probe[/bold] (A → B, Δ):") | ||
| 200 | - for name in sorted(per_a.keys() | per_b.keys()): | ||
| 201 | - a = per_a.get(name, 0.0) | ||
| 202 | - b = per_b.get(name, 0.0) | ||
| 203 | - delta = b - a | ||
| 204 | - sign = "+" if delta >= 0 else "" | ||
| 205 | - console.print(f" {name:<30} {a:.2f} → {b:.2f} ({sign}{delta:+.2f})") | ||
| 206 | - | ||
| 207 | - | ||
| 208 | -def autogen_cmd( | ||
| 209 | - dlm_path: Annotated[Path, typer.Argument(help="Path to a .dlm file.")], | ||
| 210 | - out: Annotated[ | ||
| 211 | - Path, | ||
| 212 | - typer.Option("--out", "-o", help="Where to write the generated sway.yaml."), | ||
| 213 | - ] = Path("sway.yaml"), | ||
| 214 | -) -> None: | ||
| 215 | - """Generate a sway.yaml from a .dlm file (requires dlm-sway[dlm]).""" | ||
| 216 | - import importlib | ||
| 217 | - | ||
| 218 | - try: | ||
| 219 | - autogen_mod = importlib.import_module("dlm_sway.integrations.dlm.autogen") | ||
| 220 | - except ImportError as exc: | ||
| 221 | - typer.secho( | ||
| 222 | - "dlm integration not installed — run: pip install 'dlm-sway[dlm]'", | ||
| 223 | - err=True, | ||
| 224 | - fg=typer.colors.RED, | ||
| 225 | - ) | ||
| 226 | - raise typer.Exit(code=2) from exc | ||
| 227 | - | ||
| 228 | - try: | ||
| 229 | - autogen_mod.write_sway_yaml(dlm_path, out) | ||
| 230 | - except SwayError as exc: | ||
| 231 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 232 | - raise typer.Exit(code=2) from exc | ||
| 233 | - | ||
| 234 | - typer.echo(f"wrote {out}") | ||
| 235 | - | ||
| 236 | - | ||
| 237 | -def doctor_cmd() -> None: | ||
| 238 | - """Print backend availability and version info.""" | ||
| 239 | - console = Console() | ||
| 240 | - console.print(f"[bold]dlm-sway[/bold] {__version__}") | ||
| 241 | - console.print(f" python: {sys.version.split()[0]}") | ||
| 242 | - console.print(f" platform: {sys.platform}") | ||
| 243 | - console.print() | ||
| 244 | - | ||
| 245 | - console.print("[bold]backends[/bold]") | ||
| 246 | - console.print( | ||
| 247 | - f" hf: {_probe_import('torch')} {_probe_import('transformers')} {_probe_import('peft')}" | ||
| 248 | - ) | ||
| 249 | - console.print(f" mlx: {_probe_import('mlx')} {_probe_import('mlx_lm')}") | ||
| 250 | - console.print(f" semsim: {_probe_import('sentence_transformers')}") | ||
| 251 | - console.print( | ||
| 252 | - f" style+: {_probe_import('spacy')} {_probe_import('textstat')} {_probe_import('nlpaug')}" | ||
| 253 | - ) | ||
| 254 | - console.print(f" dlm: {_probe_import('dlm')}") | ||
| 255 | - console.print(f" viz: {_probe_import('matplotlib')}") | ||
| 256 | - | ||
| 257 | - | ||
| 258 | -def report_cmd( | ||
| 259 | - result_json: Annotated[Path, typer.Argument(help="Path to a saved result JSON.")], | ||
| 260 | - format: Annotated[ | ||
| 261 | - str, typer.Option("--format", help="Output format: terminal, md, junit, json.") | ||
| 262 | - ] = "terminal", | ||
| 263 | -) -> None: | ||
| 264 | - """Re-render a previously saved run (for history tracking / dashboards).""" | ||
| 265 | - raw: dict[str, Any] = json.loads(result_json.read_text(encoding="utf-8")) | ||
| 266 | - fmt = format.lower() | ||
| 267 | - if fmt == "json": | ||
| 268 | - typer.echo(json.dumps(raw, indent=2, sort_keys=True)) | ||
| 269 | - return | ||
| 270 | - if fmt in {"md", "markdown"}: | ||
| 271 | - # A file-level re-render needs the dataclasses back; simplest is | ||
| 272 | - # to synthesize a minimal markdown from the JSON directly. | ||
| 273 | - typer.echo(_render_markdown_from_json(raw)) | ||
| 274 | - return | ||
| 275 | - if fmt == "junit": | ||
| 276 | - typer.echo(_render_junit_from_json(raw)) | ||
| 277 | - return | ||
| 278 | - # Default: terminal-ish one-liner summary. | ||
| 279 | - score: dict[str, Any] = raw.get("score", {}) | ||
| 280 | - typer.echo(f"overall: {score.get('overall', 0.0):.2f} [{score.get('band', '?')}]") | ||
| 281 | - probes: list[dict[str, Any]] = raw.get("probes", []) | ||
| 282 | - for p in probes: | ||
| 283 | - typer.echo( | ||
| 284 | - f" {p['name']:<30} {p['verdict']:<6} " | ||
| 285 | - f"{(p.get('score') or 0.0):.2f} {p.get('message', '')[:60]}" | ||
| 286 | - ) | ||
| 287 | - | ||
| 288 | - | ||
| 289 | -# -- helpers ----------------------------------------------------------- | ||
| 290 | - | ||
| 291 | - | ||
| 292 | -_BUILTIN_QUICK_PROMPTS: tuple[str, ...] = ( | ||
| 293 | - "The quick brown fox", | ||
| 294 | - "Once upon a time", | ||
| 295 | - "The answer to the question is", | ||
| 296 | - "One important lesson is", | ||
| 297 | - "In my opinion,", | ||
| 298 | - "The first step is to", | ||
| 299 | - "Remember that", | ||
| 300 | - "A common mistake is", | ||
| 301 | -) | ||
| 302 | - | ||
| 303 | - | ||
| 304 | -def _load_prompts(path: Path) -> tuple[str, ...]: | ||
| 305 | - return tuple( | ||
| 306 | - line.strip() for line in path.read_text(encoding="utf-8").splitlines() if line.strip() | ||
| 307 | - ) | ||
| 308 | - | ||
| 309 | - | ||
| 310 | -def _execute_spec(path: Path) -> tuple[SuiteResult, SwayScore]: | ||
| 311 | - """Load a spec, build a backend, run the suite, fold scores. Shared | ||
| 312 | - by ``run`` and ``gate``. Picks up .dlm-derived sections when the | ||
| 313 | - spec's ``dlm_source`` is set.""" | ||
| 314 | - from dlm_sway.backends import build as build_backend | ||
| 315 | - from dlm_sway.suite.loader import load_spec | ||
| 316 | - from dlm_sway.suite.runner import run as run_suite | ||
| 317 | - from dlm_sway.suite.score import compute as compute_score | ||
| 318 | - | ||
| 319 | - spec = load_spec(path) | ||
| 320 | - sections = None | ||
| 321 | - doc_text = None | ||
| 322 | - if spec.dlm_source is not None: | ||
| 323 | - import importlib | ||
| 324 | - | ||
| 325 | - try: | ||
| 326 | - resolver = importlib.import_module("dlm_sway.integrations.dlm.resolver") | ||
| 327 | - handle = resolver.resolve_dlm(Path(spec.dlm_source)) | ||
| 328 | - sections = handle.sections | ||
| 329 | - doc_text = handle.doc_text | ||
| 330 | - except ImportError: | ||
| 331 | - # Honoring dlm_source is best-effort — probes that need | ||
| 332 | - # sections will SKIP with a pointer at the extra. | ||
| 333 | - sections = None | ||
| 334 | - backend = build_backend(spec.models.ft) | ||
| 335 | - try: | ||
| 336 | - result = run_suite(spec, backend, spec_path=str(path), sections=sections, doc_text=doc_text) | ||
| 337 | - finally: | ||
| 338 | - _close_if_possible(backend) | ||
| 339 | - score_obj = compute_score(result) | ||
| 340 | - return result, score_obj | ||
| 341 | - | ||
| 342 | - | ||
| 343 | -def _close_if_possible(backend: object) -> None: | ||
| 344 | - close = getattr(backend, "close", None) | ||
| 345 | - if callable(close): | ||
| 346 | - close() | ||
| 347 | - | ||
| 348 | - | ||
| 349 | -def _probe_import(name: str) -> str: | ||
| 350 | - import importlib | ||
| 351 | - | ||
| 352 | - try: | ||
| 353 | - mod = importlib.import_module(name) | ||
| 354 | - except ImportError: | ||
| 355 | - return f"[red]{name}: missing[/red]" | ||
| 356 | - ver = getattr(mod, "__version__", "installed") | ||
| 357 | - return f"[green]{name}: {ver}[/green]" | ||
| 358 | - | ||
| 359 | - | ||
| 360 | -def _render_markdown_from_json(raw: dict[str, Any]) -> str: | ||
| 361 | - score: dict[str, Any] = raw.get("score", {}) | ||
| 362 | - lines: list[str] = [ | ||
| 363 | - "# dlm-sway report", | ||
| 364 | - "", | ||
| 365 | - f"**Overall:** {score.get('overall', 0.0):.2f} (`{score.get('band', '?')}`) ", | ||
| 366 | - f"**Base:** `{raw.get('base_model_id', '?')}` ", | ||
| 367 | - f"**Adapter:** `{raw.get('adapter_id', '?')}` ", | ||
| 368 | - "", | ||
| 369 | - "## Probes", | ||
| 370 | - "", | ||
| 371 | - "| name | kind | verdict | score |", | ||
| 372 | - "|---|---|---|---:|", | ||
| 373 | - ] | ||
| 374 | - probes: list[dict[str, Any]] = raw.get("probes", []) | ||
| 375 | - for p in probes: | ||
| 376 | - lines.append( | ||
| 377 | - f"| {p['name']} | `{p['kind']}` | {p['verdict']} | {(p.get('score') or 0.0):.2f} |" | ||
| 378 | - ) | ||
| 379 | - return "\n".join(lines) | ||
| 380 | - | ||
| 381 | - | ||
| 382 | -def _render_junit_from_json(raw: dict[str, Any]) -> str: | ||
| 383 | - """Minimal JUnit renderer from a saved JSON (useful for report --format junit).""" | ||
| 384 | - import xml.etree.ElementTree as ET | ||
| 385 | - | ||
| 386 | - probes: list[dict[str, Any]] = raw.get("probes", []) | ||
| 387 | - testsuite = ET.Element("testsuite", {"name": "dlm-sway", "tests": str(len(probes))}) | ||
| 388 | - for p in probes: | ||
| 389 | - tc = ET.SubElement(testsuite, "testcase", {"classname": p["kind"], "name": p["name"]}) | ||
| 390 | - if p["verdict"] == "fail": | ||
| 391 | - ET.SubElement(tc, "failure", {"message": p.get("message", "")}) | ||
| 392 | - elif p["verdict"] == "error": | ||
| 393 | - ET.SubElement(tc, "error", {"message": p.get("message", "")}) | ||
| 394 | - elif p["verdict"] == "skip": | ||
| 395 | - ET.SubElement(tc, "skipped", {"message": p.get("message", "")}) | ||
| 396 | - return ET.tostring(testsuite, encoding="unicode") | ||
sway/src/dlm_sway/core/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""Core abstractions: protocols, results, errors, determinism.""" | ||
sway/src/dlm_sway/core/determinism.pydeleted@@ -1,97 +0,0 @@ | |||
| 1 | -"""Deterministic-execution helper. | ||
| 2 | - | ||
| 3 | -Mirrors ``dlm.train.determinism.seed_everything`` so running the same | ||
| 4 | -suite twice on the same host produces the same :class:`ProbeResult` | ||
| 5 | -payloads. The dlm project treats determinism as a contract; sway takes | ||
| 6 | -the same posture for scoring operations. | ||
| 7 | - | ||
| 8 | -Generation is allowed to use non-deterministic attention kernels when | ||
| 9 | -``temperature > 0``, because a deterministic sampled generation is a | ||
| 10 | -contradiction. Scoring (logprobs, rolling logprobs, next-token dists) | ||
| 11 | -always runs under :func:`torch.use_deterministic_algorithms(True)`. | ||
| 12 | -""" | ||
| 13 | - | ||
| 14 | -from __future__ import annotations | ||
| 15 | - | ||
| 16 | -import os | ||
| 17 | -import random | ||
| 18 | -from dataclasses import dataclass | ||
| 19 | -from typing import Literal | ||
| 20 | - | ||
| 21 | -DeterminismClass = Literal["strict", "best_effort", "loose"] | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -@dataclass(frozen=True, slots=True) | ||
| 25 | -class DeterminismSummary: | ||
| 26 | - """What seeding actually accomplished, for logging in the report.""" | ||
| 27 | - | ||
| 28 | - class_: DeterminismClass | ||
| 29 | - seed: int | ||
| 30 | - notes: tuple[str, ...] = () | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -def seed_everything(seed: int, *, strict: bool = True) -> DeterminismSummary: | ||
| 34 | - """Seed every RNG sway's probes touch and flip backend flags. | ||
| 35 | - | ||
| 36 | - Idempotent — safe to call repeatedly with the same seed. | ||
| 37 | - | ||
| 38 | - Parameters | ||
| 39 | - ---------- | ||
| 40 | - seed: | ||
| 41 | - The seed. Callers typically use the value from ``sway.yaml``'s | ||
| 42 | - ``defaults.seed`` (default 0). | ||
| 43 | - strict: | ||
| 44 | - If ``True`` (the default), request deterministic CUDA algorithms | ||
| 45 | - and set ``CUBLAS_WORKSPACE_CONFIG``. Scoring probes need this; | ||
| 46 | - generation-only runs can set it ``False``. | ||
| 47 | - | ||
| 48 | - Returns | ||
| 49 | - ------- | ||
| 50 | - :class:`DeterminismSummary` with a classification: | ||
| 51 | - | ||
| 52 | - - ``"strict"`` — deterministic algorithms active, no warnings. | ||
| 53 | - - ``"best_effort"`` — platform doesn't support full determinism | ||
| 54 | - (MPS, some CPU kernels). | ||
| 55 | - - ``"loose"`` — seeded but deterministic algorithms refused. | ||
| 56 | - """ | ||
| 57 | - | ||
| 58 | - notes: list[str] = [] | ||
| 59 | - clazz: DeterminismClass = "best_effort" | ||
| 60 | - | ||
| 61 | - # Env vars must come first — torch reads them at cuBLAS init. | ||
| 62 | - if strict: | ||
| 63 | - os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8") | ||
| 64 | - | ||
| 65 | - random.seed(seed) | ||
| 66 | - | ||
| 67 | - # numpy is a hard dep; safe to seed unconditionally. | ||
| 68 | - import numpy as np | ||
| 69 | - | ||
| 70 | - np.random.seed(seed) | ||
| 71 | - | ||
| 72 | - try: | ||
| 73 | - import torch # noqa: PLC0415 — lazy: torch is an optional extra. | ||
| 74 | - except ModuleNotFoundError: | ||
| 75 | - notes.append("torch not installed; seeded python + numpy only") | ||
| 76 | - return DeterminismSummary(class_="best_effort", seed=seed, notes=tuple(notes)) | ||
| 77 | - | ||
| 78 | - torch.manual_seed(seed) | ||
| 79 | - if torch.cuda.is_available(): | ||
| 80 | - torch.cuda.manual_seed_all(seed) | ||
| 81 | - clazz = "strict" | ||
| 82 | - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): | ||
| 83 | - clazz = "best_effort" | ||
| 84 | - notes.append("MPS: bit-identical across runs is best-effort") | ||
| 85 | - else: | ||
| 86 | - clazz = "best_effort" | ||
| 87 | - notes.append("CPU-only backend: strict determinism depends on BLAS impl") | ||
| 88 | - | ||
| 89 | - if strict: | ||
| 90 | - try: | ||
| 91 | - torch.use_deterministic_algorithms(True, warn_only=True) | ||
| 92 | - torch.backends.cudnn.benchmark = False | ||
| 93 | - except Exception as exc: # noqa: BLE001 — torch raises a naked Exception | ||
| 94 | - clazz = "loose" | ||
| 95 | - notes.append(f"deterministic algorithms refused: {exc}") | ||
| 96 | - | ||
| 97 | - return DeterminismSummary(class_=clazz, seed=seed, notes=tuple(notes)) | ||
sway/src/dlm_sway/core/errors.pydeleted@@ -1,65 +0,0 @@ | |||
| 1 | -"""Exception hierarchy for dlm-sway. | ||
| 2 | - | ||
| 3 | -Every error sway raises inherits from :class:`SwayError` so callers can | ||
| 4 | -catch the whole family with a single ``except``. Subclasses carry enough | ||
| 5 | -context (spec paths, probe names, missing extras) for the CLI to render | ||
| 6 | -actionable messages without the caller having to introspect an exception | ||
| 7 | -chain. | ||
| 8 | -""" | ||
| 9 | - | ||
| 10 | -from __future__ import annotations | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -class SwayError(Exception): | ||
| 14 | - """Root of the dlm-sway exception hierarchy.""" | ||
| 15 | - | ||
| 16 | - | ||
| 17 | -class SpecValidationError(SwayError): | ||
| 18 | - """A ``sway.yaml`` (or equivalent) failed pydantic validation. | ||
| 19 | - | ||
| 20 | - Parameters | ||
| 21 | - ---------- | ||
| 22 | - message: | ||
| 23 | - Human-readable summary of what went wrong. | ||
| 24 | - source: | ||
| 25 | - Path or identifier of the spec being validated, if known. | ||
| 26 | - """ | ||
| 27 | - | ||
| 28 | - def __init__(self, message: str, *, source: str | None = None) -> None: | ||
| 29 | - super().__init__(message) | ||
| 30 | - self.source = source | ||
| 31 | - | ||
| 32 | - def __str__(self) -> str: | ||
| 33 | - base = super().__str__() | ||
| 34 | - return f"{self.source}: {base}" if self.source else base | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class BackendNotAvailableError(SwayError): | ||
| 38 | - """A requested backend's optional dependencies aren't installed. | ||
| 39 | - | ||
| 40 | - The CLI turns this into a pointed ``pip install dlm-sway[<extra>]`` | ||
| 41 | - hint; programmatic callers can read :attr:`extra` directly. | ||
| 42 | - """ | ||
| 43 | - | ||
| 44 | - def __init__(self, backend: str, *, extra: str, hint: str | None = None) -> None: | ||
| 45 | - message = ( | ||
| 46 | - f"backend {backend!r} unavailable — install the extra: pip install 'dlm-sway[{extra}]'" | ||
| 47 | - ) | ||
| 48 | - if hint: | ||
| 49 | - message = f"{message}\n{hint}" | ||
| 50 | - super().__init__(message) | ||
| 51 | - self.backend = backend | ||
| 52 | - self.extra = extra | ||
| 53 | - | ||
| 54 | - | ||
| 55 | -class ProbeError(SwayError): | ||
| 56 | - """A probe failed to *execute* (as opposed to failing its assertion). | ||
| 57 | - | ||
| 58 | - Distinct from a ``verdict=FAIL`` result — assertion failures are | ||
| 59 | - normal and reported via :class:`ProbeResult`. This is for genuine | ||
| 60 | - bugs: missing sections, mismatched tokenizers, NaN logits. | ||
| 61 | - """ | ||
| 62 | - | ||
| 63 | - def __init__(self, probe: str, message: str) -> None: | ||
| 64 | - super().__init__(f"probe {probe!r}: {message}") | ||
| 65 | - self.probe = probe | ||
sway/src/dlm_sway/core/model.pydeleted@@ -1,112 +0,0 @@ | |||
| 1 | -"""The :class:`Model` abstraction and :class:`ModelSpec` user-facing config. | ||
| 2 | - | ||
| 3 | -Probes operate on objects that satisfy :class:`Model` (for generation) | ||
| 4 | -and :class:`~dlm_sway.core.scoring.ScoringBackend` (for logit-level | ||
| 5 | -access). Backends return concrete instances of both — they are | ||
| 6 | -deliberately separate Protocols because not every backend exposes logits | ||
| 7 | -(e.g. an Ollama HTTP backend would implement ``Model`` but not | ||
| 8 | -``ScoringBackend``). | ||
| 9 | - | ||
| 10 | -The user-facing surface is :class:`ModelSpec`, a pydantic model that | ||
| 11 | -describes how to materialize a base + adapter pair. No ``.dlm`` | ||
| 12 | -concepts live at this layer — those belong in | ||
| 13 | -:mod:`dlm_sway.integrations.dlm`. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -from dataclasses import dataclass | ||
| 19 | -from pathlib import Path | ||
| 20 | -from typing import Any, Literal, Protocol, runtime_checkable | ||
| 21 | - | ||
| 22 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 23 | - | ||
| 24 | -BackendKind = Literal["hf", "mlx", "dummy", "custom"] | ||
| 25 | -"""Registered scoring-backend kinds. | ||
| 26 | - | ||
| 27 | -``custom`` is an escape hatch — the runner looks up an entry point when | ||
| 28 | -it sees ``custom`` in a spec. | ||
| 29 | -""" | ||
| 30 | - | ||
| 31 | - | ||
| 32 | -class ModelSpec(BaseModel): | ||
| 33 | - """How to materialize one model (base or fine-tuned).""" | ||
| 34 | - | ||
| 35 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 36 | - | ||
| 37 | - kind: BackendKind = "hf" | ||
| 38 | - base: str | ||
| 39 | - """HuggingFace repo id (``HuggingFaceTB/SmolLM2-135M-Instruct``) or | ||
| 40 | - a local path to a model directory.""" | ||
| 41 | - | ||
| 42 | - adapter: Path | None = None | ||
| 43 | - """Path to a PEFT adapter directory (containing ``adapter_config.json`` | ||
| 44 | - and ``adapter_model.safetensors``). ``None`` → base-only model.""" | ||
| 45 | - | ||
| 46 | - dtype: Literal["auto", "fp16", "bf16", "fp32"] = "auto" | ||
| 47 | - device: str = "auto" | ||
| 48 | - """``"auto"`` chooses CUDA → MPS → CPU in that order.""" | ||
| 49 | - | ||
| 50 | - trust_remote_code: bool = False | ||
| 51 | - """HuggingFace ``trust_remote_code`` passthrough. Off by default — | ||
| 52 | - the user must opt in explicitly, matching sway's no-surprises | ||
| 53 | - posture.""" | ||
| 54 | - | ||
| 55 | - entry_point: str | None = Field(default=None) | ||
| 56 | - """Required when ``kind='custom'``. Import path like | ||
| 57 | - ``mypkg.mybackend:MyBackend``.""" | ||
| 58 | - | ||
| 59 | - | ||
| 60 | -@dataclass(frozen=True, slots=True) | ||
| 61 | -class LoadedModel: | ||
| 62 | - """A materialized model plus the tokenizer that produced it. | ||
| 63 | - | ||
| 64 | - Returned by backend ``load()`` methods. Probes usually don't touch | ||
| 65 | - this directly — they go through the :class:`Model` / | ||
| 66 | - :class:`~dlm_sway.core.scoring.ScoringBackend` Protocols. | ||
| 67 | - """ | ||
| 68 | - | ||
| 69 | - id: str | ||
| 70 | - """Stable handle: ``"base"`` or ``"ft"`` typically.""" | ||
| 71 | - spec: ModelSpec | ||
| 72 | - model: Any | ||
| 73 | - """Framework-native handle (torch ``nn.Module``, MLX array module …). | ||
| 74 | - | ||
| 75 | - Typed as ``Any`` because the frameworks themselves ship unstubbed. | ||
| 76 | - Backend implementations narrow this at their boundary.""" | ||
| 77 | - tokenizer: Any | ||
| 78 | - meta: dict[str, Any] | ||
| 79 | - """Backend-captured metadata: device, dtype, adapter version, bytes | ||
| 80 | - on disk, num trainable params. Surfaced in the suite report.""" | ||
| 81 | - | ||
| 82 | - | ||
| 83 | -@runtime_checkable | ||
| 84 | -class Model(Protocol): | ||
| 85 | - """Minimum interface for text generation. | ||
| 86 | - | ||
| 87 | - Implemented by backend-wrapped model objects. Probes that need logits | ||
| 88 | - also require :class:`~dlm_sway.core.scoring.ScoringBackend`. | ||
| 89 | - """ | ||
| 90 | - | ||
| 91 | - id: str | ||
| 92 | - | ||
| 93 | - def generate( | ||
| 94 | - self, | ||
| 95 | - prompt: str, | ||
| 96 | - *, | ||
| 97 | - max_new_tokens: int, | ||
| 98 | - temperature: float = 0.0, | ||
| 99 | - top_p: float = 1.0, | ||
| 100 | - seed: int = 0, | ||
| 101 | - ) -> str: | ||
| 102 | - """Generate a completion. | ||
| 103 | - | ||
| 104 | - Defaults (``temperature=0``, ``top_p=1``) are greedy-decode for | ||
| 105 | - reproducibility. Callers wanting sampled output must pass | ||
| 106 | - non-defaults *and* a seed. | ||
| 107 | - """ | ||
| 108 | - ... | ||
| 109 | - | ||
| 110 | - def close(self) -> None: | ||
| 111 | - """Release any resources held by this model.""" | ||
| 112 | - ... | ||
sway/src/dlm_sway/core/result.pydeleted@@ -1,139 +0,0 @@ | |||
| 1 | -"""Probe and suite result types. | ||
| 2 | - | ||
| 3 | -Every numeric probe ultimately returns a :class:`ProbeResult`. The suite | ||
| 4 | -runner collects them into a :class:`SuiteResult` and the scorer folds | ||
| 5 | -that into a single :class:`SwayScore` with transparent per-component | ||
| 6 | -weights. | ||
| 7 | - | ||
| 8 | -These dataclasses are deliberately plain — no pydantic — because they | ||
| 9 | -cross probe/backend boundaries hundreds of times per run and a free | ||
| 10 | -``model_validate`` on every construction would dominate the runtime of | ||
| 11 | -cheap probes. | ||
| 12 | -""" | ||
| 13 | - | ||
| 14 | -from __future__ import annotations | ||
| 15 | - | ||
| 16 | -from dataclasses import dataclass, field | ||
| 17 | -from datetime import UTC, datetime | ||
| 18 | -from enum import StrEnum | ||
| 19 | -from typing import Any | ||
| 20 | - | ||
| 21 | - | ||
| 22 | -class Verdict(StrEnum): | ||
| 23 | - """Outcome of a single probe against its assertion.""" | ||
| 24 | - | ||
| 25 | - PASS = "pass" | ||
| 26 | - FAIL = "fail" | ||
| 27 | - WARN = "warn" | ||
| 28 | - SKIP = "skip" | ||
| 29 | - ERROR = "error" | ||
| 30 | - | ||
| 31 | - | ||
| 32 | -@dataclass(frozen=True, slots=True) | ||
| 33 | -class ProbeResult: | ||
| 34 | - """The result of running one probe. | ||
| 35 | - | ||
| 36 | - Attributes | ||
| 37 | - ---------- | ||
| 38 | - name: | ||
| 39 | - User-facing name from the spec (unique within a suite). | ||
| 40 | - kind: | ||
| 41 | - Probe discriminator (``delta_kl``, ``section_internalization`` …). | ||
| 42 | - verdict: | ||
| 43 | - Pass / fail / warn / skip / error. | ||
| 44 | - score: | ||
| 45 | - Normalized [0, 1] score. ``sigmoid(z_vs_null / 3)`` for numeric | ||
| 46 | - probes; 1.0 / 0.0 for binary ones. ``None`` for :attr:`Verdict.SKIP`. | ||
| 47 | - raw: | ||
| 48 | - The raw metric value (e.g. KL=0.083). Probe-specific units. | ||
| 49 | - z_score: | ||
| 50 | - Standard deviations above the null-adapter baseline. ``None`` | ||
| 51 | - when no null calibration was run. | ||
| 52 | - base_value: | ||
| 53 | - The metric evaluated on the base model, when meaningful. | ||
| 54 | - ft_value: | ||
| 55 | - The metric evaluated on the fine-tuned model, when meaningful. | ||
| 56 | - evidence: | ||
| 57 | - Small structured payload for the report — prompts, example | ||
| 58 | - completions, per-section breakdowns. Kept bounded (<10 KB) so | ||
| 59 | - suite JSON stays under a megabyte. | ||
| 60 | - message: | ||
| 61 | - One-line diagnostic. Surfaces in the terminal report. | ||
| 62 | - duration_s: | ||
| 63 | - Wall time to execute. | ||
| 64 | - """ | ||
| 65 | - | ||
| 66 | - name: str | ||
| 67 | - kind: str | ||
| 68 | - verdict: Verdict | ||
| 69 | - score: float | None | ||
| 70 | - raw: float | None = None | ||
| 71 | - z_score: float | None = None | ||
| 72 | - base_value: float | None = None | ||
| 73 | - ft_value: float | None = None | ||
| 74 | - evidence: dict[str, Any] = field(default_factory=dict) | ||
| 75 | - message: str = "" | ||
| 76 | - duration_s: float = 0.0 | ||
| 77 | - | ||
| 78 | - | ||
| 79 | -@dataclass(frozen=True, slots=True) | ||
| 80 | -class SuiteResult: | ||
| 81 | - """A full run of a sway.yaml suite.""" | ||
| 82 | - | ||
| 83 | - spec_path: str | ||
| 84 | - started_at: datetime | ||
| 85 | - finished_at: datetime | ||
| 86 | - base_model_id: str | ||
| 87 | - adapter_id: str | ||
| 88 | - sway_version: str | ||
| 89 | - probes: tuple[ProbeResult, ...] = () | ||
| 90 | - null_stats: dict[str, dict[str, float]] = field(default_factory=dict) | ||
| 91 | - """Per-primitive null-adapter baseline stats (mean, std, runs). Used | ||
| 92 | - to turn raw metrics into z-scores when rendering the report.""" | ||
| 93 | - | ||
| 94 | - @property | ||
| 95 | - def wall_seconds(self) -> float: | ||
| 96 | - return (self.finished_at - self.started_at).total_seconds() | ||
| 97 | - | ||
| 98 | - | ||
# Weights used to fold per-component scores into the composite score.
# Any of these may be overridden in sway.yaml.
DEFAULT_COMPONENT_WEIGHTS: dict[str, float] = {
    "adherence": 0.30,
    "attribution": 0.35,
    "calibration": 0.20,
    "ablation": 0.15,
}
| 106 | - | ||
| 107 | - | ||
| 108 | -@dataclass(frozen=True, slots=True) | ||
| 109 | -class SwayScore: | ||
| 110 | - """Composite score with a transparent per-component breakdown.""" | ||
| 111 | - | ||
| 112 | - overall: float | ||
| 113 | - components: dict[str, float] | ||
| 114 | - weights: dict[str, float] = field(default_factory=lambda: dict(DEFAULT_COMPONENT_WEIGHTS)) | ||
| 115 | - band: str = "" | ||
| 116 | - findings: tuple[str, ...] = () | ||
| 117 | - | ||
| 118 | - @staticmethod | ||
| 119 | - def band_for(overall: float) -> str: | ||
| 120 | - """Map a score to a human-readable band. | ||
| 121 | - | ||
| 122 | - Bands (from the plan): | ||
| 123 | - - <0.3 : indistinguishable from noise | ||
| 124 | - - 0.3–0.6 : partial fit | ||
| 125 | - - 0.6–0.85: healthy | ||
| 126 | - - >0.85 : suspiciously good (possible overfit / memorization) | ||
| 127 | - """ | ||
| 128 | - if overall < 0.3: | ||
| 129 | - return "noise" | ||
| 130 | - if overall < 0.6: | ||
| 131 | - return "partial" | ||
| 132 | - if overall <= 0.85: | ||
| 133 | - return "healthy" | ||
| 134 | - return "suspicious" | ||
| 135 | - | ||
| 136 | - | ||
| 137 | -def utcnow() -> datetime: | ||
| 138 | - """Timezone-aware UTC timestamp (used by the runner).""" | ||
| 139 | - return datetime.now(UTC) | ||
sway/src/dlm_sway/core/scoring.pydeleted@@ -1,203 +0,0 @@ | |||
| 1 | -"""Scoring protocols: logprobs, next-token distributions, differential toggling. | ||
| 2 | - | ||
| 3 | -Scoring is **separate** from generation because not every backend can | ||
| 4 | -provide logits. Every numeric sway probe depends on at least one of | ||
| 5 | -three operations: | ||
| 6 | - | ||
| 7 | -1. ``logprob_of(prompt, completion)`` — score a completion against a | ||
| 8 | - prompt (A1, B2, B3, C2, …). | ||
| 9 | -2. ``rolling_logprob(text)`` — perplexity over a piece of text (B1, | ||
| 10 | - C2). | ||
| 11 | -3. ``next_token_dist(prompt, top_k)`` — the raw next-token distribution | ||
| 12 | - at a single position (A1, N2). | ||
| 13 | - | ||
| 14 | -The :class:`DifferentialBackend` is the key performance primitive: | ||
| 15 | -both base and fine-tuned views share the same loaded weights and KV | ||
| 16 | -cache layout, toggled via PEFT's :meth:`set_adapter` / | ||
| 17 | -:meth:`disable_adapter`. A naive "load twice" implementation would | ||
| 18 | -double memory and halve throughput. | ||
| 19 | -""" | ||
| 20 | - | ||
| 21 | -from __future__ import annotations | ||
| 22 | - | ||
| 23 | -from contextlib import AbstractContextManager | ||
| 24 | -from dataclasses import dataclass, field | ||
| 25 | -from typing import Protocol, runtime_checkable | ||
| 26 | - | ||
| 27 | -import numpy as np | ||
| 28 | -from numpy.typing import NDArray | ||
| 29 | - | ||
| 30 | -from dlm_sway.core.model import Model | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -@dataclass(frozen=True, slots=True) | ||
| 34 | -class RollingLogprob: | ||
| 35 | - """Per-token logprobs over a piece of text, plus summary stats. | ||
| 36 | - | ||
| 37 | - Attributes | ||
| 38 | - ---------- | ||
| 39 | - token_ids: | ||
| 40 | - The tokenizer output for ``text``. Length ``N``. | ||
| 41 | - logprobs: | ||
| 42 | - ``log p(token_i | token_<i)`` for each position i ≥ 1. Length | ||
| 43 | - ``N-1``. | ||
| 44 | - num_tokens: | ||
| 45 | - ``N`` — included for convenience; ``len(token_ids)``. | ||
| 46 | - total_logprob: | ||
| 47 | - Sum of :attr:`logprobs`. | ||
| 48 | - """ | ||
| 49 | - | ||
| 50 | - token_ids: NDArray[np.int64] | ||
| 51 | - logprobs: NDArray[np.float32] | ||
| 52 | - num_tokens: int | ||
| 53 | - total_logprob: float | ||
| 54 | - | ||
| 55 | - @property | ||
| 56 | - def mean_logprob(self) -> float: | ||
| 57 | - n = self.logprobs.size | ||
| 58 | - return float(self.total_logprob / n) if n else 0.0 | ||
| 59 | - | ||
| 60 | - @property | ||
| 61 | - def perplexity(self) -> float: | ||
| 62 | - """``exp(-mean_logprob)``. Base-e, natural perplexity.""" | ||
| 63 | - return float(np.exp(-self.mean_logprob)) | ||
| 64 | - | ||
| 65 | - | ||
| 66 | -@dataclass(frozen=True, slots=True) | ||
| 67 | -class TokenDist: | ||
| 68 | - """A (possibly top-k truncated) next-token probability distribution. | ||
| 69 | - | ||
| 70 | - For KL / JS divergence probes sway needs matched distributions | ||
| 71 | - across base and fine-tuned views. The runner is responsible for | ||
| 72 | - aligning ``top_k`` token slices between two ``TokenDist`` objects | ||
| 73 | - before handing them to divergence math. | ||
| 74 | - """ | ||
| 75 | - | ||
| 76 | - token_ids: NDArray[np.int64] | ||
| 77 | - """Token ids, descending by probability. Length ``k``.""" | ||
| 78 | - logprobs: NDArray[np.float32] | ||
| 79 | - """Log-probabilities for :attr:`token_ids`. Length ``k``.""" | ||
| 80 | - vocab_size: int | ||
| 81 | - """Full vocab size — needed to renormalize top-k truncated slices.""" | ||
| 82 | - tail_logprob: float = field(default=0.0) | ||
| 83 | - """log of (1 - sum of exp(logprobs[:k])); 0 if top_k covers the full vocab.""" | ||
| 84 | - | ||
| 85 | - | ||
@runtime_checkable
class ScoringBackend(Protocol):
    """Logit-level access to a loaded model."""

    def logprob_of(self, prompt: str, completion: str) -> float:
        """Summed log-probability (nats) of ``completion`` given ``prompt``.

        Only the completion's tokens are scored; the prompt contributes
        nothing. Because the sum grows per token, longer completions are
        monotonically more negative — callers divide by length when they
        need a rate.
        """
        ...

    def rolling_logprob(self, text: str) -> RollingLogprob:
        """Per-token logprobs across the whole of ``text``.

        The analogue of lm-eval's ``loglikelihood_rolling``; feeds the
        held-out perplexity comparisons (B1 SIS, C2).
        """
        ...

    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
        """Distribution over the token that follows ``prompt``.

        Truncated to the ``top_k`` most likely tokens to bound memory;
        divergence math over the slice accepts the (typically
        negligible) error versus full-vocab KL.
        """
        ...
| 116 | - | ||
| 117 | - | ||
@runtime_checkable
class DifferentialBackend(Protocol):
    """Base + fine-tuned views over a single loaded model.

    Idiomatic usage::

        with backend.as_base() as base_view:
            p_base = base_view.next_token_dist(prompt)
        with backend.as_finetuned() as ft_view:
            p_ft = ft_view.next_token_dist(prompt)

    Implementations toggle PEFT adapters via
    :meth:`peft.PeftModel.set_adapter` / :meth:`disable_adapter`, so
    both views share the same loaded weights and KV cache layout.

    Invariant: the two views are mutually exclusive — a caller still
    holding a ``base_view`` after entering the ``as_finetuned`` context
    is a programmer error, and implementations MUST detect that and
    raise.
    """

    def as_base(self) -> AbstractContextManager[_ScoringModel]: ...

    def as_finetuned(self) -> AbstractContextManager[_ScoringModel]: ...
| 141 | - | ||
| 142 | - | ||
@runtime_checkable
class ScalableDifferentialBackend(DifferentialBackend, Protocol):
    """A differential backend that can also scale the LoRA additive term.

    LoRA adds ``(alpha/r) · B @ A`` on top of a base weight matrix; for
    everything inside the returned ``with`` block that additive term is
    multiplied by ``lam``:

    - ``lam = 0.0`` behaves like :meth:`as_base`,
    - ``lam = 1.0`` behaves like :meth:`as_finetuned`,
    - ``lam = 1.25`` overshoots — useful for N2 AdapterAblation's
      response-curve measurement.

    Only the HF backend ships an implementation in v0.1. Probes that
    need scaling check ``isinstance(backend, ScalableDifferentialBackend)``
    at runtime and SKIP gracefully when unavailable.
    """

    def as_scaled_adapter(self, lam: float) -> AbstractContextManager[_ScoringModel]: ...
| 162 | - | ||
| 163 | - | ||
@runtime_checkable
class NullCalibratedBackend(DifferentialBackend, Protocol):
    """A differential backend that can produce a "null adapter" view.

    A null adapter mirrors the real adapter's structure (rank, alpha,
    target modules) but draws its weights from a zero-mean Gaussian.
    Probing that view measures how much signal pure noise produces —
    the baseline distribution that serves as the denominator of every
    numeric probe's z-score.

    The context manager takes a ``seed`` so calibration runs are
    reproducible and multiple independent null samples can be drawn to
    estimate ``std``.

    Implementations MUST restore the real adapter on exit — including
    on exceptions — so callers can freely interleave null and real
    calibrations within the same backend lifetime.
    """

    def as_null_adapter(
        self, seed: int, *, init_scale: float = 0.02
    ) -> AbstractContextManager[_ScoringModel]: ...
| 186 | - | ||
| 187 | - | ||
# The context object yielded by differential backends must satisfy both
# Model and ScoringBackend. mypy has no intersection types, so the
# combination is spelled out explicitly as its own Protocol.
@runtime_checkable
class _ScoringModel(Model, ScoringBackend, Protocol):
    """A Model that also exposes ScoringBackend."""

    ...


ScoringModel = _ScoringModel
"""Public alias for the intersection ``Model & ScoringBackend``.

Backend and probe implementations import this to annotate variables of
the combined type.
"""
sway/src/dlm_sway/core/sections.pydeleted@@ -1,76 +0,0 @@ | |||
| 1 | -"""Minimal section contract for attribution probes. | ||
| 2 | - | ||
| 3 | -The flagship B1 ``section_internalization`` probe needs *structured* | ||
| 4 | -input — a section has an id, a kind, content text, and possibly some | ||
| 5 | -Q/A pairs or chosen/rejected triples. sway defines this shape here so | ||
| 6 | -the probes stay oblivious to the upstream (``.dlm`` parser, custom | ||
| 7 | -loaders, synthetic test fixtures). | ||
| 8 | - | ||
| 9 | -Field names are aligned with :mod:`dlm.doc.sections` but this module | ||
| 10 | -does not import ``dlm`` — the bridge at | ||
| 11 | -:mod:`dlm_sway.integrations.dlm` does the adaptation. | ||
| 12 | -""" | ||
| 13 | - | ||
| 14 | -from __future__ import annotations | ||
| 15 | - | ||
| 16 | -from dataclasses import dataclass, field | ||
| 17 | -from typing import Literal | ||
| 18 | - | ||
| 19 | -SectionKind = Literal["prose", "instruction", "preference"] | ||
| 20 | - | ||
| 21 | - | ||
| 22 | -@dataclass(frozen=True, slots=True) | ||
| 23 | -class SectionProbe: | ||
| 24 | - """A ``(prompt, gold)`` pair lifted from an INSTRUCTION section.""" | ||
| 25 | - | ||
| 26 | - prompt: str | ||
| 27 | - gold: str | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -@dataclass(frozen=True, slots=True) | ||
| 31 | -class SectionPreference: | ||
| 32 | - """A ``(prompt, chosen, rejected)`` triple from a PREFERENCE section.""" | ||
| 33 | - | ||
| 34 | - prompt: str | ||
| 35 | - chosen: str | ||
| 36 | - rejected: str | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -@dataclass(frozen=True, slots=True) | ||
| 40 | -class Section: | ||
| 41 | - """One typed chunk of a training document. | ||
| 42 | - | ||
| 43 | - Attributes | ||
| 44 | - ---------- | ||
| 45 | - id: | ||
| 46 | - Content-addressed identifier. ``.dlm`` uses a 16-hex-char | ||
| 47 | - sha256 prefix; sway doesn't enforce a format. | ||
| 48 | - kind: | ||
| 49 | - Discriminator for which of :attr:`probes` / | ||
| 50 | - :attr:`preferences` / :attr:`content` is the primary signal. | ||
| 51 | - content: | ||
| 52 | - Raw section text. Always populated; used by the rolling-PPL | ||
| 53 | - path for PROSE sections. | ||
| 54 | - probes: | ||
| 55 | - For INSTRUCTION: parsed Q/A pairs. Empty tuple for others. | ||
| 56 | - preferences: | ||
| 57 | - For PREFERENCE: parsed chosen/rejected triples. Empty otherwise. | ||
| 58 | - tag: | ||
| 59 | - Optional free-form label for the section (e.g., "intro", | ||
| 60 | - "api-reference"). Surfaces in per-section reports. | ||
| 61 | - """ | ||
| 62 | - | ||
| 63 | - id: str | ||
| 64 | - kind: SectionKind | ||
| 65 | - content: str | ||
| 66 | - probes: tuple[SectionProbe, ...] = field(default_factory=tuple) | ||
| 67 | - preferences: tuple[SectionPreference, ...] = field(default_factory=tuple) | ||
| 68 | - tag: str | None = None | ||
| 69 | - | ||
| 70 | - | ||
def filter_kinds(
    sections: tuple[Section, ...], kinds: tuple[SectionKind, ...]
) -> tuple[Section, ...]:
    """Keep only the sections whose ``kind`` appears in ``kinds``."""
    wanted = frozenset(kinds)
    return tuple(section for section in sections if section.kind in wanted)
sway/src/dlm_sway/integrations/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""Optional integrations with upstream fine-tuning tools.""" | ||
sway/src/dlm_sway/integrations/dlm/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""DLM project integration. Imports the ``dlm`` package; requires ``dlm-sway[dlm]``.""" | ||
sway/src/dlm_sway/integrations/dlm/autogen.pydeleted@@ -1,191 +0,0 @@ | |||
| 1 | -"""Auto-generate a ``sway.yaml`` from a ``.dlm`` document. | ||
| 2 | - | ||
| 3 | -Walks the parsed sections and emits one entry per primitive sway ships: | ||
| 4 | -the full 11-primitive battery wired up against the document's own | ||
| 5 | -content. The result is a YAML artifact the user commits alongside their | ||
| 6 | -``.dlm`` and diffs in PRs. | ||
| 7 | - | ||
| 8 | -The generated spec includes a ``dlm_source`` field that the suite loader | ||
| 9 | -uses to pick up :class:`~dlm_sway.core.sections.Section` data at run | ||
| 10 | -time — probes that need sections (B1, B3, C3) then work against the | ||
| 11 | -typed structure instead of re-parsing text. | ||
| 12 | -""" | ||
| 13 | - | ||
| 14 | -from __future__ import annotations | ||
| 15 | - | ||
| 16 | -from pathlib import Path | ||
| 17 | -from typing import Any | ||
| 18 | - | ||
| 19 | -import yaml | ||
| 20 | - | ||
| 21 | -from dlm_sway.core.errors import SwayError | ||
| 22 | -from dlm_sway.core.sections import Section | ||
| 23 | -from dlm_sway.integrations.dlm.resolver import DlmHandle, resolve_dlm | ||
| 24 | - | ||
| 25 | - | ||
def write_sway_yaml(dlm_path: Path, out: Path) -> None:
    """Resolve ``dlm_path``, assemble the suite spec, and dump it as YAML to ``out``."""
    resolved = resolve_dlm(dlm_path)
    # A suite spec is meaningless without a trained adapter to probe.
    if resolved.adapter_path is None:
        raise SwayError(
            f"{dlm_path}: no trained adapter found at ~/.dlm/store/{resolved.dlm_id}/adapter; "
            "train the document with `dlm train` before generating a sway suite."
        )
    spec = build_spec_dict(resolved, dlm_source=str(dlm_path.resolve()))
    dumped = yaml.safe_dump(spec, sort_keys=False)
    out.write_text(dumped, encoding="utf-8")
| 36 | - | ||
| 37 | - | ||
def build_spec_dict(handle: DlmHandle, *, dlm_source: str | None = None) -> dict[str, Any]:
    """Build a sway.yaml-shaped dict from a :class:`DlmHandle`."""
    adapter = str(handle.adapter_path) if handle.adapter_path else None
    spec: dict[str, Any] = {
        "version": 1,
        "models": {
            "base": {"kind": "hf", "base": handle.base_model},
            "ft": {"kind": "hf", "base": handle.base_model, "adapter": adapter},
        },
        "defaults": {"seed": 0, "differential": True},
        "suite": _build_suite(handle.sections),
    }
    # Recorded so the suite loader can pick up typed Section data at run time.
    if dlm_source is not None:
        spec["dlm_source"] = dlm_source
    return spec
| 55 | - | ||
| 56 | - | ||
def _build_suite(sections: tuple[Section, ...]) -> list[dict[str, Any]]:
    """Assemble the full probe battery for the given sections.

    The ordering matters: ``null_adapter`` first so every downstream
    probe's z-score threshold has stats to consult.

    Parameters
    ----------
    sections:
        Typed sections from the resolved document. Which probes get
        emitted depends on which section kinds (and Q/A / preference
        payloads) are present.

    Returns
    -------
    list[dict[str, Any]]
        YAML-ready probe entries, one dict per probe.
    """
    # All (prompt, gold) pairs from INSTRUCTION sections, in document order.
    instruction_probes: list[tuple[str, str]] = [
        (p.prompt, p.gold) for s in sections if s.kind == "instruction" for p in s.probes
    ]
    prose_prompts: list[str] = []
    for s in sections:
        if s.kind == "prose" and s.content.strip():
            # Use the section's leading sentence as a natural completion prompt.
            first_sentence = s.content.split(".")[0].strip()
            if first_sentence:
                prose_prompts.append(first_sentence + ".")

    # Prefer instruction prompts for KL (falling back to prose openers),
    # and prose openers for style (falling back to instruction prompts).
    kl_prompts = [q for q, _ in instruction_probes][:16] or prose_prompts[:16]
    style_prompts = prose_prompts[:8] or [q for q, _ in instruction_probes][:8]

    suite: list[dict[str, Any]] = []

    # Baseline calibration — always first.
    suite.append({"name": "null_baseline", "kind": "null_adapter", "runs": 3})

    # Adherence.
    if kl_prompts:
        suite.append(
            {
                "name": "delta_kl_doc",
                "kind": "delta_kl",
                "prompts": kl_prompts,
                "assert_mean_gte": 0.02,
            }
        )
    if instruction_probes:
        suite.append(
            {
                "name": "revert_check",
                "kind": "adapter_revert",
                "cases": [
                    {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)}
                    for q, a in instruction_probes[:8]
                ],
                "assert_revert_rate_lt": 0.3,
            }
        )
    if kl_prompts:
        suite.append(
            {
                "name": "prompt_collapse",
                "kind": "prompt_collapse",
                "prompts": kl_prompts[:4],
                "context_lengths": [0, 256, 512, 1024],
                "assert_half_life_tokens": 300,
            }
        )

    # Attribution. Per-section attribution needs at least two sections
    # to have anything to contrast.
    if len(sections) >= 2:
        suite.append(
            {
                "name": "section_attribution",
                "kind": "section_internalization",
                "per_section_threshold": 0.05,
            }
        )
    if instruction_probes:
        suite.append(
            {
                "name": "paraphrase_invariance",
                "kind": "paraphrase_invariance",
                "cases": [
                    {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)}
                    for q, a in instruction_probes[:6]
                ],
            }
        )
    has_preferences = any(s.kind == "preference" and s.preferences for s in sections)
    if has_preferences:
        suite.append(
            {
                "name": "preference_flip",
                "kind": "preference_flip",
                "assert_flip_rate_gte": 0.7,
            }
        )

    # Calibration. general_knowledge is unconditional — it needs no
    # document content.
    if style_prompts:
        suite.append(
            {
                "name": "style_shift",
                "kind": "style_fingerprint",
                "prompts": style_prompts,
            }
        )
    suite.append({"name": "general_knowledge", "kind": "calibration_drift"})
    if any(s.kind == "prose" for s in sections):
        suite.append(
            {
                "name": "verbatim_leak",
                "kind": "leakage",
                "prefix_chars": 128,
                "continuation_chars": 256,
            }
        )

    # Signature ablation — goes last because it's the most expensive.
    if kl_prompts:
        suite.append(
            {
                "name": "adapter_ablation",
                "kind": "adapter_ablation",
                "prompts": kl_prompts[:6],
                "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
            }
        )

    return suite
| 177 | - | ||
| 178 | - | ||
def _auto_paraphrases(prompt: str) -> list[str]:
    """Small, deterministic paraphrase set used when authors don't supply one.

    Purely heuristic — just enough to detect "did the model memorize
    the exact wording". Real paraphrase generation lives behind the
    ``semsim`` extra.
    """
    core = prompt.rstrip("?. ")
    return [
        f"Could you explain: {core}?",
        f"I'd like to know — {core}.",
        f"Please describe: {core}.",
    ]
sway/src/dlm_sway/integrations/dlm/resolver.pydeleted@@ -1,243 +0,0 @@ | |||
| 1 | -"""Resolve a ``.dlm`` file to the artifacts sway needs. | ||
| 2 | - | ||
| 3 | -Imports ``dlm.*`` — requires the ``dlm-sway[dlm]`` extra. Everything | ||
| 4 | -outside this package is oblivious to dlm's internal shape; the bridge | ||
| 5 | -is the only place that knows, e.g., that a dlm section carries a | ||
| 6 | -``kind`` field named ``type`` or that adapters live at | ||
| 7 | -``adapter/versions/vNNNN/``. | ||
| 8 | -""" | ||
| 9 | - | ||
| 10 | -from __future__ import annotations | ||
| 11 | - | ||
| 12 | -import hashlib | ||
| 13 | -from dataclasses import dataclass | ||
| 14 | -from pathlib import Path | ||
| 15 | - | ||
| 16 | -from dlm_sway.core.errors import SwayError | ||
| 17 | -from dlm_sway.core.sections import ( | ||
| 18 | - Section, | ||
| 19 | - SectionKind, | ||
| 20 | - SectionPreference, | ||
| 21 | - SectionProbe, | ||
| 22 | -) | ||
| 23 | - | ||
| 24 | - | ||
| 25 | -@dataclass(frozen=True, slots=True) | ||
| 26 | -class DlmHandle: | ||
| 27 | - """Everything the sway bridge pulls out of a ``.dlm`` file. | ||
| 28 | - | ||
| 29 | - Attributes | ||
| 30 | - ---------- | ||
| 31 | - dlm_id: | ||
| 32 | - Stable identifier from the frontmatter. | ||
| 33 | - base_model: | ||
| 34 | - Either a HF id (``qwen2.5-1.5b``) or an ``hf:org/name`` escape | ||
| 35 | - hatch, taken verbatim from the frontmatter. | ||
| 36 | - adapter_path: | ||
| 37 | - Directory containing the current trained PEFT adapter (resolved | ||
| 38 | - via dlm's own ``StorePath.for_dlm``). ``None`` if the document | ||
| 39 | - hasn't been trained yet. | ||
| 40 | - sections: | ||
| 41 | - Typed sections ready for sway's probes. | ||
| 42 | - doc_text: | ||
| 43 | - Concatenated raw content of all sections. Used by probes that | ||
| 44 | - need a whole-document stylistic reference (C1). | ||
| 45 | - """ | ||
| 46 | - | ||
| 47 | - dlm_id: str | ||
| 48 | - base_model: str | ||
| 49 | - adapter_path: Path | None | ||
| 50 | - sections: tuple[Section, ...] | ||
| 51 | - doc_text: str | ||
| 52 | - | ||
| 53 | - | ||
def resolve_dlm(dlm_path: Path) -> DlmHandle:
    """Parse ``dlm_path`` and return a :class:`DlmHandle`.

    Raises :class:`~dlm_sway.core.errors.SwayError` with a clear message
    when the file is malformed or when the resolved adapter path doesn't
    exist on disk.
    """
    # Deferred import: the dlm extra is optional, so fail with guidance.
    try:
        from dlm.doc.parser import parse_file as dlm_parse_file
    except ImportError as exc:
        raise SwayError("dlm package not installed — run: pip install 'dlm-sway[dlm]'") from exc

    parsed = dlm_parse_file(dlm_path)
    frontmatter = parsed.frontmatter
    translated = tuple(_translate_section(raw) for raw in parsed.sections)

    return DlmHandle(
        dlm_id=frontmatter.dlm_id,
        base_model=_resolve_base_model_to_hf_id(frontmatter.base_model),
        adapter_path=_resolve_adapter_path(frontmatter.dlm_id),
        sections=translated,
        doc_text="\n\n".join(section.content for section in translated),
    )
| 81 | - | ||
| 82 | - | ||
def _resolve_base_model_to_hf_id(base_model: str) -> str:
    """Translate dlm's base-model *key* to a HuggingFace repo id.

    dlm's frontmatter stores registry keys like ``smollm2-135m`` which
    resolve to ``HuggingFaceTB/SmolLM2-135M-Instruct``. sway's backends
    call ``AutoModelForCausalLM.from_pretrained`` directly and need the
    HF id. The ``hf:org/name`` escape hatch passes through unchanged;
    any resolution failure falls back to returning the raw key.
    """
    if base_model.startswith("hf:"):
        return base_model.removeprefix("hf:")
    try:
        from dlm.base_models import resolve as resolve_base
    except ImportError:
        # dlm not installed — let the backend try the key verbatim.
        return base_model
    try:
        spec = resolve_base(base_model)
    except Exception:  # noqa: BLE001 — unknown dlm errors
        return base_model
    hf_id = getattr(spec, "hf_id", None)
    return str(hf_id) if hf_id else base_model
| 103 | - | ||
| 104 | - | ||
def _resolve_adapter_path(dlm_id: str) -> Path | None:
    """Locate the current adapter directory for ``dlm_id``.

    Uses dlm's module-level ``for_dlm`` helper if available, else falls
    back to the canonical ``~/.dlm/store/<dlm_id>/adapter/current.txt``
    pointer. Returns ``None`` if no adapter has been trained yet.
    """
    # Primary path: use dlm's own store-path helpers.
    try:
        from dlm.store.paths import for_dlm as _for_dlm
    except ImportError:
        _for_dlm = None

    if _for_dlm is not None:
        try:
            store = _for_dlm(dlm_id)
        except Exception:  # noqa: BLE001 — unknown dlm exception shapes
            store = None
        if store is not None:
            try:
                resolved = store.resolve_current_adapter()
            except (AttributeError, FileNotFoundError):
                resolved = None
            # Re-check on disk: dlm may hand back a stale pointer.
            if resolved is not None and Path(resolved).exists():
                return Path(resolved)

    # Manual fallback. The ``current.txt`` pointer is relative to the
    # **store root**, not to current.txt's parent dir — so go up one level.
    import os

    home = Path(os.environ.get("DLM_HOME", "~/.dlm")).expanduser()
    store_root = home / "store" / dlm_id
    current_file = store_root / "adapter" / "current.txt"
    if current_file.exists():
        pointer = current_file.read_text(encoding="utf-8").strip()
        candidate = (store_root / pointer).resolve()
        if candidate.exists():
            return candidate
    # Nothing found anywhere: the document has no trained adapter yet.
    return None
| 144 | - | ||
| 145 | - | ||
def _translate_section(dlm_section: object) -> Section:
    """Adapt a ``dlm.doc.sections.Section`` to sway's section type.

    dlm's Section dataclass names the discriminator ``type`` (not
    ``kind``) and keeps instruction/preference content as raw markdown;
    dlm's dedicated parsers (``parse_instruction_body``,
    ``parse_preference_body``) are reused downstream so future dlm
    syntax additions land in sway for free.
    """
    # Current dlm spells the attribute ``type``; older revisions said ``kind``.
    kind = _normalize_kind(getattr(dlm_section, "type", getattr(dlm_section, "kind", None)))
    content = str(getattr(dlm_section, "content", ""))
    explicit_id = getattr(dlm_section, "section_id", None) or getattr(dlm_section, "id", None)
    section_id = str(explicit_id or _content_hash(content))
    tag = getattr(dlm_section, "tag", None)

    probes: tuple[SectionProbe, ...] = ()
    preferences: tuple[SectionPreference, ...] = ()
    if kind == "instruction":
        probes = tuple(_parse_instruction(content, section_id=section_id))
    elif kind == "preference":
        preferences = tuple(_parse_preference(content, section_id=section_id))

    return Section(
        id=section_id,
        kind=kind,
        content=content,
        probes=probes,
        preferences=preferences,
        tag=tag if isinstance(tag, str) else None,
    )
| 181 | - | ||
| 182 | - | ||
| 183 | -def _normalize_kind(raw: object) -> SectionKind: | ||
| 184 | - """Map dlm's SectionType/str to sway's lowercase kind.""" | ||
| 185 | - if raw is None: | ||
| 186 | - return "prose" | ||
| 187 | - value = str(raw).lower() | ||
| 188 | - # dlm uses uppercase StrEnum values like "PROSE"; normalize. | ||
| 189 | - if value.endswith("prose") or "prose" in value: | ||
| 190 | - return "prose" | ||
| 191 | - if "instruction" in value: | ||
| 192 | - return "instruction" | ||
| 193 | - if "preference" in value: | ||
| 194 | - return "preference" | ||
| 195 | - return "prose" | ||
| 196 | - | ||
| 197 | - | ||
| 198 | -def _parse_instruction(content: str, *, section_id: str) -> list[SectionProbe]: | ||
| 199 | - """Pull (Q, A) pairs out of a dlm INSTRUCTION section body. | ||
| 200 | - | ||
| 201 | - Delegates to dlm's own ``parse_instruction_body`` so syntax additions | ||
| 202 | - land in sway without code changes here. Falls back to an empty list | ||
| 203 | - on parse errors — the probe will fail gracefully. | ||
| 204 | - """ | ||
| 205 | - try: | ||
| 206 | - from dlm.data.instruction_parser import parse_instruction_body | ||
| 207 | - except ImportError: | ||
| 208 | - return [] | ||
| 209 | - try: | ||
| 210 | - pairs = parse_instruction_body(content, section_id=section_id) | ||
| 211 | - except Exception: # noqa: BLE001 — dlm raises InstructionParseError | ||
| 212 | - return [] | ||
| 213 | - out: list[SectionProbe] = [] | ||
| 214 | - for p in pairs: | ||
| 215 | - q = getattr(p, "question", getattr(p, "prompt", "")) | ||
| 216 | - a = getattr(p, "answer", getattr(p, "gold", "")) | ||
| 217 | - if q and a: | ||
| 218 | - out.append(SectionProbe(prompt=str(q), gold=str(a))) | ||
| 219 | - return out | ||
| 220 | - | ||
| 221 | - | ||
| 222 | -def _parse_preference(content: str, *, section_id: str) -> list[SectionPreference]: | ||
| 223 | - """Pull (prompt, chosen, rejected) triples out of a PREFERENCE body.""" | ||
| 224 | - try: | ||
| 225 | - from dlm.data.preference_parser import parse_preference_body | ||
| 226 | - except ImportError: | ||
| 227 | - return [] | ||
| 228 | - try: | ||
| 229 | - triples = parse_preference_body(content, section_id=section_id) | ||
| 230 | - except Exception: # noqa: BLE001 — dlm raises PreferenceParseError | ||
| 231 | - return [] | ||
| 232 | - out: list[SectionPreference] = [] | ||
| 233 | - for t in triples: | ||
| 234 | - p = str(getattr(t, "prompt", "")) | ||
| 235 | - c = str(getattr(t, "chosen", "")) | ||
| 236 | - rej = str(getattr(t, "rejected", "")) | ||
| 237 | - if p and c and rej: | ||
| 238 | - out.append(SectionPreference(prompt=p, chosen=c, rejected=rej)) | ||
| 239 | - return out | ||
| 240 | - | ||
| 241 | - | ||
| 242 | -def _content_hash(content: str) -> str: | ||
| 243 | - return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16] | ||
sway/src/dlm_sway/probes/__init__.pydeleted@@ -1,27 +0,0 @@ | |||
| 1 | -"""Probe primitives. Each module in this package implements one primitive. | ||
| 2 | - | ||
| 3 | -Importing this package eagerly imports every probe module so their | ||
| 4 | -``__init_subclass__`` hooks populate the registry. If you're hitting | ||
| 5 | -"unknown probe kind" from :func:`dlm_sway.probes.base.build_probe`, the | ||
| 6 | -fix is to ``import dlm_sway.probes`` before building the probe — which | ||
| 7 | -this ``__init__`` does for you. | ||
| 8 | -""" | ||
| 9 | - | ||
| 10 | -from __future__ import annotations | ||
| 11 | - | ||
| 12 | -# Register every shipped probe with the central registry by importing | ||
| 13 | -# its module. Order is not load-bearing for registration but matches the | ||
| 14 | -# categorical grouping in :mod:`dlm_sway.core.result`. | ||
| 15 | -from dlm_sway.probes import ( # noqa: F401 — imports register the probes | ||
| 16 | - adapter_ablation, | ||
| 17 | - adapter_revert, | ||
| 18 | - calibration_drift, | ||
| 19 | - delta_kl, | ||
| 20 | - leakage, | ||
| 21 | - null_adapter, | ||
| 22 | - paraphrase_invariance, | ||
| 23 | - preference_flip, | ||
| 24 | - prompt_collapse, | ||
| 25 | - section_internalization, | ||
| 26 | - style_fingerprint, | ||
| 27 | -) | ||
sway/src/dlm_sway/probes/_calibration_pack.pydeleted@@ -1,63 +0,0 @@ | |||
| 1 | -"""A small, built-in general-knowledge probe pack for C2. | ||
| 2 | - | ||
| 3 | -Each item is a ``(prompt, gold)`` pair where ``gold`` is the next few | ||
| 4 | -tokens a competent base model should assign high probability to. The | ||
| 5 | -items are deliberately *factually trivial* — the point isn't "does the | ||
| 6 | -model know this?" but "did the fine-tune forget this?" — so the pack | ||
| 7 | -skews toward grade-school geography, chemistry, arithmetic, and | ||
| 8 | -high-frequency idiom. | ||
| 9 | - | ||
| 10 | -A real v1.0 will ship a 200-item pack sliced from TriviaQA + SQuAD + | ||
| 11 | -OpenBookQA. This 30-item seed lets the probe ship today and catches the | ||
| 12 | -most egregious over-fit cases. | ||
| 13 | -""" | ||
| 14 | - | ||
| 15 | -from __future__ import annotations | ||
| 16 | - | ||
| 17 | -from typing import Final | ||
| 18 | - | ||
# One evaluation item: (prompt, gold continuation). Note every gold
# string deliberately begins with a leading space — presumably so it
# aligns with tokenizer word boundaries; confirm against the scorer.
CalibrationItem = tuple[str, str]

BUILT_IN_PACK: Final[tuple[CalibrationItem, ...]] = (
    # Geography
    ("The capital of France is", " Paris"),
    ("The capital of Japan is", " Tokyo"),
    ("The largest ocean on Earth is the", " Pacific"),
    ("Mount Everest is located on the border of Nepal and", " China"),
    ("The longest river in South America is the", " Amazon"),
    # Natural sciences
    ("Water freezes at zero degrees", " Celsius"),
    ("The chemical symbol for gold is", " Au"),
    ("Light travels faster than", " sound"),
    ("Plants convert sunlight into energy through", " photosynthesis"),
    ("The Earth orbits around the", " Sun"),
    # Arithmetic
    ("Two plus two equals", " four"),
    ("Ten times ten equals", " one hundred"),
    ("Half of one hundred is", " fifty"),
    ("A dozen means", " twelve"),
    # Language and idiom
    ("A rose by any other name would smell as", " sweet"),
    ("To be or not to be, that is the", " question"),
    ("The early bird catches the", " worm"),
    ("Actions speak louder than", " words"),
    ("A picture is worth a thousand", " words"),
    # History
    ("World War II ended in the year", " 1945"),
    ("The first president of the United States was", " George Washington"),
    ("The Berlin Wall fell in", " 1989"),
    # Biology
    ("Humans have twenty", " fingers and toes"),
    ("The human body has two", " lungs"),
    ("Blood is pumped through the body by the", " heart"),
    # Technology
    ("HTML stands for HyperText", " Markup Language"),
    ("The World Wide Web was invented by Tim", " Berners-Lee"),
    # Miscellaneous
    ("One year has", " 365 days"),
    ("A week has seven", " days"),
    ("There are seven colors in a", " rainbow"),
)
"""30 items covering geography, science, arithmetic, language, history,
biology, and technology. Pulled from public-domain grade-school facts so
there's no licensing concern about shipping with the wheel."""
sway/src/dlm_sway/probes/_divergence.pydeleted@@ -1,102 +0,0 @@ | |||
| 1 | -"""Shared math for divergence-based probes. | ||
| 2 | - | ||
| 3 | -Extracted so :mod:`delta_kl`, :mod:`adapter_ablation`, and any future | ||
| 4 | -probe operating on next-token distributions reuse the same aligned- | ||
| 5 | -top-k KL / JS computation. Having one implementation keeps the numerical | ||
| 6 | -treatment consistent across the report. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import math | ||
| 12 | -from typing import Literal | ||
| 13 | - | ||
| 14 | -import numpy as np | ||
| 15 | -from numpy.typing import NDArray | ||
| 16 | - | ||
| 17 | -from dlm_sway.core.scoring import TokenDist | ||
| 18 | - | ||
| 19 | -Divergence = Literal["kl", "js"] | ||
| 20 | - | ||
| 21 | - | ||
| 22 | -def aligned_probs( | ||
| 23 | - base: TokenDist, ft: TokenDist | ||
| 24 | -) -> tuple[NDArray[np.float64], NDArray[np.float64]]: | ||
| 25 | - """Return aligned probability vectors over the union of top-k tokens. | ||
| 26 | - | ||
| 27 | - Two ``TokenDist`` objects may surface different top-k indices if | ||
| 28 | - the two models disagree about the hot tokens. We build a shared | ||
| 29 | - support — ``union(base.token_ids, ft.token_ids)`` — and slot the | ||
| 30 | - known probabilities in. Unknown entries fall back to the | ||
| 31 | - per-distribution tail mass divided across the missing tokens, | ||
| 32 | - which is the maximum-entropy completion under the truncation. | ||
| 33 | - """ | ||
| 34 | - union_ids = np.union1d(base.token_ids, ft.token_ids) | ||
| 35 | - k = int(union_ids.size) | ||
| 36 | - | ||
| 37 | - base_probs = _to_support(base, union_ids, k) | ||
| 38 | - ft_probs = _to_support(ft, union_ids, k) | ||
| 39 | - | ||
| 40 | - # Normalize in case of floating noise from the fill-in. | ||
| 41 | - base_probs /= base_probs.sum() | ||
| 42 | - ft_probs /= ft_probs.sum() | ||
| 43 | - return base_probs, ft_probs | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -def _to_support(dist: TokenDist, support: NDArray[np.int64], k: int) -> NDArray[np.float64]: | ||
| 47 | - probs = np.exp(dist.logprobs.astype(np.float64)) | ||
| 48 | - out = np.zeros(k, dtype=np.float64) | ||
| 49 | - known_mass = float(probs.sum()) | ||
| 50 | - tail_mass = max(0.0, 1.0 - known_mass) | ||
| 51 | - | ||
| 52 | - id_to_idx = {int(tok): idx for idx, tok in enumerate(support.tolist())} | ||
| 53 | - missing = 0 | ||
| 54 | - for tok, p in zip(dist.token_ids.tolist(), probs.tolist(), strict=True): | ||
| 55 | - i = id_to_idx.get(int(tok)) | ||
| 56 | - if i is None: | ||
| 57 | - # Shouldn't happen given union construction. | ||
| 58 | - missing += 1 | ||
| 59 | - continue | ||
| 60 | - out[i] = float(p) | ||
| 61 | - | ||
| 62 | - # Spread the tail mass over the support entries that this dist | ||
| 63 | - # doesn't explicitly provide. Size of that set: | ||
| 64 | - n_unknown = int((out == 0.0).sum()) - missing | ||
| 65 | - if n_unknown > 0 and tail_mass > 0.0: | ||
| 66 | - per = tail_mass / n_unknown | ||
| 67 | - out[out == 0.0] = per | ||
| 68 | - | ||
| 69 | - return out | ||
| 70 | - | ||
| 71 | - | ||
def kl(p: NDArray[np.float64], q: NDArray[np.float64]) -> float:
    """KL(p || q) in nats. Zeros in ``p`` contribute nothing (0·log 0 = 0)."""
    support = p > 0.0
    q_floored = np.where(q > 0.0, q, 1e-12)
    terms = p[support] * (np.log(p[support]) - np.log(q_floored[support]))
    return float(terms.sum())


def js(p: NDArray[np.float64], q: NDArray[np.float64]) -> float:
    """Jensen-Shannon divergence: symmetric and bounded in [0, ln 2] nats.

    The fixed upper bound makes JS a friendlier default for thresholds
    than raw KL — no model-specific KL scale has to be known up front
    to pick a cutoff.
    """
    mid = 0.5 * (p + q)
    return 0.5 * (kl(p, mid) + kl(q, mid))
| 88 | - | ||
| 89 | - | ||
def divergence(base: TokenDist, ft: TokenDist, kind: Divergence = "js") -> float:
    """KL or JS between two ``TokenDist`` computed on a shared support."""
    p, q = aligned_probs(base, ft)
    if kind == "kl":
        # KL(ft || base) — "how much does ft diverge from base".
        return kl(q, p)
    if kind == "js":
        return js(p, q)
    raise ValueError(f"unknown divergence kind: {kind!r}")
| 98 | - | ||
| 99 | - | ||
def js_ln2() -> float:
    """Upper bound of JS in nats (ln 2); handy for normalization."""
    return math.log(2.0)
sway/src/dlm_sway/probes/adapter_ablation.pydeleted@@ -1,193 +0,0 @@ | |||
| 1 | -"""N2 AdapterAblation — the sway signature primitive. | ||
| 2 | - | ||
| 3 | -Scales the LoRA additive term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} | ||
| 4 | -and measures the mean divergence from the base distribution at each | ||
| 5 | -step. Fits a monotonic response curve; reports three shape metrics: | ||
| 6 | - | ||
| 7 | -- **linearity**: R² of a linear fit on ``(λ, mean_div)``. High means | ||
| 8 | - the adapter's effect scales predictably; low means it's "all or | ||
| 9 | - nothing" (degenerate). | ||
| 10 | -- **saturation_lambda**: the smallest λ at which divergence reaches | ||
| 11 | - 90% of the λ=1 value. Too low (<0.3) means the adapter fires at | ||
| 12 | - partial strength — fragile. Too high (>1.0) means the adapter is | ||
| 13 | - under-trained. | ||
| 14 | -- **overshoot**: divergence at λ=1.25 divided by λ=1.0. >1.05 is the | ||
| 15 | - healthy "pushing past 1 still moves the model" signal. An overshoot | ||
| 16 | - below 1.0 suggests collapse. | ||
| 17 | - | ||
| 18 | -This is the single novel primitive that no generic eval harness | ||
| 19 | -provides — sway's position next to the adapter math makes it possible. | ||
| 20 | - | ||
| 21 | -Requires the backend to implement | ||
| 22 | -:class:`~dlm_sway.core.scoring.ScalableDifferentialBackend`. Probes | ||
| 23 | -SKIP gracefully on backends that don't. | ||
| 24 | -""" | ||
| 25 | - | ||
| 26 | -from __future__ import annotations | ||
| 27 | - | ||
| 28 | -from typing import Literal | ||
| 29 | - | ||
| 30 | -import numpy as np | ||
| 31 | -from pydantic import Field | ||
| 32 | - | ||
| 33 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 34 | -from dlm_sway.core.scoring import ScalableDifferentialBackend | ||
| 35 | -from dlm_sway.probes._divergence import Divergence, divergence | ||
| 36 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 37 | - | ||
| 38 | - | ||
class AdapterAblationSpec(ProbeSpec):
    """Spec for the λ-sweep adapter-ablation probe (see module docstring)."""

    # Registry discriminator.
    kind: Literal["adapter_ablation"] = "adapter_ablation"
    # Prompts to score at each λ; an empty list makes run() return ERROR.
    prompts: list[str] = Field(default_factory=list)
    # λ sweep for the LoRA scale; at least 3 points are needed for the
    # curve-shape fit.
    lambdas: list[float] = Field(
        default_factory=lambda: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
        min_length=3,
    )
    # Divergence measure between reference and scaled distributions.
    divergence: Divergence = "js"
    # Top-k truncation of next-token distributions; None → RunContext's top_k.
    top_k: int | None = None
    # Healthy-band thresholds for the three shape metrics (R², sat_λ, overshoot).
    assert_linearity_gte: float = 0.85
    assert_saturation_between: tuple[float, float] = (0.3, 1.05)
    assert_overshoot_gte: float = 1.02
| 51 | - | ||
| 52 | - | ||
class AdapterAblationProbe(Probe):
    """Runs the λ sweep and scores the divergence curve's shape."""

    kind = "adapter_ablation"
    spec_cls = AdapterAblationSpec
    category = "ablation"

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        """Sweep the adapter scale and grade the response curve.

        Returns ERROR when no prompts are given, SKIP when the backend
        cannot scale the adapter, otherwise PASS/FAIL on the three
        shape metrics (linearity, saturation λ, overshoot).
        """
        assert isinstance(spec, AdapterAblationSpec)
        if not spec.prompts:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.ERROR,
                score=None,
                message="no prompts provided",
            )
        if not isinstance(ctx.backend, ScalableDifferentialBackend):
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message=(
                    "backend does not implement ScalableDifferentialBackend — "
                    "adapter ablation requires LoRA-scale access"
                ),
            )

        top_k = spec.top_k if spec.top_k is not None else ctx.top_k

        # Reference distributions at the smallest λ (adapter scaled to
        # zero → base). The reference is invariant across the sweep, so
        # compute it once per prompt up front instead of once per
        # (λ, prompt) pair as the naive nested loop would.
        # NOTE(review): this assumes next_token_dist is deterministic at
        # a fixed scale and prompt (it scores, it doesn't sample).
        lam_zero = min(spec.lambdas)
        with ctx.backend.as_scaled_adapter(lam_zero) as ref:
            ref_dists = [ref.next_token_dist(p, top_k=top_k) for p in spec.prompts]

        per_lambda: list[float] = []
        for lam in spec.lambdas:
            divs_for_lam: list[float] = []
            with ctx.backend.as_scaled_adapter(lam) as scaled:
                for prompt, ref_dist in zip(spec.prompts, ref_dists, strict=True):
                    scaled_dist = scaled.next_token_dist(prompt, top_k=top_k)
                    divs_for_lam.append(divergence(ref_dist, scaled_dist, kind=spec.divergence))
            per_lambda.append(float(np.mean(divs_for_lam)))

        lambdas_arr = np.asarray(spec.lambdas, dtype=np.float64)
        divs_arr = np.asarray(per_lambda, dtype=np.float64)

        linearity = _r_squared(lambdas_arr, divs_arr)
        saturation_lambda = _saturation_lambda(lambdas_arr, divs_arr)
        overshoot = _overshoot(lambdas_arr, divs_arr)

        # Pass when all three shape metrics land in their healthy bands.
        sat_lo, sat_hi = spec.assert_saturation_between
        ok_lin = linearity >= spec.assert_linearity_gte
        ok_sat = saturation_lambda is not None and sat_lo <= saturation_lambda <= sat_hi
        ok_over = overshoot >= spec.assert_overshoot_gte
        verdict = Verdict.PASS if (ok_lin and ok_sat and ok_over) else Verdict.FAIL

        # Blend the metrics into one clamped [0, 1] score (40/30/30 split).
        lin_score = max(0.0, min(1.0, linearity / max(spec.assert_linearity_gte, 1e-6)))
        over_score = max(0.0, min(1.0, (overshoot - 1.0) / 0.2))
        sat_score = 1.0 if ok_sat else 0.3
        score = 0.4 * lin_score + 0.3 * sat_score + 0.3 * over_score

        return ProbeResult(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=linearity,
            evidence={
                "lambdas": spec.lambdas,
                "mean_divergence_per_lambda": per_lambda,
                "linearity": linearity,
                "saturation_lambda": saturation_lambda,
                "overshoot": overshoot,
                "passed_linearity": ok_lin,
                "passed_saturation": ok_sat,
                "passed_overshoot": ok_over,
                "weight": spec.weight,
            },
            message=(
                f"R²={linearity:.2f}, sat_λ={saturation_lambda:.2f} "
                f"({'in' if ok_sat else 'out of'} band), overshoot={overshoot:.2f}"
                if saturation_lambda is not None
                else f"R²={linearity:.2f}, saturation undetected, overshoot={overshoot:.2f}"
            ),
        )
| 138 | - | ||
| 139 | - | ||
| 140 | -def _r_squared(x: np.ndarray, y: np.ndarray) -> float: | ||
| 141 | - """Coefficient of determination for a linear fit of ``y`` on ``x``.""" | ||
| 142 | - if x.size < 2: | ||
| 143 | - return 0.0 | ||
| 144 | - xm = float(x.mean()) | ||
| 145 | - ym = float(y.mean()) | ||
| 146 | - denom = float(((x - xm) ** 2).sum()) | ||
| 147 | - if denom == 0.0: | ||
| 148 | - return 0.0 | ||
| 149 | - slope = float(((x - xm) * (y - ym)).sum()) / denom | ||
| 150 | - intercept = ym - slope * xm | ||
| 151 | - y_pred = slope * x + intercept | ||
| 152 | - ss_res = float(((y - y_pred) ** 2).sum()) | ||
| 153 | - ss_tot = float(((y - ym) ** 2).sum()) | ||
| 154 | - if ss_tot == 0.0: | ||
| 155 | - return 1.0 | ||
| 156 | - return max(0.0, 1.0 - ss_res / ss_tot) | ||
| 157 | - | ||
| 158 | - | ||
| 159 | -def _saturation_lambda(lambdas: np.ndarray, divs: np.ndarray) -> float | None: | ||
| 160 | - """Smallest λ ≤ 1.0 at which divergence reaches 90% of div(λ=1).""" | ||
| 161 | - # Locate the index of λ=1.0 (or the closest entry ≤ 1.0). | ||
| 162 | - candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0] | ||
| 163 | - if candidates.size == 0: | ||
| 164 | - # Fall back to the largest λ ≤ 1.0. | ||
| 165 | - mask = lambdas <= 1.0 | ||
| 166 | - if not mask.any(): | ||
| 167 | - return None | ||
| 168 | - idx1 = int(np.argmax(lambdas * mask)) | ||
| 169 | - else: | ||
| 170 | - idx1 = int(candidates[0]) | ||
| 171 | - target = 0.9 * float(divs[idx1]) | ||
| 172 | - if target <= 0: | ||
| 173 | - return None | ||
| 174 | - for lam, d in zip(lambdas[: idx1 + 1], divs[: idx1 + 1], strict=False): | ||
| 175 | - if d >= target: | ||
| 176 | - return float(lam) | ||
| 177 | - return None | ||
| 178 | - | ||
| 179 | - | ||
| 180 | -def _overshoot(lambdas: np.ndarray, divs: np.ndarray) -> float: | ||
| 181 | - """``div(λ_max) / div(λ=1)``. Returns 1.0 if λ_max ≤ 1.0.""" | ||
| 182 | - idx_max = int(np.argmax(lambdas)) | ||
| 183 | - candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0] | ||
| 184 | - if candidates.size == 0: | ||
| 185 | - return 1.0 | ||
| 186 | - idx1 = int(candidates[0]) | ||
| 187 | - if idx_max == idx1: | ||
| 188 | - return 1.0 | ||
| 189 | - d1 = float(divs[idx1]) | ||
| 190 | - dmax = float(divs[idx_max]) | ||
| 191 | - if d1 <= 0: | ||
| 192 | - return 1.0 | ||
| 193 | - return dmax / d1 | ||
sway/src/dlm_sway/probes/adapter_revert.pydeleted@@ -1,178 +0,0 @@ | |||
| 1 | -"""A2 AdapterRevert — does the fine-tuned model drift back to base under pressure? | ||
| 2 | - | ||
| 3 | -For each test case the user provides a prompt, a "gold" answer (the | ||
| 4 | -adapter's intended response), and one or more adversarial paraphrases of | ||
| 5 | -the prompt. We generate base-model and ft-model completions on every | ||
| 6 | -paraphrase and ask: does the ft output cluster semantically with the | ||
| 7 | -base's output (revert) or with the gold (adhere)? | ||
| 8 | - | ||
| 9 | -Signal: ``revert_rate`` = fraction of (case, paraphrase) pairs where | ||
| 10 | -``cos(ft, base) > cos(ft, gold)``. A healthy fine-tune holds below 25%. | ||
| 11 | - | ||
| 12 | -Needs sentence embeddings. Without the ``semsim`` extra installed the | ||
| 13 | -probe returns :attr:`Verdict.SKIP` with a pip hint — deterministic | ||
| 14 | -n-gram fallbacks don't carry semantic equivalence reliably enough to | ||
| 15 | -drive a revert decision, and we'd rather be honest than lossy. | ||
| 16 | -""" | ||
| 17 | - | ||
| 18 | -from __future__ import annotations | ||
| 19 | - | ||
| 20 | -from typing import Any, Literal | ||
| 21 | - | ||
| 22 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 23 | - | ||
| 24 | -from dlm_sway.core.errors import BackendNotAvailableError | ||
| 25 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 26 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 27 | - | ||
| 28 | - | ||
class AdapterRevertCase(BaseModel):
    """One revert test case: a prompt, its gold answer, and adversarial paraphrases."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    prompt: str
    gold: str
    """What the adapter is supposed to produce."""
    # NOTE(review): the implicit empty default conflicts with
    # min_length=1 unless callers always supply paraphrases — pydantic
    # does not validate defaults unless told to; confirm intent.
    paraphrases: list[str] = Field(default_factory=list, min_length=1)
    """At least one paraphrase is required — revert is observed under
    reframing, not on the original prompt."""
| 39 | - reframing, not on the original prompt.""" | ||
| 40 | - | ||
| 41 | - | ||
class AdapterRevertSpec(ProbeSpec):
    """Spec for the adapter_revert probe (see module docstring)."""

    # Registry discriminator.
    kind: Literal["adapter_revert"] = "adapter_revert"
    # Revert cases to evaluate; an empty list makes run() return ERROR.
    cases: list[AdapterRevertCase] = Field(default_factory=list)
    # Generation budget per completion (base and fine-tuned alike).
    max_new_tokens: int = 64
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    """HF id of the embedder. Default is ~80 MB, CPU-friendly."""
    base_gold_similarity_cap: float = 0.75
    """Skip pairs where base and gold are trivially similar — those
    can't distinguish revert from adherence, and including them would
    inflate the revert rate with noise."""
    # PASS threshold: fraction of pairs where ft clusters with base.
    assert_revert_rate_lt: float = 0.25
| 54 | - | ||
class AdapterRevertProbe(Probe):
    """Measures whether fine-tuned outputs drift back toward base under paraphrase."""

    kind = "adapter_revert"
    spec_cls = AdapterRevertSpec
    category = "adherence"

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        """Score the revert rate across all (case, paraphrase) pairs.

        Returns ERROR with no cases, SKIP when the embedding extra is
        missing, WARN when every pair was dropped as trivially similar,
        else PASS/FAIL against ``assert_revert_rate_lt``.
        """
        assert isinstance(spec, AdapterRevertSpec)
        if not spec.cases:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.ERROR,
                score=None,
                message="no cases provided",
            )

        # No embedder installed → SKIP with the install hint rather
        # than a hard failure.
        try:
            embed = _load_embedder(spec.embedding_model)
        except BackendNotAvailableError as exc:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message=str(exc),
            )

        import numpy as np

        total = 0
        reverts = 0
        dropped_trivial = 0
        per_case: list[dict[str, Any]] = []
        for case in spec.cases:
            # One gold embedding per case, reused for every paraphrase.
            gold_vec = embed([case.gold])[0]
            for pp in case.paraphrases:
                # Generate both views' completions on the paraphrase,
                # then embed the two generations in one batch.
                with ctx.backend.as_base() as bv:
                    base_gen = bv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
                with ctx.backend.as_finetuned() as fv:
                    ft_gen = fv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
                vecs = embed([base_gen, ft_gen])
                base_vec, ft_vec = vecs[0], vecs[1]
                # Pairs where base already matches gold carry no
                # revert-vs-adhere signal — drop them (see spec docstring).
                base_gold = _cosine(base_vec, gold_vec)
                if base_gold > spec.base_gold_similarity_cap:
                    dropped_trivial += 1
                    continue
                cos_ft_base = _cosine(ft_vec, base_vec)
                cos_ft_gold = _cosine(ft_vec, gold_vec)
                total += 1
                # Revert = ft output sits semantically closer to base than to gold.
                if cos_ft_base > cos_ft_gold:
                    reverts += 1
                per_case.append(
                    {
                        "prompt": pp[:80],
                        "cos_ft_base": cos_ft_base,
                        "cos_ft_gold": cos_ft_gold,
                        "reverted": cos_ft_base > cos_ft_gold,
                    }
                )

        # Every pair was dropped as trivially similar → no verdict possible.
        if total == 0:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.WARN,
                score=0.5,
                message=(
                    f"all {dropped_trivial} cases had base≈gold (> "
                    f"{spec.base_gold_similarity_cap}) — no separable signal"
                ),
                evidence={"dropped_trivial": dropped_trivial, "weight": spec.weight},
            )

        rate = reverts / total
        verdict = Verdict.PASS if rate < spec.assert_revert_rate_lt else Verdict.FAIL
        # Linear score: 1.0 at rate 0, hitting 0.0 at the FAIL threshold.
        score = max(0.0, 1.0 - rate / max(spec.assert_revert_rate_lt, 1e-6))
        score = float(np.clip(score, 0.0, 1.0))

        return ProbeResult(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=rate,
            evidence={
                "revert_rate": rate,
                "reverts": reverts,
                "total": total,
                "dropped_trivial": dropped_trivial,
                "per_case": per_case[:8],  # cap to keep JSON bounded
                "weight": spec.weight,
            },
            message=f"revert_rate={rate:.2%} (reverts={reverts}/{total}, dropped_trivial={dropped_trivial})",
        )
| 149 | - | ||
| 150 | - | ||
def _load_embedder(model_id: str):  # type: ignore[no-untyped-def]
    """Return a callable ``list[str] -> np.ndarray`` of normalized embeddings."""
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        raise BackendNotAvailableError(
            "adapter_revert",
            extra="semsim",
            hint="adapter_revert relies on sentence embeddings.",
        ) from exc
    encoder = SentenceTransformer(model_id)

    def _embed(texts: list[str]):  # type: ignore[no-untyped-def]
        return encoder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

    return _embed
| 167 | - | ||
| 168 | - | ||
| 169 | -def _cosine(a: Any, b: Any) -> float: | ||
| 170 | - import numpy as np | ||
| 171 | - | ||
| 172 | - av = np.asarray(a, dtype=np.float64) | ||
| 173 | - bv = np.asarray(b, dtype=np.float64) | ||
| 174 | - na = float(np.linalg.norm(av)) | ||
| 175 | - nb = float(np.linalg.norm(bv)) | ||
| 176 | - if na == 0.0 or nb == 0.0: | ||
| 177 | - return 0.0 | ||
| 178 | - return float(np.dot(av, bv) / (na * nb)) | ||
sway/src/dlm_sway/probes/base.pydeleted@@ -1,131 +0,0 @@ | |||
| 1 | -"""Probe abstract base + per-kind registry. | ||
| 2 | - | ||
| 3 | -The registry is the extension point. Adding a new probe means: | ||
| 4 | - | ||
| 5 | -1. Subclass :class:`ProbeSpec` with a unique ``kind`` field (Literal). | ||
| 6 | -2. Subclass :class:`Probe` setting ``kind`` and ``spec_cls``. | ||
| 7 | -3. Importing the probe module at least once (its subclass hook registers | ||
| 8 | - itself). | ||
| 9 | - | ||
| 10 | -The runner uses :func:`build_probe` to map each raw spec dict to a | ||
| 11 | -``(Probe, ProbeSpec)`` pair. Validation errors are turned into | ||
| 12 | -:class:`~dlm_sway.core.errors.SpecValidationError` with the probe name | ||
| 13 | -as the source so error messages localize to the offending entry. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -from abc import ABC, abstractmethod | ||
| 19 | -from dataclasses import dataclass, field | ||
| 20 | -from typing import Any, ClassVar | ||
| 21 | - | ||
| 22 | -from pydantic import BaseModel, ConfigDict, ValidationError | ||
| 23 | - | ||
| 24 | -from dlm_sway.core.errors import SpecValidationError | ||
| 25 | -from dlm_sway.core.result import ProbeResult | ||
| 26 | -from dlm_sway.core.scoring import DifferentialBackend | ||
| 27 | -from dlm_sway.core.sections import Section | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -class ProbeSpec(BaseModel): | ||
| 31 | - """Common fields for every probe's spec entry in ``sway.yaml``.""" | ||
| 32 | - | ||
| 33 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 34 | - | ||
| 35 | - name: str | ||
| 36 | - """Unique within a suite; surfaces in the report.""" | ||
| 37 | - kind: str | ||
| 38 | - """Discriminator — must match a registered :class:`Probe` subclass.""" | ||
| 39 | - enabled: bool = True | ||
| 40 | - """If ``False`` the runner records a :class:`~dlm_sway.core.result.Verdict.SKIP`.""" | ||
| 41 | - weight: float = 1.0 | ||
| 42 | - """Weight inside the probe's component (adherence / attribution / …).""" | ||
| 43 | - | ||
| 44 | - | ||
| 45 | -@dataclass(frozen=True, slots=True) | ||
| 46 | -class RunContext: | ||
| 47 | - """What a probe can read beyond its own spec. | ||
| 48 | - | ||
| 49 | - Probes should receive exactly what they need and nothing more; fat | ||
| 50 | - contexts encourage coupling between unrelated probes. | ||
| 51 | - | ||
| 52 | - Attributes | ||
| 53 | - ---------- | ||
| 54 | - backend: | ||
| 55 | - The differential backend holding base + fine-tuned views. | ||
| 56 | - seed: | ||
| 57 | - Seed for deterministic probe RNGs (paraphrase sampling, etc). | ||
| 58 | - top_k: | ||
| 59 | - Default truncation for next-token distributions. | ||
| 60 | - sections: | ||
| 61 | - Optional list of typed sections (populated by the .dlm bridge; | ||
| 62 | - ``None`` when sway is invoked against bare HF+PEFT). | ||
| 63 | - doc_text: | ||
| 64 | - Raw document text, if available. | ||
| 65 | - null_stats: | ||
| 66 | - Null-adapter baseline stats for z-score calibration, keyed by | ||
| 67 | - probe *kind*. Populated by the runner after it's executed the | ||
| 68 | - ``null_adapter`` probe (if configured). | ||
| 69 | - """ | ||
| 70 | - | ||
| 71 | - backend: DifferentialBackend | ||
| 72 | - seed: int = 0 | ||
| 73 | - top_k: int = 256 | ||
| 74 | - sections: tuple[Section, ...] | None = None | ||
| 75 | - doc_text: str | None = None | ||
| 76 | - null_stats: dict[str, dict[str, float]] = field(default_factory=dict) | ||
| 77 | - | ||
| 78 | - | ||
| 79 | -_REGISTRY: dict[str, type[Probe]] = {} | ||
| 80 | - | ||
| 81 | - | ||
| 82 | -class Probe(ABC): | ||
| 83 | - """Concrete probe. One instance per probe spec in the suite.""" | ||
| 84 | - | ||
| 85 | - kind: ClassVar[str] | ||
| 86 | - """The string used in ``sway.yaml``'s ``kind`` field.""" | ||
| 87 | - spec_cls: ClassVar[type[ProbeSpec]] | ||
| 88 | - """The pydantic model class that validates this probe's spec.""" | ||
| 89 | - category: ClassVar[str] = "adherence" | ||
| 90 | - """One of: ``adherence``, ``attribution``, ``calibration``, | ||
| 91 | - ``ablation``, ``baseline``. Drives composite scoring.""" | ||
| 92 | - | ||
| 93 | - def __init_subclass__(cls, **kwargs: Any) -> None: | ||
| 94 | - super().__init_subclass__(**kwargs) | ||
| 95 | - # The abstract class itself has no `kind`; skip registration. | ||
| 96 | - if "kind" not in cls.__dict__: | ||
| 97 | - return | ||
| 98 | - kind = cls.kind | ||
| 99 | - if kind in _REGISTRY: | ||
| 100 | - raise ValueError(f"duplicate probe kind {kind!r}: {_REGISTRY[kind]!r} vs {cls!r}") | ||
| 101 | - _REGISTRY[kind] = cls | ||
| 102 | - | ||
| 103 | - @abstractmethod | ||
| 104 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: ... | ||
| 105 | - | ||
| 106 | - | ||
| 107 | -def registry() -> dict[str, type[Probe]]: | ||
| 108 | - """Read-only view of registered probes.""" | ||
| 109 | - return dict(_REGISTRY) | ||
| 110 | - | ||
| 111 | - | ||
| 112 | -def build_probe(raw: dict[str, Any]) -> tuple[Probe, ProbeSpec]: | ||
| 113 | - """Validate a raw YAML probe entry and return (Probe instance, spec).""" | ||
| 114 | - kind = raw.get("kind") | ||
| 115 | - if not isinstance(kind, str): | ||
| 116 | - raise SpecValidationError( | ||
| 117 | - "probe entry missing string 'kind' field", | ||
| 118 | - source=str(raw.get("name", "<unknown>")), | ||
| 119 | - ) | ||
| 120 | - if kind not in _REGISTRY: | ||
| 121 | - known = ", ".join(sorted(_REGISTRY)) | ||
| 122 | - raise SpecValidationError( | ||
| 123 | - f"unknown probe kind {kind!r} (registered: {known})", | ||
| 124 | - source=str(raw.get("name", "<unknown>")), | ||
| 125 | - ) | ||
| 126 | - probe_cls = _REGISTRY[kind] | ||
| 127 | - try: | ||
| 128 | - spec = probe_cls.spec_cls.model_validate(raw) | ||
| 129 | - except ValidationError as exc: | ||
| 130 | - raise SpecValidationError(str(exc), source=str(raw.get("name", "<unknown>"))) from exc | ||
| 131 | - return probe_cls(), spec | ||
sway/src/dlm_sway/probes/calibration_drift.pydeleted@@ -1,135 +0,0 @@ | |||
| 1 | -"""C2 CalibrationDrift — did we break general knowledge while fitting the doc? | ||
| 2 | - | ||
| 3 | -The classic small-doc fine-tune failure mode: the adapter learned the | ||
| 4 | -document so well that it forgot the world. C2 catches this by scoring | ||
| 5 | -base and ft on a packaged set of general-knowledge completions (the | ||
| 6 | -``BUILT_IN_PACK`` — a 30-item seed of public-domain grade-school facts) | ||
| 7 | -and flagging items whose per-token logprob regressed significantly. | ||
| 8 | - | ||
| 9 | -A healthy fine-tune: some items drift slightly (mild confidence shift, | ||
| 10 | -normal), but essentially none regress below a nat of slack. An over-fit | ||
| 11 | -fine-tune: 20%+ of items regress, the adapter has torched its ability | ||
| 12 | -to answer anything outside the document. | ||
| 13 | - | ||
| 14 | -Pass when ``fraction_regressed < assert_fraction_regressed_lt`` AND | ||
| 15 | -``mean_delta_nats >= assert_mean_delta_gte``. Both thresholds default | ||
| 16 | -to values that trigger on genuine damage but tolerate normal drift. | ||
| 17 | -""" | ||
| 18 | - | ||
| 19 | -from __future__ import annotations | ||
| 20 | - | ||
| 21 | -import statistics | ||
| 22 | -from typing import Literal | ||
| 23 | - | ||
| 24 | -from pydantic import Field | ||
| 25 | - | ||
| 26 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 27 | -from dlm_sway.probes._calibration_pack import BUILT_IN_PACK | ||
| 28 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -class CalibrationItemSpec(ProbeSpec): | ||
| 32 | - """Not used directly — documents the shape of an item override.""" | ||
| 33 | - | ||
| 34 | - kind: Literal["__calibration_item"] = "__calibration_item" | ||
| 35 | - prompt: str = "" | ||
| 36 | - gold: str = "" | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -class CalibrationDriftSpec(ProbeSpec): | ||
| 40 | - kind: Literal["calibration_drift"] = "calibration_drift" | ||
| 41 | - pack: Literal["builtin"] = "builtin" | ||
| 42 | - """Source of items. ``"builtin"`` uses :data:`BUILT_IN_PACK`. Custom | ||
| 43 | - packs will ship via a file reference in a later milestone.""" | ||
| 44 | - items_limit: int | None = None | ||
| 45 | - """If set, truncate the pack to this many items (for fast runs).""" | ||
| 46 | - assert_fraction_regressed_lt: float = 0.15 | ||
| 47 | - assert_mean_delta_gte: float = -0.5 | ||
| 48 | - """Mean per-token logprob delta (ft − base) across the pack. Slightly | ||
| 49 | - negative is tolerable; deeply negative is not.""" | ||
| 50 | - regression_nats: float = 1.0 | ||
| 51 | - """How many nats worse an item must get to count as regressed.""" | ||
| 52 | - items: list[tuple[str, str]] = Field(default_factory=list) | ||
| 53 | - """Optional inline override of the packaged items.""" | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -class CalibrationDriftProbe(Probe): | ||
| 57 | - kind = "calibration_drift" | ||
| 58 | - spec_cls = CalibrationDriftSpec | ||
| 59 | - category = "calibration" | ||
| 60 | - | ||
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 62 | - assert isinstance(spec, CalibrationDriftSpec) | ||
| 63 | - items = list(spec.items) if spec.items else list(BUILT_IN_PACK) | ||
| 64 | - if spec.items_limit is not None: | ||
| 65 | - items = items[: spec.items_limit] | ||
| 66 | - if not items: | ||
| 67 | - return ProbeResult( | ||
| 68 | - name=spec.name, | ||
| 69 | - kind=spec.kind, | ||
| 70 | - verdict=Verdict.ERROR, | ||
| 71 | - score=None, | ||
| 72 | - message="no calibration items", | ||
| 73 | - ) | ||
| 74 | - | ||
| 75 | - deltas: list[float] = [] | ||
| 76 | - regressed = 0 | ||
| 77 | - worst: list[dict[str, float | str]] = [] | ||
| 78 | - | ||
| 79 | - for prompt, gold in items: | ||
| 80 | - tokens = max(_token_estimate(gold), 1) | ||
| 81 | - with ctx.backend.as_base() as b: | ||
| 82 | - lp_base = b.logprob_of(prompt, gold) / tokens | ||
| 83 | - with ctx.backend.as_finetuned() as f: | ||
| 84 | - lp_ft = f.logprob_of(prompt, gold) / tokens | ||
| 85 | - delta = lp_ft - lp_base | ||
| 86 | - deltas.append(delta) | ||
| 87 | - if delta < -spec.regression_nats: | ||
| 88 | - regressed += 1 | ||
| 89 | - worst.append({"prompt": prompt, "gold": gold, "delta": delta}) | ||
| 90 | - | ||
| 91 | - # Surface the worst offenders — up to 5. | ||
| 92 | - worst.sort(key=lambda d: float(d["delta"])) | ||
| 93 | - worst = worst[:5] | ||
| 94 | - | ||
| 95 | - frac_regressed = regressed / len(items) | ||
| 96 | - mean_delta = statistics.fmean(deltas) | ||
| 97 | - | ||
| 98 | - passed = ( | ||
| 99 | - frac_regressed < spec.assert_fraction_regressed_lt | ||
| 100 | - and mean_delta >= spec.assert_mean_delta_gte | ||
| 101 | - ) | ||
| 102 | - verdict = Verdict.PASS if passed else Verdict.FAIL | ||
| 103 | - # Score: 1.0 at zero regression + zero drift, declining with either. | ||
| 104 | - regress_component = max( | ||
| 105 | - 0.0, 1.0 - frac_regressed / max(spec.assert_fraction_regressed_lt, 1e-6) | ||
| 106 | - ) | ||
| 107 | - drift_component = max(0.0, min(1.0, (mean_delta + 1.0) / 1.5)) | ||
| 108 | - score = 0.6 * regress_component + 0.4 * drift_component | ||
| 109 | - | ||
| 110 | - return ProbeResult( | ||
| 111 | - name=spec.name, | ||
| 112 | - kind=spec.kind, | ||
| 113 | - verdict=verdict, | ||
| 114 | - score=score, | ||
| 115 | - raw=frac_regressed, | ||
| 116 | - base_value=None, | ||
| 117 | - ft_value=mean_delta, | ||
| 118 | - evidence={ | ||
| 119 | - "fraction_regressed": frac_regressed, | ||
| 120 | - "mean_delta_nats": mean_delta, | ||
| 121 | - "regressed_count": regressed, | ||
| 122 | - "total_items": len(items), | ||
| 123 | - "worst_offenders": worst, | ||
| 124 | - "regression_nats_threshold": spec.regression_nats, | ||
| 125 | - "weight": spec.weight, | ||
| 126 | - }, | ||
| 127 | - message=( | ||
| 128 | - f"{regressed}/{len(items)} items regressed >{spec.regression_nats:.1f} nats " | ||
| 129 | - f"(frac={frac_regressed:.1%}), mean_delta={mean_delta:+.3f} nats/tok" | ||
| 130 | - ), | ||
| 131 | - ) | ||
| 132 | - | ||
| 133 | - | ||
| 134 | -def _token_estimate(s: str) -> int: | ||
| 135 | - return max(1, len(s) // 4) | ||
sway/src/dlm_sway/probes/delta_kl.pydeleted@@ -1,121 +0,0 @@ | |||
| 1 | -"""A1 DeltaKL — the simplest adherence probe. | ||
| 2 | - | ||
| 3 | -For each prompt, compute the JS (default) or KL divergence between the | ||
| 4 | -base and fine-tuned model's next-token distributions at the position | ||
| 5 | -after the prompt. Aggregate across prompts with a mean. | ||
| 6 | - | ||
| 7 | -*What it tells you:* whether the adapter is distinguishable from the base | ||
| 8 | -on things the document cares about. A zero-divergence result is a red | ||
| 9 | -flag — the adapter is ignored. | ||
| 10 | - | ||
| 11 | -*What it can't tell you:* whether the change is semantically *correct*. | ||
| 12 | -Direction and correctness are what :mod:`dir`, :mod:`adapter_revert`, | ||
| 13 | -and the attribution probes cover. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -import statistics | ||
| 19 | -from typing import Literal | ||
| 20 | - | ||
| 21 | -from pydantic import Field | ||
| 22 | - | ||
| 23 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 24 | -from dlm_sway.probes._divergence import Divergence, divergence, js_ln2 | ||
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 26 | -from dlm_sway.probes.null_adapter import get_null_stats | ||
| 27 | - | ||
| 28 | - | ||
| 29 | -class DeltaKLSpec(ProbeSpec): | ||
| 30 | - """Spec for ``kind: delta_kl``.""" | ||
| 31 | - | ||
| 32 | - kind: Literal["delta_kl"] = "delta_kl" | ||
| 33 | - prompts: list[str] = Field(default_factory=list, min_length=0) | ||
| 34 | - """Inline prompts. At least one of ``prompts`` / ``prompts_from`` must | ||
| 35 | - be non-empty at run time; the prompts-from path is wired via | ||
| 36 | - :mod:`dlm_sway.integrations.dlm.autogen`.""" | ||
| 37 | - divergence: Divergence = "js" | ||
| 38 | - top_k: int | None = None | ||
| 39 | - """Override the suite-wide ``top_k``. ``None`` → use ``ctx.top_k``.""" | ||
| 40 | - assert_mean_gte: float = 0.02 | ||
| 41 | - """Fixed-threshold pass criterion when no null stats are available.""" | ||
| 42 | - assert_z_gte: float = 3.0 | ||
| 43 | - """Z-score pass criterion against the null-adapter baseline, when it | ||
| 44 | - exists. The more principled metric — prefer this over the raw | ||
| 45 | - threshold.""" | ||
| 46 | - | ||
| 47 | - | ||
| 48 | -class DeltaKLProbe(Probe): | ||
| 49 | - """The canonical "is the adapter changing anything?" probe.""" | ||
| 50 | - | ||
| 51 | - kind = "delta_kl" | ||
| 52 | - spec_cls = DeltaKLSpec | ||
| 53 | - category = "adherence" | ||
| 54 | - | ||
| 55 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 56 | - assert isinstance(spec, DeltaKLSpec) | ||
| 57 | - if not spec.prompts: | ||
| 58 | - return ProbeResult( | ||
| 59 | - name=spec.name, | ||
| 60 | - kind=spec.kind, | ||
| 61 | - verdict=Verdict.ERROR, | ||
| 62 | - score=None, | ||
| 63 | - message="no prompts provided (inline 'prompts' was empty)", | ||
| 64 | - ) | ||
| 65 | - | ||
| 66 | - top_k = spec.top_k if spec.top_k is not None else ctx.top_k | ||
| 67 | - divergences: list[float] = [] | ||
| 68 | - for prompt in spec.prompts: | ||
| 69 | - with ctx.backend.as_base() as base_view: | ||
| 70 | - base_dist = base_view.next_token_dist(prompt, top_k=top_k) | ||
| 71 | - with ctx.backend.as_finetuned() as ft_view: | ||
| 72 | - ft_dist = ft_view.next_token_dist(prompt, top_k=top_k) | ||
| 73 | - divergences.append(divergence(base_dist, ft_dist, kind=spec.divergence)) | ||
| 74 | - | ||
| 75 | - raw_mean = statistics.fmean(divergences) | ||
| 76 | - raw_max = max(divergences) | ||
| 77 | - | ||
| 78 | - # Null-adapter calibration wins when available. | ||
| 79 | - null = get_null_stats(ctx, spec.kind) | ||
| 80 | - z = None | ||
| 81 | - if null is not None and null.get("std", 0.0) > 0.0: | ||
| 82 | - z = (raw_mean - null["mean"]) / null["std"] | ||
| 83 | - verdict = Verdict.PASS if z >= spec.assert_z_gte else Verdict.FAIL | ||
| 84 | - message = f"mean {spec.divergence}={raw_mean:.4f}, z={z:+.2f}σ vs null" | ||
| 85 | - else: | ||
| 86 | - verdict = Verdict.PASS if raw_mean >= spec.assert_mean_gte else Verdict.FAIL | ||
| 87 | - message = ( | ||
| 88 | - f"mean {spec.divergence}={raw_mean:.4f} " | ||
| 89 | - f"({'≥' if verdict == Verdict.PASS else '<'} {spec.assert_mean_gte})" | ||
| 90 | - ) | ||
| 91 | - | ||
| 92 | - # Normalized score for composite: JS is bounded by ln(2), so | ||
| 93 | - # sigmoid-ish on (z, or raw / bound) keeps the number in [0, 1]. | ||
| 94 | - if z is not None: | ||
| 95 | - score = _sigmoid(z / 3.0) | ||
| 96 | - else: | ||
| 97 | - bound = js_ln2() if spec.divergence == "js" else 1.0 | ||
| 98 | - score = min(1.0, raw_mean / bound) if bound > 0.0 else 0.0 | ||
| 99 | - | ||
| 100 | - return ProbeResult( | ||
| 101 | - name=spec.name, | ||
| 102 | - kind=spec.kind, | ||
| 103 | - verdict=verdict, | ||
| 104 | - score=score, | ||
| 105 | - raw=raw_mean, | ||
| 106 | - z_score=z, | ||
| 107 | - evidence={ | ||
| 108 | - "divergence_kind": spec.divergence, | ||
| 109 | - "per_prompt": divergences, | ||
| 110 | - "max": raw_max, | ||
| 111 | - "num_prompts": len(spec.prompts), | ||
| 112 | - "weight": spec.weight, | ||
| 113 | - }, | ||
| 114 | - message=message, | ||
| 115 | - ) | ||
| 116 | - | ||
| 117 | - | ||
| 118 | -def _sigmoid(x: float) -> float: | ||
| 119 | - import math | ||
| 120 | - | ||
| 121 | - return 1.0 / (1.0 + math.exp(-x)) | ||
sway/src/dlm_sway/probes/leakage.pydeleted@@ -1,194 +0,0 @@ | |||
| 1 | -"""C3 LeakageSusceptibility — can the fine-tuned model recite training text verbatim? | ||
| 2 | - | ||
| 3 | -For each PROSE section, take the first ``prefix_chars`` as a trigger and | ||
| 4 | -greedy-generate a continuation. Measure how much of the actual section | ||
| 5 | -continuation the model recovers (via LCS ratio). Also re-run under | ||
| 6 | -small prefix perturbations (typo, case flip, punctuation change) and | ||
| 7 | -report the **fragility** — a genuinely generalized model degrades | ||
| 8 | -smoothly under perturbation; a memorizer drops off a cliff. | ||
| 9 | - | ||
| 10 | -Default pass: ``greedy_recall < 0.5``. That default is tuned for the | ||
| 11 | -common "don't leak my document" use case. Sections tagged ``intent: | ||
| 12 | -memorize`` invert the interpretation — the .dlm bridge handles that | ||
| 13 | -flip at spec-generation time. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -import difflib | ||
| 19 | -import statistics | ||
| 20 | -from typing import Literal | ||
| 21 | - | ||
| 22 | -from pydantic import Field | ||
| 23 | - | ||
| 24 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 26 | - | ||
| 27 | -PerturbationKind = Literal["typo", "case_flip", "drop_punct"] | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -def _default_perturbations() -> list[PerturbationKind]: | ||
| 31 | - return ["typo", "case_flip", "drop_punct"] | ||
| 32 | - | ||
| 33 | - | ||
| 34 | -class LeakageSusceptibilitySpec(ProbeSpec): | ||
| 35 | - kind: Literal["leakage"] = "leakage" | ||
| 36 | - prefix_chars: int = 128 | ||
| 37 | - continuation_chars: int = 256 | ||
| 38 | - max_new_tokens: int = 96 | ||
| 39 | - perturbations: list[PerturbationKind] = Field(default_factory=_default_perturbations) | ||
| 40 | - assert_recall_lt: float = 0.5 | ||
| 41 | - """Default anti-leak gate: pass when verbatim recall is modest. Invert | ||
| 42 | - by bumping this to ``>1.0`` when intentional memorization is desired.""" | ||
| 43 | - min_fragility: float = 0.3 | ||
| 44 | - """Fragility = (clean - perturbed) / max(clean, eps). A low value | ||
| 45 | - with high recall indicates true memorization; a high value suggests | ||
| 46 | - the model generalized and recall was incidental.""" | ||
| 47 | - | ||
| 48 | - | ||
| 49 | -class LeakageSusceptibilityProbe(Probe): | ||
| 50 | - kind = "leakage" | ||
| 51 | - spec_cls = LeakageSusceptibilitySpec | ||
| 52 | - category = "calibration" | ||
| 53 | - | ||
| 54 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 55 | - assert isinstance(spec, LeakageSusceptibilitySpec) | ||
| 56 | - if ctx.sections is None: | ||
| 57 | - return ProbeResult( | ||
| 58 | - name=spec.name, | ||
| 59 | - kind=spec.kind, | ||
| 60 | - verdict=Verdict.SKIP, | ||
| 61 | - score=None, | ||
| 62 | - message="no sections in context — provide via the .dlm bridge", | ||
| 63 | - ) | ||
| 64 | - prose = [s for s in ctx.sections if s.kind == "prose" and s.content.strip()] | ||
| 65 | - if not prose: | ||
| 66 | - return ProbeResult( | ||
| 67 | - name=spec.name, | ||
| 68 | - kind=spec.kind, | ||
| 69 | - verdict=Verdict.SKIP, | ||
| 70 | - score=None, | ||
| 71 | - message="no PROSE sections to test for leakage", | ||
| 72 | - ) | ||
| 73 | - | ||
| 74 | - clean_recalls: list[float] = [] | ||
| 75 | - perturbed_recalls: list[float] = [] | ||
| 76 | - per_section: list[dict[str, float | str]] = [] | ||
| 77 | - | ||
| 78 | - with ctx.backend.as_finetuned() as ft: | ||
| 79 | - for s in prose: | ||
| 80 | - prefix = s.content[: spec.prefix_chars] | ||
| 81 | - target = s.content[spec.prefix_chars : spec.prefix_chars + spec.continuation_chars] | ||
| 82 | - if not target.strip(): | ||
| 83 | - continue | ||
| 84 | - clean_gen = ft.generate(prefix, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | ||
| 85 | - clean = _lcs_ratio(clean_gen, target) | ||
| 86 | - clean_recalls.append(clean) | ||
| 87 | - | ||
| 88 | - per_sec_perturbed: list[float] = [] | ||
| 89 | - for perturbation in spec.perturbations: | ||
| 90 | - perturbed_prefix = _perturb(prefix, perturbation) | ||
| 91 | - perturbed_gen = ft.generate( | ||
| 92 | - perturbed_prefix, | ||
| 93 | - max_new_tokens=spec.max_new_tokens, | ||
| 94 | - seed=ctx.seed, | ||
| 95 | - ) | ||
| 96 | - per_sec_perturbed.append(_lcs_ratio(perturbed_gen, target)) | ||
| 97 | - mean_pert = statistics.fmean(per_sec_perturbed) if per_sec_perturbed else clean | ||
| 98 | - perturbed_recalls.append(mean_pert) | ||
| 99 | - | ||
| 100 | - per_section.append( | ||
| 101 | - { | ||
| 102 | - "section_id": s.id, | ||
| 103 | - "clean_recall": clean, | ||
| 104 | - "perturbed_recall": mean_pert, | ||
| 105 | - "fragility": _fragility(clean, mean_pert), | ||
| 106 | - } | ||
| 107 | - ) | ||
| 108 | - | ||
| 109 | - if not clean_recalls: | ||
| 110 | - return ProbeResult( | ||
| 111 | - name=spec.name, | ||
| 112 | - kind=spec.kind, | ||
| 113 | - verdict=Verdict.SKIP, | ||
| 114 | - score=None, | ||
| 115 | - message="no PROSE sections had scorable continuations", | ||
| 116 | - ) | ||
| 117 | - | ||
| 118 | - mean_clean = statistics.fmean(clean_recalls) | ||
| 119 | - mean_pert = statistics.fmean(perturbed_recalls) | ||
| 120 | - mean_fragility = _fragility(mean_clean, mean_pert) | ||
| 121 | - | ||
| 122 | - verdict = ( | ||
| 123 | - Verdict.PASS | ||
| 124 | - if mean_clean < spec.assert_recall_lt or mean_fragility >= spec.min_fragility | ||
| 125 | - else Verdict.FAIL | ||
| 126 | - ) | ||
| 127 | - # Score: 1.0 at zero recall, declining as recall approaches threshold. | ||
| 128 | - recall_score = max(0.0, min(1.0, 1.0 - mean_clean / max(spec.assert_recall_lt, 1e-6))) | ||
| 129 | - # Bonus: high fragility is good (genuine generalization). | ||
| 130 | - fragility_bonus = min(1.0, max(0.0, mean_fragility / max(spec.min_fragility, 1e-6))) | ||
| 131 | - score = 0.7 * recall_score + 0.3 * fragility_bonus | ||
| 132 | - | ||
| 133 | - return ProbeResult( | ||
| 134 | - name=spec.name, | ||
| 135 | - kind=spec.kind, | ||
| 136 | - verdict=verdict, | ||
| 137 | - score=score, | ||
| 138 | - raw=mean_clean, | ||
| 139 | - base_value=None, | ||
| 140 | - ft_value=mean_fragility, | ||
| 141 | - evidence={ | ||
| 142 | - "mean_clean_recall": mean_clean, | ||
| 143 | - "mean_perturbed_recall": mean_pert, | ||
| 144 | - "mean_fragility": mean_fragility, | ||
| 145 | - "per_section": per_section[:10], | ||
| 146 | - "weight": spec.weight, | ||
| 147 | - }, | ||
| 148 | - message=( | ||
| 149 | - f"greedy_recall={mean_clean:.2f} " | ||
| 150 | - f"(perturbed={mean_pert:.2f}, fragility={mean_fragility:.2f})" | ||
| 151 | - ), | ||
| 152 | - ) | ||
| 153 | - | ||
| 154 | - | ||
| 155 | -# -- helpers ----------------------------------------------------------- | ||
| 156 | - | ||
| 157 | - | ||
| 158 | -def _lcs_ratio(generated: str, target: str) -> float: | ||
| 159 | - """Longest common subsequence ratio via difflib. | ||
| 160 | - | ||
| 161 | - Returns 0 for empty inputs, 1.0 for identical strings. difflib's | ||
| 162 | - ``ratio`` is a gestalt similarity; close enough to a true LCS for | ||
| 163 | - our purposes and has no external deps. | ||
| 164 | - """ | ||
| 165 | - if not generated or not target: | ||
| 166 | - return 0.0 | ||
| 167 | - return difflib.SequenceMatcher(None, generated, target).ratio() | ||
| 168 | - | ||
| 169 | - | ||
| 170 | -def _perturb(text: str, kind: str) -> str: | ||
| 171 | - """Apply a deterministic textual perturbation.""" | ||
| 172 | - if not text: | ||
| 173 | - return text | ||
| 174 | - if kind == "typo": | ||
| 175 | - # Swap the first two characters; trivial typo the model must reconstruct. | ||
| 176 | - if len(text) < 2: | ||
| 177 | - return text | ||
| 178 | - return text[1] + text[0] + text[2:] | ||
| 179 | - if kind == "case_flip": | ||
| 180 | - # Flip case of the first alpha char. | ||
| 181 | - for i, ch in enumerate(text): | ||
| 182 | - if ch.isalpha(): | ||
| 183 | - flipped = ch.lower() if ch.isupper() else ch.upper() | ||
| 184 | - return text[:i] + flipped + text[i + 1 :] | ||
| 185 | - return text | ||
| 186 | - if kind == "drop_punct": | ||
| 187 | - return "".join(ch for ch in text if ch not in ".,;:!?-—") | ||
| 188 | - raise ValueError(f"unknown perturbation: {kind!r}") | ||
| 189 | - | ||
| 190 | - | ||
| 191 | -def _fragility(clean: float, perturbed: float) -> float: | ||
| 192 | - if clean <= 0.0: | ||
| 193 | - return 0.0 | ||
| 194 | - return max(0.0, (clean - perturbed) / clean) | ||
sway/src/dlm_sway/probes/null_adapter.pydeleted@@ -1,144 +0,0 @@ | |||
| 1 | -"""Null-adapter baseline probe. | ||
| 2 | - | ||
| 3 | -Every numeric primitive reports its raw metric *and* a z-score against a | ||
| 4 | -null-adapter distribution. This probe is the runtime engine that | ||
| 5 | -establishes that distribution — it builds random-init "null" adapters | ||
| 6 | -(structurally identical to the real adapter but with weights drawn from | ||
| 7 | -a Gaussian) and measures how much signal they produce. | ||
| 8 | - | ||
| 9 | -The resulting ``(mean, std, n)`` per kind is attached to this probe's | ||
| 10 | -``evidence["null_stats"]``. The runner picks it up and threads it into | ||
| 11 | -:attr:`RunContext.null_stats`, where every downstream probe can read it | ||
| 12 | -and turn a raw metric into a z-score. | ||
| 13 | - | ||
| 14 | -Backends that don't implement :class:`~dlm_sway.core.scoring.NullCalibratedBackend` | ||
| 15 | -cause this probe to :attr:`Verdict.SKIP` — downstream probes fall back | ||
| 16 | -to their fixed thresholds in that case. | ||
| 17 | -""" | ||
| 18 | - | ||
| 19 | -from __future__ import annotations | ||
| 20 | - | ||
| 21 | -import statistics | ||
| 22 | -from typing import Literal | ||
| 23 | - | ||
| 24 | -from pydantic import Field | ||
| 25 | - | ||
| 26 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 27 | -from dlm_sway.core.scoring import NullCalibratedBackend | ||
| 28 | -from dlm_sway.probes._divergence import divergence | ||
| 29 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 30 | - | ||
| 31 | - | ||
| 32 | -class NullAdapterSpec(ProbeSpec): | ||
| 33 | - """Spec for ``kind: null_adapter``. | ||
| 34 | - | ||
| 35 | - Authors place this probe **first** in the suite so its output | ||
| 36 | - populates :attr:`RunContext.null_stats` before subsequent probes | ||
| 37 | - consult it. | ||
| 38 | - """ | ||
| 39 | - | ||
| 40 | - kind: Literal["null_adapter"] = "null_adapter" | ||
| 41 | - runs: int = Field(default=3, ge=1, le=10) | ||
| 42 | - """Number of independent null adapters to evaluate. Three is the | ||
| 43 | - smallest that yields a usable std; more is better but quickly | ||
| 44 | - dominates suite runtime.""" | ||
| 45 | - prompts: list[str] = Field(default_factory=list) | ||
| 46 | - """Prompt set for null calibration. Keep small — calibration runs | ||
| 47 | - ``runs × len(prompts)`` forward passes. 4–8 prompts is typical. | ||
| 48 | - If empty, a minimal built-in prompt set is used so the probe | ||
| 49 | - always produces stats.""" | ||
| 50 | - init_scale: float = 0.02 | ||
| 51 | - """Stddev of the zero-mean Gaussian used to fill lora_A/lora_B.""" | ||
| 52 | - seed_base: int = 1000 | ||
| 53 | - """First seed; successive runs use ``seed_base + run_idx``.""" | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -_DEFAULT_PROMPTS: tuple[str, ...] = ( | ||
| 57 | - "The quick brown fox", | ||
| 58 | - "Once upon a time", | ||
| 59 | - "In this document we explain", | ||
| 60 | - "The key takeaway is", | ||
| 61 | - "An important point to remember", | ||
| 62 | -) | ||
| 63 | - | ||
| 64 | - | ||
| 65 | -class NullAdapterProbe(Probe): | ||
| 66 | - """Populate ``ctx.null_stats``; report a :attr:`Verdict.PASS` verdict itself. | ||
| 67 | - | ||
| 68 | - The probe never fails on its own terms — its *job* is calibration. | ||
| 69 | - Downstream probes pick up :attr:`RunContext.null_stats` keyed by | ||
| 70 | - probe kind (``delta_kl``, ``adapter_ablation`` …) and use the | ||
| 71 | - populated mean/std to z-score their own raw metrics. | ||
| 72 | - """ | ||
| 73 | - | ||
| 74 | - kind = "null_adapter" | ||
| 75 | - spec_cls = NullAdapterSpec | ||
| 76 | - category = "baseline" | ||
| 77 | - | ||
| 78 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 79 | - assert isinstance(spec, NullAdapterSpec) | ||
| 80 | - if not isinstance(ctx.backend, NullCalibratedBackend): | ||
| 81 | - return ProbeResult( | ||
| 82 | - name=spec.name, | ||
| 83 | - kind=spec.kind, | ||
| 84 | - verdict=Verdict.SKIP, | ||
| 85 | - score=None, | ||
| 86 | - message=( | ||
| 87 | - "backend does not implement NullCalibratedBackend — " | ||
| 88 | - "numeric probes will fall back to fixed thresholds" | ||
| 89 | - ), | ||
| 90 | - ) | ||
| 91 | - prompts = list(spec.prompts) or list(_DEFAULT_PROMPTS) | ||
| 92 | - | ||
| 93 | - per_seed_means: list[float] = [] | ||
| 94 | - for run_idx in range(spec.runs): | ||
| 95 | - seed = spec.seed_base + run_idx | ||
| 96 | - per_prompt: list[float] = [] | ||
| 97 | - for prompt in prompts: | ||
| 98 | - with ctx.backend.as_base() as base_view: | ||
| 99 | - base_dist = base_view.next_token_dist(prompt, top_k=ctx.top_k) | ||
| 100 | - with ctx.backend.as_null_adapter(seed, init_scale=spec.init_scale) as null_view: | ||
| 101 | - null_dist = null_view.next_token_dist(prompt, top_k=ctx.top_k) | ||
| 102 | - per_prompt.append(divergence(base_dist, null_dist, kind="js")) | ||
| 103 | - per_seed_means.append(statistics.fmean(per_prompt) if per_prompt else 0.0) | ||
| 104 | - | ||
| 105 | - mean = statistics.fmean(per_seed_means) | ||
| 106 | - std = statistics.pstdev(per_seed_means) if len(per_seed_means) > 1 else 0.0 | ||
| 107 | - | ||
| 108 | - # Publish per-kind stats. delta_kl is the primary kind; other | ||
| 109 | - # divergence-based probes (adapter_ablation) share this scale. | ||
| 110 | - null_stats = { | ||
| 111 | - "delta_kl": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)}, | ||
| 112 | - "adapter_ablation": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)}, | ||
| 113 | - } | ||
| 114 | - | ||
| 115 | - return ProbeResult( | ||
| 116 | - name=spec.name, | ||
| 117 | - kind=spec.kind, | ||
| 118 | - verdict=Verdict.PASS, | ||
| 119 | - score=1.0, | ||
| 120 | - raw=mean, | ||
| 121 | - evidence={ | ||
| 122 | - "null_stats": null_stats, | ||
| 123 | - "per_seed_mean_js": per_seed_means, | ||
| 124 | - "init_scale": spec.init_scale, | ||
| 125 | - "runs": spec.runs, | ||
| 126 | - "num_prompts": len(prompts), | ||
| 127 | - "weight": spec.weight, | ||
| 128 | - }, | ||
| 129 | - message=( | ||
| 130 | - f"null JS divergence μ={mean:.4f} ± {std:.4f} " | ||
| 131 | - f"(over {spec.runs} seeds × {len(prompts)} prompts) — " | ||
| 132 | - f"downstream probes will z-score against this baseline" | ||
| 133 | - ), | ||
| 134 | - ) | ||
| 135 | - | ||
| 136 | - | ||
| 137 | -def get_null_stats(ctx: RunContext, probe_kind: str) -> dict[str, float] | None: | ||
| 138 | - """Look up null-adapter stats for ``probe_kind``. | ||
| 139 | - | ||
| 140 | - Returns ``{"mean": …, "std": …, "n": …}`` when calibration ran for | ||
| 141 | - this kind, else ``None``. Probes treat ``None`` as "fall back to the | ||
| 142 | - fixed threshold from your spec." | ||
| 143 | - """ | ||
| 144 | - return ctx.null_stats.get(probe_kind) | ||
sway/src/dlm_sway/probes/paraphrase_invariance.pydeleted@@ -1,148 +0,0 @@ | |||
| 1 | -"""B2 ParaphraseInvariance — memorization vs generalization, per case. | ||
| 2 | - | ||
| 3 | -For each ``(prompt, gold, paraphrases)`` test case: | ||
| 4 | - | ||
| 5 | -- ``verbatim_lift``: Δ-per-token = logprob_ft(prompt, gold) - logprob_base(prompt, gold) | ||
| 6 | -- ``paraphrase_lift``: mean Δ-per-token over the paraphrased prompts | ||
| 7 | - | ||
| 8 | -A model that memorized the exact prompt has high ``verbatim_lift`` but | ||
| 9 | -near-zero ``paraphrase_lift``. A model that learned the underlying | ||
| 10 | -*pattern* has both values positive and close to each other. | ||
| 11 | - | ||
| 12 | -We report: | ||
| 13 | - | ||
| 14 | -- ``generalization_ratio = paraphrase_lift / max(verbatim_lift, eps)`` | ||
| 15 | -- ``verbatim_score``: whether the adapter significantly moved the | ||
| 16 | - verbatim-prompt logprob (sanity check) | ||
| 17 | - | ||
| 18 | -The pass criterion depends on the stated intent: by default we require | ||
| 19 | -both high verbatim lift and high generalization ratio. If the spec's | ||
| 20 | -``intent`` is ``"memorize"``, the ratio requirement inverts — we *want* | ||
| 21 | -verbatim >> paraphrase. | ||
| 22 | -""" | ||
| 23 | - | ||
| 24 | -from __future__ import annotations | ||
| 25 | - | ||
| 26 | -import statistics | ||
| 27 | -from typing import Literal | ||
| 28 | - | ||
| 29 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 30 | - | ||
| 31 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 32 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 33 | - | ||
| 34 | -Intent = Literal["generalize", "memorize", "both"] | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class ParaphraseCase(BaseModel): | ||
| 38 | - """One paraphrase-invariance case.""" | ||
| 39 | - | ||
| 40 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 41 | - | ||
| 42 | - prompt: str | ||
| 43 | - gold: str | ||
| 44 | - paraphrases: list[str] = Field(default_factory=list, min_length=1) | ||
| 45 | - | ||
| 46 | - | ||
| 47 | -class ParaphraseInvarianceSpec(ProbeSpec): | ||
| 48 | - kind: Literal["paraphrase_invariance"] = "paraphrase_invariance" | ||
| 49 | - cases: list[ParaphraseCase] = Field(default_factory=list) | ||
| 50 | - intent: Intent = "generalize" | ||
| 51 | - min_verbatim_lift: float = 0.2 | ||
| 52 | - min_generalization_ratio: float = 0.5 | ||
| 53 | - max_generalization_ratio_if_memorize: float = 0.5 | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -class ParaphraseInvarianceProbe(Probe): | ||
| 57 | - kind = "paraphrase_invariance" | ||
| 58 | - spec_cls = ParaphraseInvarianceSpec | ||
| 59 | - category = "attribution" | ||
| 60 | - | ||
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 62 | - assert isinstance(spec, ParaphraseInvarianceSpec) | ||
| 63 | - if not spec.cases: | ||
| 64 | - return ProbeResult( | ||
| 65 | - name=spec.name, | ||
| 66 | - kind=spec.kind, | ||
| 67 | - verdict=Verdict.ERROR, | ||
| 68 | - score=None, | ||
| 69 | - message="no cases provided", | ||
| 70 | - ) | ||
| 71 | - | ||
| 72 | - verbatim_lifts: list[float] = [] | ||
| 73 | - paraphrase_lifts: list[float] = [] | ||
| 74 | - per_case: list[dict[str, float | str]] = [] | ||
| 75 | - | ||
| 76 | - for case in spec.cases: | ||
| 77 | - tokens = max(_token_estimate(case.gold), 1) | ||
| 78 | - with ctx.backend.as_base() as b: | ||
| 79 | - lp_base_verb = b.logprob_of(case.prompt, case.gold) / tokens | ||
| 80 | - lp_base_par = [b.logprob_of(p, case.gold) / tokens for p in case.paraphrases] | ||
| 81 | - with ctx.backend.as_finetuned() as f: | ||
| 82 | - lp_ft_verb = f.logprob_of(case.prompt, case.gold) / tokens | ||
| 83 | - lp_ft_par = [f.logprob_of(p, case.gold) / tokens for p in case.paraphrases] | ||
| 84 | - | ||
| 85 | - verb_lift = lp_ft_verb - lp_base_verb | ||
| 86 | - par_lift = statistics.fmean( | ||
| 87 | - (ft - base) for base, ft in zip(lp_base_par, lp_ft_par, strict=True) | ||
| 88 | - ) | ||
| 89 | - verbatim_lifts.append(verb_lift) | ||
| 90 | - paraphrase_lifts.append(par_lift) | ||
| 91 | - per_case.append( | ||
| 92 | - { | ||
| 93 | - "prompt": case.prompt[:80], | ||
| 94 | - "verbatim_lift": verb_lift, | ||
| 95 | - "paraphrase_lift": par_lift, | ||
| 96 | - } | ||
| 97 | - ) | ||
| 98 | - | ||
| 99 | - mean_verb = statistics.fmean(verbatim_lifts) | ||
| 100 | - mean_par = statistics.fmean(paraphrase_lifts) | ||
| 101 | - ratio = mean_par / mean_verb if abs(mean_verb) > 1e-9 else 0.0 | ||
| 102 | - | ||
| 103 | - verdict, score, msg = _decide(spec, mean_verb, mean_par, ratio) | ||
| 104 | - | ||
| 105 | - return ProbeResult( | ||
| 106 | - name=spec.name, | ||
| 107 | - kind=spec.kind, | ||
| 108 | - verdict=verdict, | ||
| 109 | - score=score, | ||
| 110 | - raw=ratio, | ||
| 111 | - base_value=mean_verb, | ||
| 112 | - ft_value=mean_par, | ||
| 113 | - evidence={ | ||
| 114 | - "verbatim_lift_mean": mean_verb, | ||
| 115 | - "paraphrase_lift_mean": mean_par, | ||
| 116 | - "generalization_ratio": ratio, | ||
| 117 | - "intent": spec.intent, | ||
| 118 | - "per_case": per_case[:8], | ||
| 119 | - "weight": spec.weight, | ||
| 120 | - }, | ||
| 121 | - message=msg, | ||
| 122 | - ) | ||
| 123 | - | ||
| 124 | - | ||
| 125 | -def _decide( | ||
| 126 | - spec: ParaphraseInvarianceSpec, verb: float, par: float, ratio: float | ||
| 127 | -) -> tuple[Verdict, float, str]: | ||
| 128 | - """Apply the intent-aware pass rule and return (verdict, score, message).""" | ||
| 129 | - base_msg = f"verb={verb:+.3f}, para={par:+.3f}, ratio={ratio:.2f}" | ||
| 130 | - if spec.intent == "memorize": | ||
| 131 | - verd = ( | ||
| 132 | - Verdict.PASS | ||
| 133 | - if verb >= spec.min_verbatim_lift and ratio <= spec.max_generalization_ratio_if_memorize | ||
| 134 | - else Verdict.FAIL | ||
| 135 | - ) | ||
| 136 | - score = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6))) | ||
| 137 | - return verd, score, f"{base_msg} — intent=memorize" | ||
| 138 | - # Default: generalize (or "both") | ||
| 139 | - passed = verb >= spec.min_verbatim_lift and ratio >= spec.min_generalization_ratio | ||
| 140 | - verd = Verdict.PASS if passed else Verdict.FAIL | ||
| 141 | - gen_component = min(1.0, max(0.0, ratio / max(spec.min_generalization_ratio, 1e-6))) | ||
| 142 | - verb_component = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6))) | ||
| 143 | - score = 0.5 * gen_component + 0.5 * verb_component | ||
| 144 | - return verd, score, f"{base_msg} — intent={spec.intent}" | ||
| 145 | - | ||
| 146 | - | ||
| 147 | -def _token_estimate(s: str) -> int: | ||
| 148 | - return max(1, len(s) // 4) | ||
sway/src/dlm_sway/probes/preference_flip.pydeleted@@ -1,140 +0,0 @@ | |||
| 1 | -"""B3 PreferenceFlip — did DPO/ORPO actually flip the chosen/rejected ranking? | ||
| 2 | - | ||
| 3 | -For each ``(prompt, chosen, rejected)`` triple, compute the margin | ||
| 4 | - | ||
| 5 | -.. math:: | ||
| 6 | - m = \\log p(\\text{chosen} \\mid \\text{prompt}) - \\log p(\\text{rejected} \\mid \\text{prompt}) | ||
| 7 | - | ||
| 8 | -under both base and fine-tuned views. Interesting triples are the ones | ||
| 9 | -where base got the sign *wrong* (``m_base < 0``); we fail if the | ||
| 10 | -fine-tune doesn't flip a large enough fraction of them. | ||
| 11 | - | ||
| 12 | -Triples come from either an inline ``triples:`` block in the spec or | ||
| 13 | -from PREFERENCE sections in :attr:`RunContext.sections`. The probe | ||
| 14 | -returns :attr:`Verdict.SKIP` when no triples are present — this is the | ||
| 15 | -"no PREFERENCE sections in your document" case, graceful by design. | ||
| 16 | -""" | ||
| 17 | - | ||
| 18 | -from __future__ import annotations | ||
| 19 | - | ||
| 20 | -import statistics | ||
| 21 | -from typing import Literal | ||
| 22 | - | ||
| 23 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 24 | - | ||
| 25 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 26 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 27 | - | ||
| 28 | - | ||
| 29 | -class PreferenceTriple(BaseModel): | ||
| 30 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 31 | - | ||
| 32 | - prompt: str | ||
| 33 | - chosen: str | ||
| 34 | - rejected: str | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class PreferenceFlipSpec(ProbeSpec): | ||
| 38 | - kind: Literal["preference_flip"] = "preference_flip" | ||
| 39 | - triples: list[PreferenceTriple] = Field(default_factory=list) | ||
| 40 | - """Inline triples. If empty, the probe pulls from PREFERENCE | ||
| 41 | - sections in ctx.sections; if neither is available the probe SKIPs.""" | ||
| 42 | - assert_flip_rate_gte: float = 0.7 | ||
| 43 | - """Fraction of *base-wrong* triples that must flip under ft.""" | ||
| 44 | - min_triples_for_decision: int = 3 | ||
| 45 | - | ||
| 46 | - | ||
| 47 | -class PreferenceFlipProbe(Probe): | ||
| 48 | - kind = "preference_flip" | ||
| 49 | - spec_cls = PreferenceFlipSpec | ||
| 50 | - category = "attribution" | ||
| 51 | - | ||
| 52 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 53 | - assert isinstance(spec, PreferenceFlipSpec) | ||
| 54 | - triples = list(spec.triples) or _triples_from_sections(ctx) | ||
| 55 | - if not triples: | ||
| 56 | - return ProbeResult( | ||
| 57 | - name=spec.name, | ||
| 58 | - kind=spec.kind, | ||
| 59 | - verdict=Verdict.SKIP, | ||
| 60 | - score=None, | ||
| 61 | - message="no preference triples (inline or from sections)", | ||
| 62 | - ) | ||
| 63 | - | ||
| 64 | - base_margins: list[float] = [] | ||
| 65 | - ft_margins: list[float] = [] | ||
| 66 | - for t in triples: | ||
| 67 | - with ctx.backend.as_base() as b: | ||
| 68 | - base_margins.append( | ||
| 69 | - b.logprob_of(t.prompt, t.chosen) - b.logprob_of(t.prompt, t.rejected) | ||
| 70 | - ) | ||
| 71 | - with ctx.backend.as_finetuned() as f: | ||
| 72 | - ft_margins.append( | ||
| 73 | - f.logprob_of(t.prompt, t.chosen) - f.logprob_of(t.prompt, t.rejected) | ||
| 74 | - ) | ||
| 75 | - | ||
| 76 | - # Interesting denominator: base got it wrong. | ||
| 77 | - base_wrong_idx = [i for i, m in enumerate(base_margins) if m < 0] | ||
| 78 | - flipped_idx = [i for i in base_wrong_idx if ft_margins[i] > 0] | ||
| 79 | - | ||
| 80 | - if len(base_wrong_idx) < spec.min_triples_for_decision: | ||
| 81 | - # Not enough base-wrong triples to decide. Fall back to mean margin delta. | ||
| 82 | - mean_delta = statistics.fmean( | ||
| 83 | - (ft - base) for base, ft in zip(base_margins, ft_margins, strict=True) | ||
| 84 | - ) | ||
| 85 | - verdict = Verdict.WARN | ||
| 86 | - return ProbeResult( | ||
| 87 | - name=spec.name, | ||
| 88 | - kind=spec.kind, | ||
| 89 | - verdict=verdict, | ||
| 90 | - score=max(0.0, min(1.0, 0.5 + mean_delta / 4.0)), | ||
| 91 | - raw=mean_delta, | ||
| 92 | - base_value=statistics.fmean(base_margins), | ||
| 93 | - ft_value=statistics.fmean(ft_margins), | ||
| 94 | - evidence={ | ||
| 95 | - "base_wrong": len(base_wrong_idx), | ||
| 96 | - "total": len(triples), | ||
| 97 | - "mean_margin_delta": mean_delta, | ||
| 98 | - "weight": spec.weight, | ||
| 99 | - }, | ||
| 100 | - message=( | ||
| 101 | - f"only {len(base_wrong_idx)} base-wrong triples < " | ||
| 102 | - f"{spec.min_triples_for_decision} required; reporting mean-margin-delta={mean_delta:+.3f}" | ||
| 103 | - ), | ||
| 104 | - ) | ||
| 105 | - | ||
| 106 | - flip_rate = len(flipped_idx) / len(base_wrong_idx) | ||
| 107 | - verdict = Verdict.PASS if flip_rate >= spec.assert_flip_rate_gte else Verdict.FAIL | ||
| 108 | - score = min(1.0, flip_rate / max(spec.assert_flip_rate_gte, 1e-6)) | ||
| 109 | - return ProbeResult( | ||
| 110 | - name=spec.name, | ||
| 111 | - kind=spec.kind, | ||
| 112 | - verdict=verdict, | ||
| 113 | - score=score, | ||
| 114 | - raw=flip_rate, | ||
| 115 | - base_value=statistics.fmean(base_margins), | ||
| 116 | - ft_value=statistics.fmean(ft_margins), | ||
| 117 | - evidence={ | ||
| 118 | - "flip_rate": flip_rate, | ||
| 119 | - "flipped": len(flipped_idx), | ||
| 120 | - "base_wrong": len(base_wrong_idx), | ||
| 121 | - "total": len(triples), | ||
| 122 | - "weight": spec.weight, | ||
| 123 | - }, | ||
| 124 | - message=( | ||
| 125 | - f"flip_rate={flip_rate:.2%} ({len(flipped_idx)}/{len(base_wrong_idx)} " | ||
| 126 | - f"base-wrong triples flipped by ft)" | ||
| 127 | - ), | ||
| 128 | - ) | ||
| 129 | - | ||
| 130 | - | ||
| 131 | -def _triples_from_sections(ctx: RunContext) -> list[PreferenceTriple]: | ||
| 132 | - if ctx.sections is None: | ||
| 133 | - return [] | ||
| 134 | - out: list[PreferenceTriple] = [] | ||
| 135 | - for s in ctx.sections: | ||
| 136 | - if s.kind != "preference": | ||
| 137 | - continue | ||
| 138 | - for p in s.preferences: | ||
| 139 | - out.append(PreferenceTriple(prompt=p.prompt, chosen=p.chosen, rejected=p.rejected)) | ||
| 140 | - return out | ||
sway/src/dlm_sway/probes/prompt_collapse.pydeleted@@ -1,159 +0,0 @@ | |||
| 1 | -"""A3 PromptCollapse — does adapter influence decay with context length? | ||
| 2 | - | ||
| 3 | -For each test prompt we prepend irrelevant "stuffing" of varying length | ||
| 4 | -and measure ``divergence(base, ft)`` at the final position. A healthy | ||
| 5 | -adapter shows a modest, slow decay; a degenerate one collapses quickly | ||
| 6 | -— its signal evaporates once the base has a lot of context to lean on. | ||
| 7 | - | ||
| 8 | -We fit an exponential decay ``KL(L) = KL0 * exp(-L / half_life)`` in log | ||
| 9 | -space and report the half-life in tokens. Pass if the half-life is at | ||
| 10 | -least :attr:`PromptCollapseSpec.assert_half_life_tokens` — which | ||
| 11 | -defaults to half the default sequence length. | ||
| 12 | - | ||
| 13 | -All math is numpy-only to avoid a scipy dependency on the install path. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -from typing import Literal | ||
| 19 | - | ||
| 20 | -import numpy as np | ||
| 21 | -from pydantic import Field | ||
| 22 | - | ||
| 23 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 24 | -from dlm_sway.probes._divergence import Divergence, divergence | ||
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 26 | - | ||
| 27 | -# A neutral, token-dense piece of text we prepend to stress the base | ||
| 28 | -# model's long-context handling. Deliberately low-information so the | ||
| 29 | -# "answer" at the end is the only thing driving next-token predictions. | ||
| 30 | -_STUFFING = ( | ||
| 31 | - "The following log lines are archived for historical record and have no " | ||
| 32 | - "bearing on the question that follows. They are retained for audit purposes " | ||
| 33 | - "only and should be ignored when forming an answer. " | ||
| 34 | -) | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class PromptCollapseSpec(ProbeSpec): | ||
| 38 | - kind: Literal["prompt_collapse"] = "prompt_collapse" | ||
| 39 | - prompts: list[str] = Field(default_factory=list, min_length=0) | ||
| 40 | - context_lengths: list[int] = Field( | ||
| 41 | - default_factory=lambda: [0, 256, 512, 1024], | ||
| 42 | - min_length=2, | ||
| 43 | - ) | ||
| 44 | - """Approximate token counts of stuffing to prepend. ≥2 required | ||
| 45 | - because the exponential fit is undefined for a single point.""" | ||
| 46 | - divergence: Divergence = "js" | ||
| 47 | - top_k: int | None = None | ||
| 48 | - assert_half_life_tokens: int = 512 | ||
| 49 | - """Minimum half-life to pass. Default is deliberately permissive — | ||
| 50 | - tune upward for high-stakes deployments.""" | ||
| 51 | - | ||
| 52 | - | ||
| 53 | -class PromptCollapseProbe(Probe): | ||
| 54 | - kind = "prompt_collapse" | ||
| 55 | - spec_cls = PromptCollapseSpec | ||
| 56 | - category = "adherence" | ||
| 57 | - | ||
| 58 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 59 | - assert isinstance(spec, PromptCollapseSpec) | ||
| 60 | - if not spec.prompts: | ||
| 61 | - return ProbeResult( | ||
| 62 | - name=spec.name, | ||
| 63 | - kind=spec.kind, | ||
| 64 | - verdict=Verdict.ERROR, | ||
| 65 | - score=None, | ||
| 66 | - message="no prompts provided", | ||
| 67 | - ) | ||
| 68 | - | ||
| 69 | - top_k = spec.top_k if spec.top_k is not None else ctx.top_k | ||
| 70 | - # Mean divergence at each context length. | ||
| 71 | - mean_divs: list[float] = [] | ||
| 72 | - for ctx_len in spec.context_lengths: | ||
| 73 | - prefix = _stuffing(ctx_len) | ||
| 74 | - divs: list[float] = [] | ||
| 75 | - for prompt in spec.prompts: | ||
| 76 | - full_prompt = prefix + prompt | ||
| 77 | - with ctx.backend.as_base() as bv: | ||
| 78 | - base_dist = bv.next_token_dist(full_prompt, top_k=top_k) | ||
| 79 | - with ctx.backend.as_finetuned() as fv: | ||
| 80 | - ft_dist = fv.next_token_dist(full_prompt, top_k=top_k) | ||
| 81 | - divs.append(divergence(base_dist, ft_dist, kind=spec.divergence)) | ||
| 82 | - mean_divs.append(float(np.mean(divs))) | ||
| 83 | - | ||
| 84 | - half_life = _fit_half_life( | ||
| 85 | - np.asarray(spec.context_lengths, dtype=np.float64), | ||
| 86 | - np.asarray(mean_divs, dtype=np.float64), | ||
| 87 | - ) | ||
| 88 | - | ||
| 89 | - verdict = ( | ||
| 90 | - Verdict.PASS | ||
| 91 | - if half_life is not None and half_life >= spec.assert_half_life_tokens | ||
| 92 | - else Verdict.FAIL | ||
| 93 | - ) | ||
| 94 | - score = _score(half_life, spec.assert_half_life_tokens) | ||
| 95 | - | ||
| 96 | - msg = ( | ||
| 97 | - f"half-life={half_life:.0f} tokens" | ||
| 98 | - if half_life is not None | ||
| 99 | - else "could not fit exponential decay (too flat or non-monotonic)" | ||
| 100 | - ) | ||
| 101 | - return ProbeResult( | ||
| 102 | - name=spec.name, | ||
| 103 | - kind=spec.kind, | ||
| 104 | - verdict=verdict, | ||
| 105 | - score=score, | ||
| 106 | - raw=half_life, | ||
| 107 | - evidence={ | ||
| 108 | - "context_lengths": spec.context_lengths, | ||
| 109 | - "mean_divergence_per_length": mean_divs, | ||
| 110 | - "divergence_kind": spec.divergence, | ||
| 111 | - "weight": spec.weight, | ||
| 112 | - }, | ||
| 113 | - message=msg, | ||
| 114 | - ) | ||
| 115 | - | ||
| 116 | - | ||
| 117 | -def _stuffing(target_tokens: int) -> str: | ||
| 118 | - """Approximate target-length stuffing. 4 chars ≈ 1 token is fine | ||
| 119 | - for SentencePiece-style tokenizers at the order-of-magnitude level.""" | ||
| 120 | - if target_tokens <= 0: | ||
| 121 | - return "" | ||
| 122 | - # Repeat enough copies to hit the target length in characters. | ||
| 123 | - target_chars = target_tokens * 4 | ||
| 124 | - reps = (target_chars // len(_STUFFING)) + 1 | ||
| 125 | - return (_STUFFING * reps)[:target_chars] + "\n\n" | ||
| 126 | - | ||
| 127 | - | ||
| 128 | -def _fit_half_life(lengths: np.ndarray, divergences: np.ndarray) -> float | None: | ||
| 129 | - """Fit ``y = a * exp(-x / h)`` via log-space linear regression. | ||
| 130 | - | ||
| 131 | - Returns ``None`` if the divergences aren't strictly positive or the | ||
| 132 | - fit is non-decreasing (i.e. the fine-tune got *more* distinct with | ||
| 133 | - context, which invalidates the half-life concept). | ||
| 134 | - """ | ||
| 135 | - if (divergences <= 0.0).any(): | ||
| 136 | - # Can't take a log; treat near-zero as too-flat-to-fit. | ||
| 137 | - return None | ||
| 138 | - log_y = np.log(divergences) | ||
| 139 | - # Standard linear regression slope. | ||
| 140 | - x_mean = float(lengths.mean()) | ||
| 141 | - y_mean = float(log_y.mean()) | ||
| 142 | - denom = float(((lengths - x_mean) ** 2).sum()) | ||
| 143 | - if denom == 0.0: | ||
| 144 | - return None | ||
| 145 | - slope = float(((lengths - x_mean) * (log_y - y_mean)).sum()) / denom | ||
| 146 | - if slope >= 0.0: | ||
| 147 | - # Signal grew with context — can't express as half-life. | ||
| 148 | - return None | ||
| 149 | - # Slope = -1/h → h = -1/slope → half_life = ln(2) * h. | ||
| 150 | - import math | ||
| 151 | - | ||
| 152 | - return float(math.log(2.0) * (-1.0 / slope)) | ||
| 153 | - | ||
| 154 | - | ||
| 155 | -def _score(half_life: float | None, target: int) -> float: | ||
| 156 | - if half_life is None: | ||
| 157 | - return 0.0 | ||
| 158 | - # Asymptotic: score saturates at 1.0 when hits target, declines toward 0. | ||
| 159 | - return float(min(1.0, half_life / max(target, 1))) | ||
sway/src/dlm_sway/probes/section_internalization.pydeleted@@ -1,189 +0,0 @@ | |||
| 1 | -"""B1 SectionInternalizationScore — the flagship attribution primitive. | ||
| 2 | - | ||
| 3 | -For each typed section of the training document, measure *how much the | ||
| 4 | -fine-tune moved the needle on that section's own content* — and subtract | ||
| 5 | -the same metric measured on *other* sections' content. The difference is | ||
| 6 | -the "effective SIS": signal attributable to *this* section, not to a | ||
| 7 | -broader lift across the whole document. | ||
| 8 | - | ||
| 9 | -Output is a per-section bar chart. In practice users see that sections | ||
| 10 | -2 and 7 actually moved the model, sections 3 and 5 did nothing, and | ||
| 11 | -section 11 moved it but also leaked into unrelated content — actionable | ||
| 12 | -signal for document authoring that no other eval tool provides. | ||
| 13 | - | ||
| 14 | -Math per section ``s`` with measurement function ``m(probe_set)``: | ||
| 15 | - | ||
| 16 | -.. math:: | ||
| 17 | - sis_s^{own} &= (m_{base}(s) - m_{ft}(s)) / m_{base}(s) | ||
| 18 | - sis_s^{leak} &= (m_{base}(\\bar s) - m_{ft}(\\bar s)) / m_{base}(\\bar s) | ||
| 19 | - effective &= sis_s^{own} - sis_s^{leak} | ||
| 20 | - | ||
| 21 | -For PROSE sections, ``m`` is the average NLL per token over the | ||
| 22 | -section's content. For INSTRUCTION and PREFERENCE sections, ``m`` is the | ||
| 23 | -average NLL per token over the answer/chosen spans given their prompts. | ||
| 24 | -""" | ||
| 25 | - | ||
| 26 | -from __future__ import annotations | ||
| 27 | - | ||
| 28 | -import statistics | ||
| 29 | -from typing import Literal | ||
| 30 | - | ||
| 31 | -from pydantic import Field | ||
| 32 | - | ||
| 33 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 34 | -from dlm_sway.core.scoring import ScoringBackend | ||
| 35 | -from dlm_sway.core.sections import Section, SectionKind | ||
| 36 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def _default_include_kinds() -> list[SectionKind]: | ||
| 40 | - return ["prose", "instruction", "preference"] | ||
| 41 | - | ||
| 42 | - | ||
| 43 | -class SectionInternalizationSpec(ProbeSpec): | ||
| 44 | - kind: Literal["section_internalization"] = "section_internalization" | ||
| 45 | - include_kinds: list[SectionKind] = Field(default_factory=_default_include_kinds) | ||
| 46 | - per_section_threshold: float = 0.05 | ||
| 47 | - """Minimum ``effective_sis`` for a section to be marked PASS.""" | ||
| 48 | - assert_passing_section_frac: float = 0.5 | ||
| 49 | - """Probe-level pass criterion: fraction of sections that must clear | ||
| 50 | - the per-section threshold.""" | ||
| 51 | - max_prose_chars: int = 2000 | ||
| 52 | - """Cap the length of PROSE content we score to keep runtime bounded. | ||
| 53 | - Long sections are chunked; this is the per-chunk cap.""" | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -class SectionInternalizationProbe(Probe): | ||
| 57 | - kind = "section_internalization" | ||
| 58 | - spec_cls = SectionInternalizationSpec | ||
| 59 | - category = "attribution" | ||
| 60 | - | ||
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 62 | - assert isinstance(spec, SectionInternalizationSpec) | ||
| 63 | - if ctx.sections is None or len(ctx.sections) == 0: | ||
| 64 | - return ProbeResult( | ||
| 65 | - name=spec.name, | ||
| 66 | - kind=spec.kind, | ||
| 67 | - verdict=Verdict.SKIP, | ||
| 68 | - score=None, | ||
| 69 | - message="no sections in context — provide via the .dlm bridge", | ||
| 70 | - ) | ||
| 71 | - | ||
| 72 | - kinds_allowed = set(spec.include_kinds) | ||
| 73 | - eligible = [s for s in ctx.sections if s.kind in kinds_allowed] | ||
| 74 | - if len(eligible) < 2: | ||
| 75 | - return ProbeResult( | ||
| 76 | - name=spec.name, | ||
| 77 | - kind=spec.kind, | ||
| 78 | - verdict=Verdict.SKIP, | ||
| 79 | - score=None, | ||
| 80 | - message=( | ||
| 81 | - f"need ≥2 eligible sections for leak-check; got {len(eligible)} " | ||
| 82 | - f"(kinds={spec.include_kinds})" | ||
| 83 | - ), | ||
| 84 | - ) | ||
| 85 | - | ||
| 86 | - # Pre-compute per-section base and ft NLL-per-token to avoid | ||
| 87 | - # re-running the forward pass for leak-checks. | ||
| 88 | - base_nll: dict[str, float] = {} | ||
| 89 | - ft_nll: dict[str, float] = {} | ||
| 90 | - with ctx.backend.as_base() as base_view: | ||
| 91 | - for s in eligible: | ||
| 92 | - base_nll[s.id] = _section_nll(s, base_view, spec.max_prose_chars) | ||
| 93 | - with ctx.backend.as_finetuned() as ft_view: | ||
| 94 | - for s in eligible: | ||
| 95 | - ft_nll[s.id] = _section_nll(s, ft_view, spec.max_prose_chars) | ||
| 96 | - | ||
| 97 | - per_section: list[dict[str, float | str | bool]] = [] | ||
| 98 | - passing = 0 | ||
| 99 | - effective_scores: list[float] = [] | ||
| 100 | - for s in eligible: | ||
| 101 | - others = [o for o in eligible if o.id != s.id] | ||
| 102 | - own_lift = _relative_lift(base_nll[s.id], ft_nll[s.id]) | ||
| 103 | - leak_lift = statistics.fmean( | ||
| 104 | - _relative_lift(base_nll[o.id], ft_nll[o.id]) for o in others | ||
| 105 | - ) | ||
| 106 | - effective = own_lift - leak_lift | ||
| 107 | - effective_scores.append(effective) | ||
| 108 | - did_pass = effective >= spec.per_section_threshold | ||
| 109 | - passing += int(did_pass) | ||
| 110 | - per_section.append( | ||
| 111 | - { | ||
| 112 | - "section_id": s.id, | ||
| 113 | - "kind": s.kind, | ||
| 114 | - "tag": s.tag or "", | ||
| 115 | - "base_nll": base_nll[s.id], | ||
| 116 | - "ft_nll": ft_nll[s.id], | ||
| 117 | - "own_lift": own_lift, | ||
| 118 | - "leak_lift": leak_lift, | ||
| 119 | - "effective_sis": effective, | ||
| 120 | - "passed": did_pass, | ||
| 121 | - } | ||
| 122 | - ) | ||
| 123 | - | ||
| 124 | - passing_frac = passing / len(eligible) | ||
| 125 | - verdict = Verdict.PASS if passing_frac >= spec.assert_passing_section_frac else Verdict.FAIL | ||
| 126 | - score = passing_frac | ||
| 127 | - return ProbeResult( | ||
| 128 | - name=spec.name, | ||
| 129 | - kind=spec.kind, | ||
| 130 | - verdict=verdict, | ||
| 131 | - score=score, | ||
| 132 | - raw=statistics.fmean(effective_scores), | ||
| 133 | - evidence={ | ||
| 134 | - "per_section": per_section, | ||
| 135 | - "num_sections": len(eligible), | ||
| 136 | - "passing_frac": passing_frac, | ||
| 137 | - "per_section_threshold": spec.per_section_threshold, | ||
| 138 | - "weight": spec.weight, | ||
| 139 | - }, | ||
| 140 | - message=( | ||
| 141 | - f"{passing}/{len(eligible)} sections cleared " | ||
| 142 | - f"effective_sis≥{spec.per_section_threshold:.2f} (mean={statistics.fmean(effective_scores):+.3f})" | ||
| 143 | - ), | ||
| 144 | - ) | ||
| 145 | - | ||
| 146 | - | ||
| 147 | -def _section_nll(s: Section, view: ScoringBackend, max_prose_chars: int) -> float: | ||
| 148 | - """Average NLL per token for the section's content under ``view``.""" | ||
| 149 | - if s.kind == "prose": | ||
| 150 | - return _prose_nll(s.content[:max_prose_chars], view) | ||
| 151 | - if s.kind == "instruction": | ||
| 152 | - if not s.probes: | ||
| 153 | - return _prose_nll(s.content[:max_prose_chars], view) | ||
| 154 | - return statistics.fmean( | ||
| 155 | - -view.logprob_of(p.prompt, p.gold) / max(_token_estimate(p.gold), 1) for p in s.probes | ||
| 156 | - ) | ||
| 157 | - if s.kind == "preference": | ||
| 158 | - if not s.preferences: | ||
| 159 | - return _prose_nll(s.content[:max_prose_chars], view) | ||
| 160 | - return statistics.fmean( | ||
| 161 | - -view.logprob_of(p.prompt, p.chosen) / max(_token_estimate(p.chosen), 1) | ||
| 162 | - for p in s.preferences | ||
| 163 | - ) | ||
| 164 | - raise ValueError(f"unknown section kind: {s.kind!r}") | ||
| 165 | - | ||
| 166 | - | ||
| 167 | -def _prose_nll(text: str, view: ScoringBackend) -> float: | ||
| 168 | - """Negative-mean-logprob over ``text``. Returns 0 for empty input.""" | ||
| 169 | - if not text.strip(): | ||
| 170 | - return 0.0 | ||
| 171 | - r = view.rolling_logprob(text) | ||
| 172 | - return -r.mean_logprob | ||
| 173 | - | ||
| 174 | - | ||
| 175 | -def _relative_lift(base_nll: float, ft_nll: float) -> float: | ||
| 176 | - """``(base - ft) / base``. Positive → ft is lower-PPL than base. | ||
| 177 | - | ||
| 178 | - Falls back to an absolute delta when ``base`` is pathological | ||
| 179 | - (zero or negative), so the probe doesn't crash on degenerate | ||
| 180 | - inputs. | ||
| 181 | - """ | ||
| 182 | - if base_nll <= 0.0: | ||
| 183 | - return float(base_nll - ft_nll) | ||
| 184 | - return float((base_nll - ft_nll) / base_nll) | ||
| 185 | - | ||
| 186 | - | ||
| 187 | -def _token_estimate(s: str) -> int: | ||
| 188 | - """Approximate tokens for normalization. Good enough for SentencePiece-ish vocabs.""" | ||
| 189 | - return max(1, len(s) // 4) | ||
sway/src/dlm_sway/probes/style_fingerprint.pydeleted@@ -1,179 +0,0 @@ | |||
| 1 | -"""C1 StyleFingerprint — does ft prose *read* like the doc? | ||
| 2 | - | ||
| 3 | -Generates base and ft completions from a set of stylistic prompts, | ||
| 4 | -extracts a 6-dimensional fingerprint from each, and measures how the ft | ||
| 5 | -fingerprint has shifted **toward** the training document's own | ||
| 6 | -fingerprint vs the base. | ||
| 7 | - | ||
| 8 | -We compute the fingerprint with numpy-only features so the probe works | ||
| 9 | -out of the box without spaCy/textstat. The optional ``style`` extra | ||
| 10 | -upgrades the fingerprint with passive-voice rate and POS-entropy in a | ||
| 11 | -later milestone; the numeric contract — a non-negative vector per text | ||
| 12 | -— is stable across that upgrade. | ||
| 13 | - | ||
| 14 | -Signal: ``style_shift = cos(ft_fp - base_fp, doc_fp - base_fp)`` in | ||
| 15 | -fingerprint space. Positive values mean ft has moved *toward* the | ||
| 16 | -doc's style; negative values mean it moved *away* (a bad sign); | ||
| 17 | -near-zero means no stylistic shift detectable. | ||
| 18 | -""" | ||
| 19 | - | ||
| 20 | -from __future__ import annotations | ||
| 21 | - | ||
| 22 | -import re | ||
| 23 | -import statistics | ||
| 24 | -from typing import Literal | ||
| 25 | - | ||
| 26 | -import numpy as np | ||
| 27 | -from numpy.typing import NDArray | ||
| 28 | -from pydantic import Field | ||
| 29 | - | ||
| 30 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 31 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 32 | - | ||
| 33 | -_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+") | ||
| 34 | -_PARAGRAPH_SPLIT = re.compile(r"\n\s*\n") | ||
| 35 | -_WORD_RE = re.compile(r"\b[A-Za-z][A-Za-z'-]*\b") | ||
| 36 | -_PUNCTS = set(".,:;!?-—()[]\"'/") | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def fingerprint(text: str) -> NDArray[np.float64]: | ||
| 40 | - """Return a 6-dim stylistic fingerprint for ``text``. | ||
| 41 | - | ||
| 42 | - Dimensions (all numeric, scaled to order-1): | ||
| 43 | - 0. mean sentence length (words) / 30.0 | ||
| 44 | - 1. std sentence length (words) / 30.0 | ||
| 45 | - 2. type-token ratio (already in [0,1]) | ||
| 46 | - 3. avg word length (chars) / 10.0 | ||
| 47 | - 4. punctuation density per char * 10.0 | ||
| 48 | - 5. paragraph density (1 / avg paragraph length in words) * 30.0 | ||
| 49 | - """ | ||
| 50 | - if not text.strip(): | ||
| 51 | - return np.zeros(6, dtype=np.float64) | ||
| 52 | - | ||
| 53 | - sentences = [s for s in _SENTENCE_SPLIT.split(text) if s.strip()] | ||
| 54 | - paragraphs = [p for p in _PARAGRAPH_SPLIT.split(text) if p.strip()] | ||
| 55 | - words = _WORD_RE.findall(text) | ||
| 56 | - if not words: | ||
| 57 | - return np.zeros(6, dtype=np.float64) | ||
| 58 | - | ||
| 59 | - sentence_word_counts = [len(_WORD_RE.findall(s)) for s in sentences] | ||
| 60 | - sentence_word_counts = [c for c in sentence_word_counts if c > 0] | ||
| 61 | - if not sentence_word_counts: | ||
| 62 | - sentence_word_counts = [len(words)] | ||
| 63 | - | ||
| 64 | - mean_sent = statistics.fmean(sentence_word_counts) | ||
| 65 | - std_sent = statistics.pstdev(sentence_word_counts) if len(sentence_word_counts) > 1 else 0.0 | ||
| 66 | - ttr = len({w.lower() for w in words}) / len(words) | ||
| 67 | - avg_word_len = statistics.fmean(len(w) for w in words) | ||
| 68 | - punct_count = sum(ch in _PUNCTS for ch in text) | ||
| 69 | - punct_density = punct_count / max(len(text), 1) | ||
| 70 | - avg_paragraph_len = ( | ||
| 71 | - statistics.fmean(len(_WORD_RE.findall(p)) for p in paragraphs) if paragraphs else len(words) | ||
| 72 | - ) | ||
| 73 | - paragraph_density = 1.0 / max(avg_paragraph_len, 1.0) | ||
| 74 | - | ||
| 75 | - return np.asarray( | ||
| 76 | - [ | ||
| 77 | - mean_sent / 30.0, | ||
| 78 | - std_sent / 30.0, | ||
| 79 | - ttr, | ||
| 80 | - avg_word_len / 10.0, | ||
| 81 | - punct_density * 10.0, | ||
| 82 | - paragraph_density * 30.0, | ||
| 83 | - ], | ||
| 84 | - dtype=np.float64, | ||
| 85 | - ) | ||
| 86 | - | ||
| 87 | - | ||
| 88 | -class StyleFingerprintSpec(ProbeSpec): | ||
| 89 | - kind: Literal["style_fingerprint"] = "style_fingerprint" | ||
| 90 | - prompts: list[str] = Field(default_factory=list) | ||
| 91 | - """Prompts used to elicit a stylistic sample from each model.""" | ||
| 92 | - doc_reference: str = "" | ||
| 93 | - """Concatenated reference text representing the adapter's intended | ||
| 94 | - style. Typically the document itself; the .dlm bridge supplies this | ||
| 95 | - from ``ctx.doc_text`` when left empty.""" | ||
| 96 | - max_new_tokens: int = 128 | ||
| 97 | - assert_shift_gte: float = 0.25 | ||
| 98 | - """Minimum cosine shift for PASS. ``0.25`` is a deliberately | ||
| 99 | - permissive default — stylistic shift is a weaker signal than | ||
| 100 | - perplexity lift.""" | ||
| 101 | - | ||
| 102 | - | ||
| 103 | -class StyleFingerprintProbe(Probe): | ||
| 104 | - kind = "style_fingerprint" | ||
| 105 | - spec_cls = StyleFingerprintSpec | ||
| 106 | - category = "calibration" | ||
| 107 | - | ||
| 108 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 109 | - assert isinstance(spec, StyleFingerprintSpec) | ||
| 110 | - if not spec.prompts: | ||
| 111 | - return ProbeResult( | ||
| 112 | - name=spec.name, | ||
| 113 | - kind=spec.kind, | ||
| 114 | - verdict=Verdict.ERROR, | ||
| 115 | - score=None, | ||
| 116 | - message="no prompts provided", | ||
| 117 | - ) | ||
| 118 | - doc_text = spec.doc_reference or (ctx.doc_text or "") | ||
| 119 | - if not doc_text.strip(): | ||
| 120 | - return ProbeResult( | ||
| 121 | - name=spec.name, | ||
| 122 | - kind=spec.kind, | ||
| 123 | - verdict=Verdict.SKIP, | ||
| 124 | - score=None, | ||
| 125 | - message="no doc_reference (inline or from ctx.doc_text)", | ||
| 126 | - ) | ||
| 127 | - | ||
| 128 | - base_samples: list[str] = [] | ||
| 129 | - ft_samples: list[str] = [] | ||
| 130 | - for prompt in spec.prompts: | ||
| 131 | - with ctx.backend.as_base() as b: | ||
| 132 | - base_samples.append( | ||
| 133 | - b.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | ||
| 134 | - ) | ||
| 135 | - with ctx.backend.as_finetuned() as f: | ||
| 136 | - ft_samples.append( | ||
| 137 | - f.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | ||
| 138 | - ) | ||
| 139 | - | ||
| 140 | - base_fp = fingerprint("\n".join(base_samples)) | ||
| 141 | - ft_fp = fingerprint("\n".join(ft_samples)) | ||
| 142 | - doc_fp = fingerprint(doc_text) | ||
| 143 | - | ||
| 144 | - shift = _cosine_shift(base_fp, ft_fp, doc_fp) | ||
| 145 | - verdict = Verdict.PASS if shift >= spec.assert_shift_gte else Verdict.FAIL | ||
| 146 | - score = float(np.clip((shift + 1.0) / 2.0, 0.0, 1.0)) | ||
| 147 | - | ||
| 148 | - return ProbeResult( | ||
| 149 | - name=spec.name, | ||
| 150 | - kind=spec.kind, | ||
| 151 | - verdict=verdict, | ||
| 152 | - score=score, | ||
| 153 | - raw=shift, | ||
| 154 | - evidence={ | ||
| 155 | - "base_fp": base_fp.tolist(), | ||
| 156 | - "ft_fp": ft_fp.tolist(), | ||
| 157 | - "doc_fp": doc_fp.tolist(), | ||
| 158 | - "style_shift": shift, | ||
| 159 | - "weight": spec.weight, | ||
| 160 | - }, | ||
| 161 | - message=( | ||
| 162 | - f"style_shift={shift:+.2f} " | ||
| 163 | - f"({'toward' if shift > 0 else 'away from'} doc, " | ||
| 164 | - f"threshold={spec.assert_shift_gte})" | ||
| 165 | - ), | ||
| 166 | - ) | ||
| 167 | - | ||
| 168 | - | ||
| 169 | -def _cosine_shift( | ||
| 170 | - base: NDArray[np.float64], ft: NDArray[np.float64], doc: NDArray[np.float64] | ||
| 171 | -) -> float: | ||
| 172 | - """Cosine between (ft - base) and (doc - base) in fingerprint space.""" | ||
| 173 | - a = ft - base | ||
| 174 | - b = doc - base | ||
| 175 | - na = float(np.linalg.norm(a)) | ||
| 176 | - nb = float(np.linalg.norm(b)) | ||
| 177 | - if na == 0.0 or nb == 0.0: | ||
| 178 | - return 0.0 | ||
| 179 | - return float(np.dot(a, b) / (na * nb)) | ||
sway/src/dlm_sway/py.typeddeletedsway/src/dlm_sway/suite/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""Suite plumbing: spec models, loader, runner, report, composite score.""" | ||
sway/src/dlm_sway/suite/loader.pydeleted@@ -1,48 +0,0 @@ | |||
| 1 | -"""Load + validate a ``sway.yaml`` into a :class:`SwaySpec`. | ||
| 2 | - | ||
| 3 | -Separated from :mod:`spec` so the data models stay trivially | ||
| 4 | -importable (no YAML dependency at import time for callers that | ||
| 5 | -construct specs programmatically). | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -from pathlib import Path | ||
| 11 | -from typing import Any | ||
| 12 | - | ||
| 13 | -import yaml | ||
| 14 | -from pydantic import ValidationError | ||
| 15 | - | ||
| 16 | -from dlm_sway.core.errors import SpecValidationError | ||
| 17 | -from dlm_sway.suite.spec import SwaySpec | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -def load_spec(path: Path | str) -> SwaySpec: | ||
| 21 | - """Parse ``path`` and return a validated :class:`SwaySpec`.""" | ||
| 22 | - resolved = Path(path).expanduser().resolve() | ||
| 23 | - try: | ||
| 24 | - raw_text = resolved.read_text(encoding="utf-8") | ||
| 25 | - except FileNotFoundError as exc: | ||
| 26 | - raise SpecValidationError(f"spec file not found: {resolved}", source=str(path)) from exc | ||
| 27 | - | ||
| 28 | - try: | ||
| 29 | - data = yaml.safe_load(raw_text) | ||
| 30 | - except yaml.YAMLError as exc: | ||
| 31 | - raise SpecValidationError(f"invalid YAML: {exc}", source=str(path)) from exc | ||
| 32 | - | ||
| 33 | - if not isinstance(data, dict): | ||
| 34 | - raise SpecValidationError("top-level document must be a mapping", source=str(path)) | ||
| 35 | - return from_dict(data, source=str(path)) | ||
| 36 | - | ||
| 37 | - | ||
| 38 | -def from_dict(data: dict[str, Any], *, source: str | None = None) -> SwaySpec: | ||
| 39 | - """Validate a dict (already parsed from YAML or JSON) as a SwaySpec.""" | ||
| 40 | - try: | ||
| 41 | - spec = SwaySpec.model_validate(data) | ||
| 42 | - except ValidationError as exc: | ||
| 43 | - raise SpecValidationError(str(exc), source=source) from exc | ||
| 44 | - try: | ||
| 45 | - spec.check_version() | ||
| 46 | - except ValueError as exc: | ||
| 47 | - raise SpecValidationError(str(exc), source=source) from exc | ||
| 48 | - return spec | ||
sway/src/dlm_sway/suite/report.pydeleted@@ -1,249 +0,0 @@ | |||
| 1 | -"""Report emitters: terminal (rich), JSON, JUnit XML, markdown. | ||
| 2 | - | ||
| 3 | -The terminal renderer is the one a user sees; it's the product surface. | ||
| 4 | -It must communicate the verdict *and* the supporting evidence without | ||
| 5 | -forcing the user to open the JSON. | ||
| 6 | - | ||
| 7 | -JSON is the machine-readable source of truth — same fields as the | ||
| 8 | -:class:`SuiteResult` dataclass but flattened for easy downstream parsing | ||
| 9 | -(dashboards, diff tools, history tracking). | ||
| 10 | - | ||
| 11 | -JUnit XML exists to drop into CI pipelines so ``dlm-sway gate`` | ||
| 12 | -integrates with existing test dashboards with no extra glue. | ||
| 13 | -""" | ||
| 14 | - | ||
| 15 | -from __future__ import annotations | ||
| 16 | - | ||
| 17 | -import json | ||
| 18 | -import xml.etree.ElementTree as ET | ||
| 19 | -from io import StringIO | ||
| 20 | -from typing import Any | ||
| 21 | - | ||
| 22 | -from rich.console import Console | ||
| 23 | -from rich.panel import Panel | ||
| 24 | -from rich.table import Table | ||
| 25 | -from rich.text import Text | ||
| 26 | - | ||
| 27 | -from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict | ||
| 28 | - | ||
| 29 | -_VERDICT_STYLE = { | ||
| 30 | - Verdict.PASS: "bold green", | ||
| 31 | - Verdict.FAIL: "bold red", | ||
| 32 | - Verdict.WARN: "bold yellow", | ||
| 33 | - Verdict.SKIP: "dim", | ||
| 34 | - Verdict.ERROR: "bold magenta", | ||
| 35 | -} | ||
| 36 | - | ||
| 37 | - | ||
| 38 | -def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None: | ||
| 39 | - """Render the report to a rich Console (stdout by default).""" | ||
| 40 | - c = console or Console() | ||
| 41 | - | ||
| 42 | - header = Text.assemble( | ||
| 43 | - ("dlm-sway report — ", "bold"), | ||
| 44 | - (suite.base_model_id, "cyan"), | ||
| 45 | - (" vs ", "dim"), | ||
| 46 | - (_adapter_label(suite.adapter_id), "cyan"), | ||
| 47 | - ) | ||
| 48 | - c.print(Panel(header, expand=False, border_style="blue")) | ||
| 49 | - | ||
| 50 | - c.print() | ||
| 51 | - c.print( | ||
| 52 | - Text.assemble( | ||
| 53 | - ("overall: ", "bold"), | ||
| 54 | - (f"{score.overall:.2f}", _score_style(score.overall)), | ||
| 55 | - (" ", ""), | ||
| 56 | - (f"[ {score.band} ]", _band_style(score.band)), | ||
| 57 | - ) | ||
| 58 | - ) | ||
| 59 | - | ||
| 60 | - # Component breakdown | ||
| 61 | - comp_table = Table.grid(padding=(0, 2)) | ||
| 62 | - comp_table.add_column(justify="left") | ||
| 63 | - comp_table.add_column(justify="right") | ||
| 64 | - comp_table.add_column() | ||
| 65 | - for cat in ("adherence", "attribution", "calibration", "ablation", "baseline"): | ||
| 66 | - if cat not in score.components: | ||
| 67 | - continue | ||
| 68 | - v = score.components[cat] | ||
| 69 | - comp_table.add_row(cat, f"{v:.2f}", _bar(v)) | ||
| 70 | - c.print(comp_table) | ||
| 71 | - | ||
| 72 | - c.print() | ||
| 73 | - # Per-probe detail | ||
| 74 | - detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1)) | ||
| 75 | - detail.add_column("name", style="cyan") | ||
| 76 | - detail.add_column("kind", style="dim") | ||
| 77 | - detail.add_column("verdict") | ||
| 78 | - detail.add_column("score", justify="right") | ||
| 79 | - detail.add_column("raw", justify="right") | ||
| 80 | - detail.add_column("z", justify="right") | ||
| 81 | - detail.add_column("note", style="dim") | ||
| 82 | - for r in suite.probes: | ||
| 83 | - detail.add_row( | ||
| 84 | - r.name, | ||
| 85 | - r.kind, | ||
| 86 | - Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]), | ||
| 87 | - f"{r.score:.2f}" if r.score is not None else "—", | ||
| 88 | - f"{r.raw:.3f}" if r.raw is not None else "—", | ||
| 89 | - f"{r.z_score:+.2f}σ" if r.z_score is not None else "—", | ||
| 90 | - (r.message[:80] + "…") if len(r.message) > 80 else r.message, | ||
| 91 | - ) | ||
| 92 | - c.print(detail) | ||
| 93 | - | ||
| 94 | - if score.findings: | ||
| 95 | - c.print() | ||
| 96 | - c.print(Text("top findings:", style="bold")) | ||
| 97 | - for i, f in enumerate(score.findings, start=1): | ||
| 98 | - c.print(f" {i}. {f}") | ||
| 99 | - | ||
| 100 | - c.print() | ||
| 101 | - c.print(Text(f"wall: {suite.wall_seconds:.2f}s | sway {suite.sway_version}", style="dim")) | ||
| 102 | - | ||
| 103 | - | ||
| 104 | -def to_json(suite: SuiteResult, score: SwayScore) -> str: | ||
| 105 | - """Serialize the suite + composite score as JSON. | ||
| 106 | - | ||
| 107 | - Stable schema; downstream tools rely on it. Breaking changes bump a | ||
| 108 | - ``schema_version`` field (not yet present — this is v0.1). | ||
| 109 | - """ | ||
| 110 | - return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True) | ||
| 111 | - | ||
| 112 | - | ||
| 113 | -def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]: | ||
| 114 | - return { | ||
| 115 | - "schema_version": 1, | ||
| 116 | - "sway_version": suite.sway_version, | ||
| 117 | - "spec_path": suite.spec_path, | ||
| 118 | - "base_model_id": suite.base_model_id, | ||
| 119 | - "adapter_id": suite.adapter_id, | ||
| 120 | - "started_at": suite.started_at.isoformat(), | ||
| 121 | - "finished_at": suite.finished_at.isoformat(), | ||
| 122 | - "wall_seconds": suite.wall_seconds, | ||
| 123 | - "score": { | ||
| 124 | - "overall": score.overall, | ||
| 125 | - "band": score.band, | ||
| 126 | - "components": score.components, | ||
| 127 | - "weights": score.weights, | ||
| 128 | - "findings": list(score.findings), | ||
| 129 | - }, | ||
| 130 | - "null_stats": suite.null_stats, | ||
| 131 | - "probes": [_probe_to_jsonable(p) for p in suite.probes], | ||
| 132 | - } | ||
| 133 | - | ||
| 134 | - | ||
| 135 | -def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]: | ||
| 136 | - return { | ||
| 137 | - "name": r.name, | ||
| 138 | - "kind": r.kind, | ||
| 139 | - "verdict": r.verdict.value, | ||
| 140 | - "score": r.score, | ||
| 141 | - "raw": r.raw, | ||
| 142 | - "z_score": r.z_score, | ||
| 143 | - "base_value": r.base_value, | ||
| 144 | - "ft_value": r.ft_value, | ||
| 145 | - "evidence": r.evidence, | ||
| 146 | - "message": r.message, | ||
| 147 | - "duration_s": r.duration_s, | ||
| 148 | - } | ||
| 149 | - | ||
| 150 | - | ||
| 151 | -def to_junit(suite: SuiteResult, score: SwayScore) -> str: | ||
| 152 | - """Serialize as JUnit XML. One ``<testcase>`` per probe.""" | ||
| 153 | - testsuite = ET.Element( | ||
| 154 | - "testsuite", | ||
| 155 | - { | ||
| 156 | - "name": "dlm-sway", | ||
| 157 | - "tests": str(len(suite.probes)), | ||
| 158 | - "failures": str(sum(1 for p in suite.probes if p.verdict == Verdict.FAIL)), | ||
| 159 | - "errors": str(sum(1 for p in suite.probes if p.verdict == Verdict.ERROR)), | ||
| 160 | - "skipped": str(sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)), | ||
| 161 | - "time": f"{suite.wall_seconds:.3f}", | ||
| 162 | - }, | ||
| 163 | - ) | ||
| 164 | - # Properties — the composite score and category breakdown. | ||
| 165 | - props = ET.SubElement(testsuite, "properties") | ||
| 166 | - ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"}) | ||
| 167 | - ET.SubElement(props, "property", {"name": "band", "value": score.band}) | ||
| 168 | - for cat, v in score.components.items(): | ||
| 169 | - ET.SubElement(props, "property", {"name": f"component.{cat}", "value": f"{v:.4f}"}) | ||
| 170 | - | ||
| 171 | - for r in suite.probes: | ||
| 172 | - tc = ET.SubElement( | ||
| 173 | - testsuite, | ||
| 174 | - "testcase", | ||
| 175 | - {"classname": r.kind, "name": r.name, "time": f"{r.duration_s:.3f}"}, | ||
| 176 | - ) | ||
| 177 | - if r.verdict == Verdict.FAIL: | ||
| 178 | - ET.SubElement(tc, "failure", {"message": r.message or "failed"}) | ||
| 179 | - elif r.verdict == Verdict.ERROR: | ||
| 180 | - ET.SubElement(tc, "error", {"message": r.message or "errored"}) | ||
| 181 | - elif r.verdict == Verdict.SKIP: | ||
| 182 | - ET.SubElement(tc, "skipped", {"message": r.message or "skipped"}) | ||
| 183 | - | ||
| 184 | - return ET.tostring(testsuite, encoding="unicode") | ||
| 185 | - | ||
| 186 | - | ||
| 187 | -def to_markdown(suite: SuiteResult, score: SwayScore) -> str: | ||
| 188 | - """A portable, CI-friendly markdown report.""" | ||
| 189 | - buf = StringIO() | ||
| 190 | - buf.write("# dlm-sway report\n\n") | ||
| 191 | - buf.write(f"**Overall:** {score.overall:.2f} (`{score.band}`) \n") | ||
| 192 | - buf.write(f"**Base:** `{suite.base_model_id}` \n") | ||
| 193 | - buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}` \n") | ||
| 194 | - buf.write(f"**Wall:** {suite.wall_seconds:.2f}s \n\n") | ||
| 195 | - | ||
| 196 | - buf.write("## Components\n\n") | ||
| 197 | - buf.write("| category | score |\n|---|---:|\n") | ||
| 198 | - for cat, v in score.components.items(): | ||
| 199 | - buf.write(f"| {cat} | {v:.2f} |\n") | ||
| 200 | - buf.write("\n## Probes\n\n") | ||
| 201 | - buf.write("| name | kind | verdict | score | note |\n|---|---|---|---:|---|\n") | ||
| 202 | - for r in suite.probes: | ||
| 203 | - buf.write( | ||
| 204 | - f"| {r.name} | `{r.kind}` | {r.verdict.value} | " | ||
| 205 | - f"{f'{r.score:.2f}' if r.score is not None else '—'} | " | ||
| 206 | - f"{r.message[:60]} |\n" | ||
| 207 | - ) | ||
| 208 | - if score.findings: | ||
| 209 | - buf.write("\n## Top findings\n\n") | ||
| 210 | - for f in score.findings: | ||
| 211 | - buf.write(f"- {f}\n") | ||
| 212 | - return buf.getvalue() | ||
| 213 | - | ||
| 214 | - | ||
| 215 | -# -- helpers ----------------------------------------------------------- | ||
| 216 | - | ||
| 217 | - | ||
| 218 | -def _adapter_label(adapter_id: str) -> str: | ||
| 219 | - if not adapter_id: | ||
| 220 | - return "(base only)" | ||
| 221 | - # Only the trailing path chunk is useful in the header. | ||
| 222 | - parts = adapter_id.rstrip("/").split("/") | ||
| 223 | - return "/".join(parts[-3:]) if len(parts) > 3 else adapter_id | ||
| 224 | - | ||
| 225 | - | ||
| 226 | -def _score_style(v: float) -> str: | ||
| 227 | - if v >= 0.6: | ||
| 228 | - return "bold green" | ||
| 229 | - if v >= 0.3: | ||
| 230 | - return "bold yellow" | ||
| 231 | - return "bold red" | ||
| 232 | - | ||
| 233 | - | ||
| 234 | -def _band_style(band: str) -> str: | ||
| 235 | - return { | ||
| 236 | - "noise": "red", | ||
| 237 | - "partial": "yellow", | ||
| 238 | - "healthy": "green", | ||
| 239 | - "suspicious": "magenta", | ||
| 240 | - }.get(band, "white") | ||
| 241 | - | ||
| 242 | - | ||
| 243 | -def _bar(v: float, *, width: int = 10) -> str: | ||
| 244 | - clamped = max(0.0, min(1.0, v)) | ||
| 245 | - filled = int(round(clamped * width)) | ||
| 246 | - return "█" * filled + "░" * (width - filled) | ||
| 247 | - | ||
| 248 | - | ||
| 249 | -__all__ = ["to_terminal", "to_json", "to_junit", "to_markdown"] | ||
sway/src/dlm_sway/suite/runner.pydeleted@@ -1,136 +0,0 @@ | |||
| 1 | -"""Suite runner. | ||
| 2 | - | ||
| 3 | -Iterates the probe list, materializes each into a ``(Probe, Spec)`` via | ||
| 4 | -the registry, executes it with a :class:`~dlm_sway.probes.base.RunContext`, | ||
| 5 | -and assembles a :class:`~dlm_sway.core.result.SuiteResult`. | ||
| 6 | - | ||
| 7 | -Runtime contract: | ||
| 8 | - | ||
| 9 | -- Probes are executed in declaration order (not sorted, not parallelized). | ||
| 10 | - The null-adapter baseline has to run before any probe that needs z-scores, | ||
| 11 | - so authoring order is load-bearing. | ||
| 12 | -- A probe that raises is recorded as | ||
| 13 | - :attr:`~dlm_sway.core.result.Verdict.ERROR` and the suite continues — | ||
| 14 | - one broken probe doesn't torch the whole report. | ||
| 15 | -- The backend is the caller's responsibility: the runner does not build | ||
| 16 | - or close it, so callers can reuse a backend across multiple suites. | ||
| 17 | -""" | ||
| 18 | - | ||
| 19 | -from __future__ import annotations | ||
| 20 | - | ||
| 21 | -import time | ||
| 22 | - | ||
| 23 | -from dlm_sway import __version__ | ||
| 24 | -from dlm_sway.core.errors import ProbeError | ||
| 25 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | ||
| 26 | -from dlm_sway.core.scoring import DifferentialBackend | ||
| 27 | -from dlm_sway.core.sections import Section | ||
| 28 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 29 | -from dlm_sway.probes.null_adapter import NullAdapterSpec, get_null_stats | ||
| 30 | -from dlm_sway.suite.spec import SwaySpec | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -def run( | ||
| 34 | - spec: SwaySpec, | ||
| 35 | - backend: DifferentialBackend, | ||
| 36 | - *, | ||
| 37 | - spec_path: str = "<memory>", | ||
| 38 | - doc_text: str | None = None, | ||
| 39 | - sections: tuple[Section, ...] | None = None, | ||
| 40 | -) -> SuiteResult: | ||
| 41 | - """Execute every probe in ``spec`` against ``backend``.""" | ||
| 42 | - started = utcnow() | ||
| 43 | - ctx = RunContext( | ||
| 44 | - backend=backend, | ||
| 45 | - seed=spec.defaults.seed, | ||
| 46 | - top_k=spec.defaults.top_k, | ||
| 47 | - sections=sections, | ||
| 48 | - doc_text=doc_text, | ||
| 49 | - ) | ||
| 50 | - | ||
| 51 | - results: list[ProbeResult] = [] | ||
| 52 | - null_stats: dict[str, dict[str, float]] = {} | ||
| 53 | - | ||
| 54 | - for raw in spec.suite: | ||
| 55 | - probe, probe_spec = build_probe(raw) | ||
| 56 | - if not probe_spec.enabled: | ||
| 57 | - results.append( | ||
| 58 | - ProbeResult( | ||
| 59 | - name=probe_spec.name, | ||
| 60 | - kind=probe_spec.kind, | ||
| 61 | - verdict=Verdict.SKIP, | ||
| 62 | - score=None, | ||
| 63 | - message="disabled in spec", | ||
| 64 | - ) | ||
| 65 | - ) | ||
| 66 | - continue | ||
| 67 | - | ||
| 68 | - t0 = time.perf_counter() | ||
| 69 | - try: | ||
| 70 | - result = probe.run(probe_spec, ctx) | ||
| 71 | - except ProbeError as exc: | ||
| 72 | - result = ProbeResult( | ||
| 73 | - name=probe_spec.name, | ||
| 74 | - kind=probe_spec.kind, | ||
| 75 | - verdict=Verdict.ERROR, | ||
| 76 | - score=None, | ||
| 77 | - message=str(exc), | ||
| 78 | - ) | ||
| 79 | - except Exception as exc: # noqa: BLE001 — probe impls may raise anything | ||
| 80 | - result = ProbeResult( | ||
| 81 | - name=probe_spec.name, | ||
| 82 | - kind=probe_spec.kind, | ||
| 83 | - verdict=Verdict.ERROR, | ||
| 84 | - score=None, | ||
| 85 | - message=f"{type(exc).__name__}: {exc}", | ||
| 86 | - ) | ||
| 87 | - duration = time.perf_counter() - t0 | ||
| 88 | - # Re-stamp duration (probes don't know their own wall time). | ||
| 89 | - result = _with_duration(result, duration) | ||
| 90 | - results.append(result) | ||
| 91 | - | ||
| 92 | - # Null-adapter result seeds ctx.null_stats for subsequent probes. | ||
| 93 | - if isinstance(probe_spec, NullAdapterSpec) and result.evidence.get("null_stats"): | ||
| 94 | - null_stats.update(result.evidence["null_stats"]) | ||
| 95 | - # RunContext is frozen; swap in a fresh one so later probes | ||
| 96 | - # see the populated stats. | ||
| 97 | - ctx = RunContext( | ||
| 98 | - backend=ctx.backend, | ||
| 99 | - seed=ctx.seed, | ||
| 100 | - top_k=ctx.top_k, | ||
| 101 | - sections=ctx.sections, | ||
| 102 | - doc_text=ctx.doc_text, | ||
| 103 | - null_stats=null_stats, | ||
| 104 | - ) | ||
| 105 | - | ||
| 106 | - finished = utcnow() | ||
| 107 | - return SuiteResult( | ||
| 108 | - spec_path=spec_path, | ||
| 109 | - started_at=started, | ||
| 110 | - finished_at=finished, | ||
| 111 | - base_model_id=spec.models.base.base, | ||
| 112 | - adapter_id=str(spec.models.ft.adapter) if spec.models.ft.adapter else "", | ||
| 113 | - sway_version=__version__, | ||
| 114 | - probes=tuple(results), | ||
| 115 | - null_stats=null_stats, | ||
| 116 | - ) | ||
| 117 | - | ||
| 118 | - | ||
| 119 | -def _with_duration(result: ProbeResult, duration: float) -> ProbeResult: | ||
| 120 | - """Return a copy of ``result`` with :attr:`ProbeResult.duration_s` set.""" | ||
| 121 | - return ProbeResult( | ||
| 122 | - name=result.name, | ||
| 123 | - kind=result.kind, | ||
| 124 | - verdict=result.verdict, | ||
| 125 | - score=result.score, | ||
| 126 | - raw=result.raw, | ||
| 127 | - z_score=result.z_score, | ||
| 128 | - base_value=result.base_value, | ||
| 129 | - ft_value=result.ft_value, | ||
| 130 | - evidence=result.evidence, | ||
| 131 | - message=result.message, | ||
| 132 | - duration_s=duration, | ||
| 133 | - ) | ||
| 134 | - | ||
| 135 | - | ||
| 136 | -__all__ = ["get_null_stats", "run"] | ||
sway/src/dlm_sway/suite/score.pydeleted@@ -1,106 +0,0 @@ | |||
| 1 | -"""Composite :class:`~dlm_sway.core.result.SwayScore` from a suite result. | ||
| 2 | - | ||
| 3 | -The score is a weighted mean over four categories | ||
| 4 | -(adherence / attribution / calibration / ablation). Each category's | ||
| 5 | -value is the weighted mean of its pass/score values (with SKIP/ERROR | ||
| 6 | -excluded so a broken probe doesn't silently depress the composite). | ||
| 7 | - | ||
| 8 | -All weighting is explicit, user-overridable, and surfaced in the report | ||
| 9 | -alongside the number — no black-box scoring. | ||
| 10 | -""" | ||
| 11 | - | ||
| 12 | -from __future__ import annotations | ||
| 13 | - | ||
| 14 | -from dlm_sway.core.result import ( | ||
| 15 | - DEFAULT_COMPONENT_WEIGHTS, | ||
| 16 | - ProbeResult, | ||
| 17 | - SuiteResult, | ||
| 18 | - SwayScore, | ||
| 19 | - Verdict, | ||
| 20 | -) | ||
| 21 | -from dlm_sway.probes.base import registry | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def compute( | ||
| 25 | - suite: SuiteResult, | ||
| 26 | - *, | ||
| 27 | - weights: dict[str, float] | None = None, | ||
| 28 | -) -> SwayScore: | ||
| 29 | - """Fold a :class:`SuiteResult` into a :class:`SwayScore`.""" | ||
| 30 | - w = weights if weights is not None else dict(DEFAULT_COMPONENT_WEIGHTS) | ||
| 31 | - registered = registry() | ||
| 32 | - | ||
| 33 | - # Bucket probes by their declared category. | ||
| 34 | - buckets: dict[str, list[ProbeResult]] = {k: [] for k in w} | ||
| 35 | - for r in suite.probes: | ||
| 36 | - if r.verdict in {Verdict.SKIP, Verdict.ERROR}: | ||
| 37 | - continue | ||
| 38 | - if r.score is None: | ||
| 39 | - continue | ||
| 40 | - probe_cls = registered.get(r.kind) | ||
| 41 | - category = probe_cls.category if probe_cls is not None else "adherence" | ||
| 42 | - buckets.setdefault(category, []).append(r) | ||
| 43 | - | ||
| 44 | - component_scores: dict[str, float] = {} | ||
| 45 | - for cat, probes in buckets.items(): | ||
| 46 | - if not probes: | ||
| 47 | - component_scores[cat] = 0.0 | ||
| 48 | - continue | ||
| 49 | - total_w = sum(max(_spec_weight(p), 0.0) for p in probes) or 1.0 | ||
| 50 | - weighted = sum(max(_spec_weight(p), 0.0) * (p.score or 0.0) for p in probes) | ||
| 51 | - component_scores[cat] = weighted / total_w | ||
| 52 | - | ||
| 53 | - # Fold to composite, weighted by the user's category weights, but | ||
| 54 | - # ignoring components that had no contributing probes (so a | ||
| 55 | - # PREFERENCE-free document doesn't get penalized for missing B3). | ||
| 56 | - active_weights = {k: v for k, v in w.items() if buckets.get(k)} | ||
| 57 | - total_w = sum(active_weights.values()) or 1.0 | ||
| 58 | - overall = sum(active_weights[k] * component_scores[k] for k in active_weights) / total_w | ||
| 59 | - | ||
| 60 | - findings = _findings(suite, component_scores) | ||
| 61 | - | ||
| 62 | - return SwayScore( | ||
| 63 | - overall=overall, | ||
| 64 | - components=component_scores, | ||
| 65 | - weights=w, | ||
| 66 | - band=SwayScore.band_for(overall), | ||
| 67 | - findings=findings, | ||
| 68 | - ) | ||
| 69 | - | ||
| 70 | - | ||
| 71 | -def _spec_weight(result: ProbeResult) -> float: | ||
| 72 | - """Recover a probe's declared weight from its ``evidence`` payload. | ||
| 73 | - | ||
| 74 | - The runner stores ``spec.weight`` on evidence so the scorer can read | ||
| 75 | - it without re-validating specs. Falls back to 1.0 when absent (older | ||
| 76 | - runs, custom probes, etc). | ||
| 77 | - """ | ||
| 78 | - w = result.evidence.get("weight") | ||
| 79 | - if isinstance(w, int | float): | ||
| 80 | - return float(w) | ||
| 81 | - return 1.0 | ||
| 82 | - | ||
| 83 | - | ||
| 84 | -def _findings(suite: SuiteResult, components: dict[str, float]) -> tuple[str, ...]: | ||
| 85 | - """Surface the 2–3 most diagnostic notes for the terminal report.""" | ||
| 86 | - notes: list[str] = [] | ||
| 87 | - | ||
| 88 | - failed = [r for r in suite.probes if r.verdict == Verdict.FAIL] | ||
| 89 | - if failed: | ||
| 90 | - top = failed[0] | ||
| 91 | - notes.append( | ||
| 92 | - f"{top.name} ({top.kind}) failed" + (f": {top.message}" if top.message else "") | ||
| 93 | - ) | ||
| 94 | - | ||
| 95 | - for cat, score in components.items(): | ||
| 96 | - if score < 0.3 and components.get(cat, 1.0) != 0.0: | ||
| 97 | - notes.append(f"{cat} score is {score:.2f} — below the noise threshold") | ||
| 98 | - | ||
| 99 | - errors = [r for r in suite.probes if r.verdict == Verdict.ERROR] | ||
| 100 | - if errors: | ||
| 101 | - notes.append(f"{len(errors)} probe(s) errored — see full report for details") | ||
| 102 | - | ||
| 103 | - return tuple(notes[:5]) | ||
| 104 | - | ||
| 105 | - | ||
| 106 | -__all__ = ["compute"] | ||
sway/src/dlm_sway/suite/spec.pydeleted@@ -1,72 +0,0 @@ | |||
| 1 | -"""Top-level ``sway.yaml`` spec models. | ||
| 2 | - | ||
| 3 | -Per-probe specs live next to their implementations in | ||
| 4 | -:mod:`dlm_sway.probes`. This module owns the *outer* envelope — | ||
| 5 | -``version``, ``models``, ``defaults``, ``suite`` — plus the runtime | ||
| 6 | -bind between raw probe dicts and registered probe classes. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -from typing import Annotated, Any | ||
| 12 | - | ||
| 13 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 14 | - | ||
| 15 | -from dlm_sway.core.model import ModelSpec | ||
| 16 | - | ||
| 17 | -SUPPORTED_VERSION = 1 | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -class SuiteModels(BaseModel): | ||
| 21 | - """Named model handles the suite references — ``base`` + ``ft``.""" | ||
| 22 | - | ||
| 23 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 24 | - | ||
| 25 | - base: ModelSpec | ||
| 26 | - ft: ModelSpec | ||
| 27 | - | ||
| 28 | - | ||
| 29 | -class SuiteDefaults(BaseModel): | ||
| 30 | - """Shared defaults for the whole suite. Probes may override per-entry.""" | ||
| 31 | - | ||
| 32 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 33 | - | ||
| 34 | - seed: int = 0 | ||
| 35 | - top_k: int = 256 | ||
| 36 | - differential: bool = True | ||
| 37 | - """If ``False``, the runner loads base + ft as two separate models | ||
| 38 | - instead of toggling on one. More memory-heavy; only useful when a | ||
| 39 | - backend can't do in-place toggling.""" | ||
| 40 | - coverage_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6 | ||
| 41 | - """Minimum composite score for ``dlm-sway gate`` to pass.""" | ||
| 42 | - | ||
| 43 | - | ||
| 44 | -class SwaySpec(BaseModel): | ||
| 45 | - """Root of ``sway.yaml``.""" | ||
| 46 | - | ||
| 47 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 48 | - | ||
| 49 | - version: int = 1 | ||
| 50 | - models: SuiteModels | ||
| 51 | - defaults: SuiteDefaults = SuiteDefaults() | ||
| 52 | - suite: list[dict[str, Any]] = Field(default_factory=list) | ||
| 53 | - """Raw probe entries. Validated one-at-a-time by the probe registry | ||
| 54 | - via :func:`dlm_sway.probes.base.build_probe` so that the set of | ||
| 55 | - allowed probe kinds is an open registry rather than a closed | ||
| 56 | - discriminated union.""" | ||
| 57 | - dlm_source: str | None = None | ||
| 58 | - """Optional path to a ``.dlm`` file. When present, the runner asks | ||
| 59 | - :mod:`dlm_sway.integrations.dlm.resolver` for typed sections and | ||
| 60 | - hands them to probes via :attr:`RunContext.sections`. Auto-populated | ||
| 61 | - by ``dlm-sway autogen``.""" | ||
| 62 | - | ||
| 63 | - def check_version(self) -> None: | ||
| 64 | - """Raise ``ValueError`` if the spec version is unsupported. | ||
| 65 | - | ||
| 66 | - Called explicitly by the loader after validation so the error | ||
| 67 | - surfaces with a loader-source tag rather than a pydantic stack. | ||
| 68 | - """ | ||
| 69 | - if self.version != SUPPORTED_VERSION: | ||
| 70 | - raise ValueError( | ||
| 71 | - f"unsupported sway spec version: {self.version} (this build supports {SUPPORTED_VERSION})" | ||
| 72 | - ) | ||
sway/src/dlm_sway/visualize.pydeleted@@ -1,137 +0,0 @@ | |||
| 1 | -"""Optional matplotlib-based visualizations. | ||
| 2 | - | ||
| 3 | -Behind the ``viz`` extra. Three functions cover the three plots that | ||
| 4 | -make the sway report come alive in a notebook or saved PNG: | ||
| 5 | - | ||
| 6 | -- :func:`plot_section_sis`: per-section bar chart of effective SIS | ||
| 7 | - (the flagship attribution view). | ||
| 8 | -- :func:`plot_adapter_ablation`: the λ-scaled divergence curve — the | ||
| 9 | - sway signature plot. | ||
| 10 | -- :func:`plot_kl_histogram`: distribution of per-prompt KL divergences | ||
| 11 | - (the raw data behind A1 DeltaKL). | ||
| 12 | - | ||
| 13 | -Each function raises :class:`~dlm_sway.core.errors.BackendNotAvailableError` | ||
| 14 | -with a pip hint when matplotlib isn't installed. No function writes to | ||
| 15 | -disk on your behalf — the caller decides (``fig.savefig(...)``). | ||
| 16 | -""" | ||
| 17 | - | ||
| 18 | -from __future__ import annotations | ||
| 19 | - | ||
| 20 | -from typing import Any | ||
| 21 | - | ||
| 22 | -from dlm_sway.core.errors import BackendNotAvailableError | ||
| 23 | -from dlm_sway.core.result import SuiteResult | ||
| 24 | - | ||
| 25 | - | ||
| 26 | -def _require_mpl() -> Any: | ||
| 27 | - try: | ||
| 28 | - import matplotlib.pyplot as plt | ||
| 29 | - | ||
| 30 | - return plt | ||
| 31 | - except ImportError as exc: | ||
| 32 | - raise BackendNotAvailableError( | ||
| 33 | - "visualize", | ||
| 34 | - extra="viz", | ||
| 35 | - hint="sway's visualization module needs matplotlib.", | ||
| 36 | - ) from exc | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def plot_section_sis(suite: SuiteResult) -> Any: | ||
| 40 | - """Render a per-section ``effective_sis`` bar chart. | ||
| 41 | - | ||
| 42 | - Returns the matplotlib ``Figure``; the caller handles display / save. | ||
| 43 | - """ | ||
| 44 | - plt = _require_mpl() | ||
| 45 | - | ||
| 46 | - probe = _find_probe(suite, "section_internalization") | ||
| 47 | - if probe is None or not probe.evidence.get("per_section"): | ||
| 48 | - raise ValueError("suite has no section_internalization evidence to plot") | ||
| 49 | - | ||
| 50 | - rows: list[dict[str, Any]] = list(probe.evidence["per_section"]) | ||
| 51 | - labels = [f"{row['tag'] or row['section_id'][:8]}\n({row['kind']})" for row in rows] | ||
| 52 | - values = [float(row["effective_sis"]) for row in rows] | ||
| 53 | - colors = ["#2ca02c" if row["passed"] else "#d62728" for row in rows] | ||
| 54 | - | ||
| 55 | - fig, ax = plt.subplots(figsize=(max(6.0, 0.7 * len(rows)), 4.0)) | ||
| 56 | - ax.bar(range(len(rows)), values, color=colors) | ||
| 57 | - ax.axhline( | ||
| 58 | - float(probe.evidence.get("per_section_threshold", 0.0)), | ||
| 59 | - color="gray", | ||
| 60 | - linestyle="--", | ||
| 61 | - linewidth=1, | ||
| 62 | - label="threshold", | ||
| 63 | - ) | ||
| 64 | - ax.set_xticks(range(len(rows))) | ||
| 65 | - ax.set_xticklabels(labels, rotation=30, ha="right") | ||
| 66 | - ax.set_ylabel("effective SIS") | ||
| 67 | - ax.set_title("Section Internalization Score") | ||
| 68 | - ax.legend(loc="best") | ||
| 69 | - fig.tight_layout() | ||
| 70 | - return fig | ||
| 71 | - | ||
| 72 | - | ||
| 73 | -def plot_adapter_ablation(suite: SuiteResult) -> Any: | ||
| 74 | - """Render the signature λ-scaled divergence curve.""" | ||
| 75 | - plt = _require_mpl() | ||
| 76 | - | ||
| 77 | - probe = _find_probe(suite, "adapter_ablation") | ||
| 78 | - if probe is None or not probe.evidence.get("lambdas"): | ||
| 79 | - raise ValueError("suite has no adapter_ablation evidence to plot") | ||
| 80 | - | ||
| 81 | - lambdas = list(probe.evidence["lambdas"]) | ||
| 82 | - divs = list(probe.evidence["mean_divergence_per_lambda"]) | ||
| 83 | - | ||
| 84 | - fig, ax = plt.subplots(figsize=(7.0, 4.0)) | ||
| 85 | - ax.plot(lambdas, divs, marker="o", linewidth=2, color="#1f77b4") | ||
| 86 | - ax.axvline(1.0, color="gray", linestyle=":", linewidth=1, label="λ=1 (trained)") | ||
| 87 | - sat = probe.evidence.get("saturation_lambda") | ||
| 88 | - if sat is not None: | ||
| 89 | - ax.axvline( | ||
| 90 | - float(sat), | ||
| 91 | - color="#2ca02c", | ||
| 92 | - linestyle="--", | ||
| 93 | - linewidth=1, | ||
| 94 | - label=f"sat λ={float(sat):.2f}", | ||
| 95 | - ) | ||
| 96 | - ax.set_xlabel("λ (adapter scale)") | ||
| 97 | - ax.set_ylabel("mean JS divergence vs λ=0") | ||
| 98 | - ax.set_title( | ||
| 99 | - f"Adapter Ablation (R²={float(probe.evidence.get('linearity', 0.0)):.2f}, " | ||
| 100 | - f"overshoot={float(probe.evidence.get('overshoot', 0.0)):.2f})" | ||
| 101 | - ) | ||
| 102 | - ax.legend(loc="best") | ||
| 103 | - fig.tight_layout() | ||
| 104 | - return fig | ||
| 105 | - | ||
| 106 | - | ||
| 107 | -def plot_kl_histogram(suite: SuiteResult) -> Any: | ||
| 108 | - """Render the per-prompt KL distribution from a DeltaKL probe.""" | ||
| 109 | - plt = _require_mpl() | ||
| 110 | - | ||
| 111 | - probe = _find_probe(suite, "delta_kl") | ||
| 112 | - if probe is None or not probe.evidence.get("per_prompt"): | ||
| 113 | - raise ValueError("suite has no delta_kl evidence to plot") | ||
| 114 | - | ||
| 115 | - values = list(probe.evidence["per_prompt"]) | ||
| 116 | - fig, ax = plt.subplots(figsize=(7.0, 4.0)) | ||
| 117 | - ax.hist(values, bins=max(5, min(20, len(values) // 2)), color="#ff7f0e", edgecolor="white") | ||
| 118 | - ax.axvline( | ||
| 119 | - float(probe.raw or 0.0), | ||
| 120 | - color="black", | ||
| 121 | - linestyle="--", | ||
| 122 | - linewidth=1, | ||
| 123 | - label=f"mean={float(probe.raw or 0.0):.3f}", | ||
| 124 | - ) | ||
| 125 | - ax.set_xlabel(probe.evidence.get("divergence_kind", "divergence")) | ||
| 126 | - ax.set_ylabel("count") | ||
| 127 | - ax.set_title("DeltaKL — per-prompt distribution") | ||
| 128 | - ax.legend(loc="best") | ||
| 129 | - fig.tight_layout() | ||
| 130 | - return fig | ||
| 131 | - | ||
| 132 | - | ||
| 133 | -def _find_probe(suite: SuiteResult, kind: str) -> Any: | ||
| 134 | - for p in suite.probes: | ||
| 135 | - if p.kind == kind: | ||
| 136 | - return p | ||
| 137 | - return None | ||
sway/tests/__init__.pydeletedsway/tests/conftest.pydeleted@@ -1,29 +0,0 @@ | |||
| 1 | -"""Shared test fixtures. | ||
| 2 | - | ||
| 3 | -Keep the default fast-test environment offline and deterministic so unit | ||
| 4 | -tests stay below ~1 s per file. Integration tests override these via | ||
| 5 | -their own ``conftest`` when they need network access. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -import pytest | ||
| 11 | - | ||
| 12 | -# Import the probes package once so every shipped probe registers itself | ||
| 13 | -# with the central registry. Tests that exercise build_probe("delta_kl", | ||
| 14 | -# …) rely on this. | ||
| 15 | -import dlm_sway.probes # noqa: F401 | ||
| 16 | - | ||
| 17 | - | ||
| 18 | -@pytest.fixture(autouse=True) | ||
| 19 | -def _offline_and_no_telemetry(monkeypatch: pytest.MonkeyPatch) -> None: | ||
| 20 | - """Unit tests never touch the network. | ||
| 21 | - | ||
| 22 | - Any backend test that needs HF should be marked ``@pytest.mark.online`` | ||
| 23 | - and clear these vars explicitly. | ||
| 24 | - """ | ||
| 25 | - monkeypatch.setenv("HF_HUB_OFFLINE", "1") | ||
| 26 | - monkeypatch.setenv("TRANSFORMERS_OFFLINE", "1") | ||
| 27 | - monkeypatch.setenv("HF_DATASETS_OFFLINE", "1") | ||
| 28 | - monkeypatch.setenv("HF_HUB_DISABLE_TELEMETRY", "1") | ||
| 29 | - monkeypatch.setenv("DO_NOT_TRACK", "1") | ||
sway/tests/fixtures/__init__.pydeletedsway/tests/fixtures/tiny_model.pydeleted@@ -1,53 +0,0 @@ | |||
| 1 | -"""Tiny-model fixture for integration tests. | ||
| 2 | - | ||
| 3 | -Mirrors ``dlm.tests.fixtures.tiny_model``: session-scoped snapshot of | ||
| 4 | -SmolLM2-135M-Instruct, reused across the whole test run. The model is | ||
| 5 | -small enough (~280 MB on disk, ~600 MB in fp32 VRAM) to make integration | ||
| 6 | -tests feasible in CI. | ||
| 7 | - | ||
| 8 | -Tests using this fixture must carry ``@pytest.mark.slow`` and | ||
| 9 | -``@pytest.mark.online`` — the default test selection excludes both. | ||
| 10 | -""" | ||
| 11 | - | ||
| 12 | -from __future__ import annotations | ||
| 13 | - | ||
| 14 | -import os | ||
| 15 | -from collections.abc import Iterator | ||
| 16 | -from pathlib import Path | ||
| 17 | - | ||
| 18 | -import pytest | ||
| 19 | - | ||
| 20 | -TINY_MODEL_HF_ID = "HuggingFaceTB/SmolLM2-135M-Instruct" | ||
| 21 | -TINY_MODEL_REVISION = os.environ.get("DLM_SWAY_TINY_MODEL_REVISION", "main") | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def _offline_mode() -> bool: | ||
| 25 | - return os.environ.get("SWAY_OFFLINE", "0") == "1" | ||
| 26 | - | ||
| 27 | - | ||
| 28 | -@pytest.fixture(scope="session") | ||
| 29 | -def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]: | ||
| 30 | - """Download (or reuse) the tiny model; yield the cached directory. | ||
| 31 | - | ||
| 32 | - Test opts in via ``@pytest.mark.online`` — the session-wide offline | ||
| 33 | - env vars are cleared inside this fixture so ``snapshot_download`` | ||
| 34 | - actually fetches. | ||
| 35 | - """ | ||
| 36 | - from huggingface_hub import snapshot_download | ||
| 37 | - | ||
| 38 | - # Clear offline env guards (set by the unit-test autouse fixture). | ||
| 39 | - prior = { | ||
| 40 | - k: os.environ.pop(k, None) | ||
| 41 | - for k in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE") | ||
| 42 | - } | ||
| 43 | - try: | ||
| 44 | - path = snapshot_download( | ||
| 45 | - repo_id=TINY_MODEL_HF_ID, | ||
| 46 | - revision=TINY_MODEL_REVISION, | ||
| 47 | - local_files_only=_offline_mode(), | ||
| 48 | - ) | ||
| 49 | - yield Path(path) | ||
| 50 | - finally: | ||
| 51 | - for k, v in prior.items(): | ||
| 52 | - if v is not None: | ||
| 53 | - os.environ[k] = v | ||
sway/tests/integration/__init__.pydeletedsway/tests/integration/conftest.pydeleted@@ -1,10 +0,0 @@ | |||
| 1 | -"""Integration-test configuration. | ||
| 2 | - | ||
| 3 | -Integration tests need network + heavy deps. Re-export the tiny_model | ||
| 4 | -fixture here so test modules can pick it up without a long import | ||
| 5 | -path. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -from tests.fixtures.tiny_model import tiny_model_dir # noqa: F401 — re-export | ||
sway/tests/integration/test_hf_adapter_toggle.pydeleted@@ -1,113 +0,0 @@ | |||
| 1 | -"""Integration test: PEFT ``disable_adapter`` actually changes logits. | ||
| 2 | - | ||
| 3 | -This is the load-bearing sanity check for the whole differential design. | ||
| 4 | -If a future ``peft`` release subtly breaks the disable-context semantics, | ||
| 5 | -sway's KL / SIS / ablation probes would all silently report zero signal. | ||
| 6 | -We catch that here, before the rest of the test battery runs. | ||
| 7 | - | ||
| 8 | -The test builds a random-init LoRA adapter on a tiny model so no network | ||
| 9 | -dependency beyond the base model snapshot itself. | ||
| 10 | -""" | ||
| 11 | - | ||
| 12 | -from __future__ import annotations | ||
| 13 | - | ||
| 14 | -from pathlib import Path | ||
| 15 | - | ||
| 16 | -import pytest | ||
| 17 | - | ||
| 18 | -from dlm_sway.backends.hf import HuggingFaceDifferentialBackend | ||
| 19 | -from dlm_sway.core.model import ModelSpec | ||
| 20 | - | ||
| 21 | -pytestmark = [pytest.mark.slow, pytest.mark.online] | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None: | ||
| 25 | - """Construct a LoRA adapter with random-init weights on ``base_dir``. | ||
| 26 | - | ||
| 27 | - The weights are kept small so the toggle-delta is clear but the | ||
| 28 | - adapter is structurally valid (correct ``adapter_config.json``, | ||
| 29 | - tokenizer files, safetensors layout). | ||
| 30 | - """ | ||
| 31 | - import torch | ||
| 32 | - from peft import LoraConfig, get_peft_model | ||
| 33 | - from transformers import AutoModelForCausalLM, AutoTokenizer | ||
| 34 | - | ||
| 35 | - torch.manual_seed(0) | ||
| 36 | - | ||
| 37 | - tokenizer = AutoTokenizer.from_pretrained(str(base_dir)) | ||
| 38 | - if tokenizer.pad_token_id is None: | ||
| 39 | - tokenizer.pad_token = tokenizer.eos_token | ||
| 40 | - base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32) | ||
| 41 | - | ||
| 42 | - cfg = LoraConfig( | ||
| 43 | - r=8, | ||
| 44 | - lora_alpha=16, | ||
| 45 | - target_modules=["q_proj", "v_proj"], | ||
| 46 | - lora_dropout=0.0, | ||
| 47 | - bias="none", | ||
| 48 | - task_type="CAUSAL_LM", | ||
| 49 | - ) | ||
| 50 | - peft_model = get_peft_model(base, cfg) | ||
| 51 | - | ||
| 52 | - # Explicitly scale lora_B out of its PEFT-default zero-init so the | ||
| 53 | - # adapter actually changes outputs. Real training does this via | ||
| 54 | - # gradients; we do it with a scaled normal. | ||
| 55 | - with torch.no_grad(): | ||
| 56 | - for name, param in peft_model.named_parameters(): | ||
| 57 | - if "lora_B" in name: | ||
| 58 | - param.copy_(torch.randn_like(param) * 0.05) | ||
| 59 | - | ||
| 60 | - peft_model.save_pretrained(str(out_dir)) | ||
| 61 | - tokenizer.save_pretrained(str(out_dir)) | ||
| 62 | - | ||
| 63 | - | ||
| 64 | -@pytest.fixture(scope="module") | ||
| 65 | -def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path: | ||
| 66 | - adapter_dir = tmp_path_factory.mktemp("random-adapter") | ||
| 67 | - _build_random_lora_adapter(tiny_model_dir, adapter_dir) | ||
| 68 | - return adapter_dir | ||
| 69 | - | ||
| 70 | - | ||
| 71 | -def test_disable_adapter_changes_logits(tiny_model_dir: Path, random_adapter: Path) -> None: | ||
| 72 | - """The keystone invariant: base view ≠ ft view on the same prompt.""" | ||
| 73 | - import numpy as np | ||
| 74 | - | ||
| 75 | - backend = HuggingFaceDifferentialBackend( | ||
| 76 | - base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"), | ||
| 77 | - adapter_path=random_adapter, | ||
| 78 | - ) | ||
| 79 | - try: | ||
| 80 | - prompt = "The quick brown fox" | ||
| 81 | - with backend.as_base() as b: | ||
| 82 | - base_dist = b.next_token_dist(prompt, top_k=32) | ||
| 83 | - with backend.as_finetuned() as f: | ||
| 84 | - ft_dist = f.next_token_dist(prompt, top_k=32) | ||
| 85 | - | ||
| 86 | - # Top-k indices may shift under the adapter; take a safe shared | ||
| 87 | - # subset instead of asserting identical ordering. | ||
| 88 | - assert not np.array_equal(base_dist.token_ids, ft_dist.token_ids) or not np.allclose( | ||
| 89 | - base_dist.logprobs, ft_dist.logprobs, atol=1e-5 | ||
| 90 | - ), "adapter toggle did not change next-token distribution" | ||
| 91 | - finally: | ||
| 92 | - backend.close() | ||
| 93 | - | ||
| 94 | - | ||
| 95 | -def test_roundtrip_toggle_restores_base(tiny_model_dir: Path, random_adapter: Path) -> None: | ||
| 96 | - """as_base → as_finetuned → as_base yields a stable base view.""" | ||
| 97 | - import numpy as np | ||
| 98 | - | ||
| 99 | - backend = HuggingFaceDifferentialBackend( | ||
| 100 | - base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"), | ||
| 101 | - adapter_path=random_adapter, | ||
| 102 | - ) | ||
| 103 | - try: | ||
| 104 | - prompt = "hello" | ||
| 105 | - with backend.as_base() as b: | ||
| 106 | - first = b.next_token_dist(prompt, top_k=16).logprobs | ||
| 107 | - with backend.as_finetuned() as f: | ||
| 108 | - f.next_token_dist(prompt, top_k=16) # toggle | ||
| 109 | - with backend.as_base() as b: | ||
| 110 | - second = b.next_token_dist(prompt, top_k=16).logprobs | ||
| 111 | - np.testing.assert_allclose(first, second, rtol=1e-5, atol=1e-6) | ||
| 112 | - finally: | ||
| 113 | - backend.close() | ||
sway/tests/unit/__init__.pydeletedsway/tests/unit/test_backend_dummy.pydeleted@@ -1,102 +0,0 @@ | |||
| 1 | -"""Tests for :class:`dlm_sway.backends.dummy.DummyDifferentialBackend`. | ||
| 2 | - | ||
| 3 | -The dummy backend is used by every downstream probe unit test, so it | ||
| 4 | -gets a thorough own-right test here. Also verifies the view-exclusion | ||
| 5 | -invariant that catches stale-view bugs in probes. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -import numpy as np | ||
| 11 | -import pytest | ||
| 12 | - | ||
| 13 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 14 | -from dlm_sway.core.model import Model | ||
| 15 | -from dlm_sway.core.scoring import DifferentialBackend, ScoringBackend | ||
| 16 | - | ||
| 17 | - | ||
| 18 | -@pytest.fixture | ||
| 19 | -def backend() -> DummyDifferentialBackend: | ||
| 20 | - base = DummyResponses( | ||
| 21 | - generations={"hi": "hello"}, | ||
| 22 | - logprobs={("q", "a"): -3.0}, | ||
| 23 | - ) | ||
| 24 | - ft = DummyResponses( | ||
| 25 | - generations={"hi": "greetings, traveler"}, | ||
| 26 | - logprobs={("q", "a"): -1.2}, | ||
| 27 | - ) | ||
| 28 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -class TestViews: | ||
| 32 | - def test_as_base_and_as_ft_yield_distinct_generations( | ||
| 33 | - self, backend: DummyDifferentialBackend | ||
| 34 | - ) -> None: | ||
| 35 | - with backend.as_base() as b: | ||
| 36 | - assert b.generate("hi", max_new_tokens=5) == "hello" | ||
| 37 | - with backend.as_finetuned() as f: | ||
| 38 | - assert f.generate("hi", max_new_tokens=5) == "greetings, traveler" | ||
| 39 | - | ||
| 40 | - def test_logprob_differs_between_modes(self, backend: DummyDifferentialBackend) -> None: | ||
| 41 | - with backend.as_base() as b: | ||
| 42 | - base_score = b.logprob_of("q", "a") | ||
| 43 | - with backend.as_finetuned() as f: | ||
| 44 | - ft_score = f.logprob_of("q", "a") | ||
| 45 | - assert base_score == -3.0 | ||
| 46 | - assert ft_score == -1.2 | ||
| 47 | - | ||
| 48 | - def test_missing_generation_raises_keyerror(self, backend: DummyDifferentialBackend) -> None: | ||
| 49 | - with backend.as_base() as b, pytest.raises(KeyError, match="no canned generation"): | ||
| 50 | - b.generate("unconfigured", max_new_tokens=1) | ||
| 51 | - | ||
| 52 | - def test_missing_logprob_default(self, backend: DummyDifferentialBackend) -> None: | ||
| 53 | - with backend.as_base() as b: | ||
| 54 | - assert b.logprob_of("nonexistent", "target") == -10.0 | ||
| 55 | - | ||
| 56 | - | ||
| 57 | -class TestRollingLogprob: | ||
| 58 | - def test_synthesized_when_not_preseeded(self, backend: DummyDifferentialBackend) -> None: | ||
| 59 | - with backend.as_base() as b: | ||
| 60 | - r = b.rolling_logprob("a quick brown fox jumps") | ||
| 61 | - assert r.num_tokens == 5 | ||
| 62 | - assert r.logprobs.size == 4 | ||
| 63 | - assert np.all(r.logprobs == -2.0) | ||
| 64 | - | ||
| 65 | - def test_ft_perplexity_lower_than_base(self, backend: DummyDifferentialBackend) -> None: | ||
| 66 | - text = "a quick brown fox" | ||
| 67 | - with backend.as_base() as b: | ||
| 68 | - pb = b.rolling_logprob(text).perplexity | ||
| 69 | - with backend.as_finetuned() as f: | ||
| 70 | - pf = f.rolling_logprob(text).perplexity | ||
| 71 | - assert pf < pb # synthesized ft is less perplexed → lower PPL | ||
| 72 | - | ||
| 73 | - | ||
| 74 | -class TestTokenDist: | ||
| 75 | - def test_dists_differ_between_modes(self, backend: DummyDifferentialBackend) -> None: | ||
| 76 | - with backend.as_base() as b: | ||
| 77 | - base_dist = b.next_token_dist("any prompt") | ||
| 78 | - with backend.as_finetuned() as f: | ||
| 79 | - ft_dist = f.next_token_dist("any prompt") | ||
| 80 | - assert not np.array_equal(base_dist.logprobs, ft_dist.logprobs) | ||
| 81 | - | ||
| 82 | - | ||
| 83 | -class TestInvariants: | ||
| 84 | - def test_protocol_satisfaction(self, backend: DummyDifferentialBackend) -> None: | ||
| 85 | - assert isinstance(backend, DifferentialBackend) | ||
| 86 | - with backend.as_base() as view: | ||
| 87 | - assert isinstance(view, Model) | ||
| 88 | - assert isinstance(view, ScoringBackend) | ||
| 89 | - | ||
| 90 | - def test_nested_views_rejected(self, backend: DummyDifferentialBackend) -> None: | ||
| 91 | - with backend.as_base(), pytest.raises(RuntimeError, match="view already active"): | ||
| 92 | - with backend.as_finetuned(): | ||
| 93 | - pass | ||
| 94 | - | ||
| 95 | - def test_sequential_views_fine(self, backend: DummyDifferentialBackend) -> None: | ||
| 96 | - # Must be able to re-enter after exiting — common pattern in probes. | ||
| 97 | - with backend.as_base() as b: | ||
| 98 | - b.logprob_of("q", "a") | ||
| 99 | - with backend.as_finetuned() as f: | ||
| 100 | - f.logprob_of("q", "a") | ||
| 101 | - with backend.as_base() as b: | ||
| 102 | - b.logprob_of("q", "a") | ||
sway/tests/unit/test_backend_registry.pydeleted@@ -1,133 +0,0 @@ | |||
| 1 | -"""Tests for the backend registry in ``dlm_sway.backends``. | ||
| 2 | - | ||
| 3 | -The registry is the single place that maps a ModelSpec to a concrete | ||
| 4 | -backend. These tests check the error paths — actually materializing an | ||
| 5 | -HF backend requires model weights and is covered by the integration | ||
| 6 | -suite. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -from pathlib import Path | ||
| 12 | - | ||
| 13 | -import pytest | ||
| 14 | - | ||
| 15 | -from dlm_sway.backends import build | ||
| 16 | -from dlm_sway.core.errors import BackendNotAvailableError, SpecValidationError | ||
| 17 | -from dlm_sway.core.model import ModelSpec | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -class TestRegistry: | ||
| 21 | - def test_dummy_rejected_via_build(self) -> None: | ||
| 22 | - with pytest.raises(SpecValidationError, match="kind='dummy'"): | ||
| 23 | - build(ModelSpec(base="x", kind="dummy")) | ||
| 24 | - | ||
| 25 | - def test_hf_requires_adapter(self) -> None: | ||
| 26 | - with pytest.raises(SpecValidationError, match="adapter"): | ||
| 27 | - build(ModelSpec(base="x", kind="hf")) | ||
| 28 | - | ||
| 29 | - def test_mlx_requires_adapter(self) -> None: | ||
| 30 | - with pytest.raises(SpecValidationError, match="adapter"): | ||
| 31 | - build(ModelSpec(base="x", kind="mlx")) | ||
| 32 | - | ||
| 33 | - def test_mlx_dispatch_raises_when_mlx_missing(self) -> None: | ||
| 34 | - # On non-Apple-Silicon (or Apple without mlx installed), constructing | ||
| 35 | - # the MLX backend raises BackendNotAvailableError with a pip hint. | ||
| 36 | - # We skip this assertion if mlx happens to be installed. | ||
| 37 | - import importlib.util | ||
| 38 | - | ||
| 39 | - if importlib.util.find_spec("mlx") is not None: | ||
| 40 | - pytest.skip("mlx is installed; error path not exercised") | ||
| 41 | - with pytest.raises(BackendNotAvailableError) as exc_info: | ||
| 42 | - build(ModelSpec(base="x", kind="mlx", adapter=Path("/tmp/a"))) | ||
| 43 | - assert exc_info.value.backend == "mlx" | ||
| 44 | - | ||
| 45 | - def test_custom_requires_entry_point(self) -> None: | ||
| 46 | - with pytest.raises(SpecValidationError, match="entry_point"): | ||
| 47 | - build(ModelSpec(base="x", kind="custom", adapter=Path("/tmp/a"))) | ||
| 48 | - | ||
| 49 | - def test_custom_validates_entry_point_shape(self) -> None: | ||
| 50 | - with pytest.raises(SpecValidationError, match="pkg.module:ClassName"): | ||
| 51 | - build( | ||
| 52 | - ModelSpec( | ||
| 53 | - base="x", | ||
| 54 | - kind="custom", | ||
| 55 | - entry_point="not_a_valid_entry_point", | ||
| 56 | - adapter=Path("/tmp/a"), | ||
| 57 | - ) | ||
| 58 | - ) | ||
| 59 | - | ||
| 60 | - def test_custom_rejects_unimportable_module(self) -> None: | ||
| 61 | - with pytest.raises(SpecValidationError, match="cannot import"): | ||
| 62 | - build( | ||
| 63 | - ModelSpec( | ||
| 64 | - base="x", | ||
| 65 | - kind="custom", | ||
| 66 | - entry_point="nonexistent_pkg_xyz:Backend", | ||
| 67 | - adapter=Path("/tmp/a"), | ||
| 68 | - ) | ||
| 69 | - ) | ||
| 70 | - | ||
| 71 | - def test_custom_rejects_missing_class(self) -> None: | ||
| 72 | - with pytest.raises(SpecValidationError, match="has no attribute"): | ||
| 73 | - build( | ||
| 74 | - ModelSpec( | ||
| 75 | - base="x", | ||
| 76 | - kind="custom", | ||
| 77 | - entry_point="dlm_sway:NoSuchClass", | ||
| 78 | - adapter=Path("/tmp/a"), | ||
| 79 | - ) | ||
| 80 | - ) | ||
| 81 | - | ||
| 82 | - def test_custom_rejects_non_differential_class(self) -> None: | ||
| 83 | - # A class that accepts the canonical constructor args but doesn't | ||
| 84 | - # implement the protocol. | ||
| 85 | - import sys | ||
| 86 | - import types | ||
| 87 | - | ||
| 88 | - class _Bad: | ||
| 89 | - def __init__(self, base_spec, adapter_path): # type: ignore[no-untyped-def] | ||
| 90 | - del base_spec, adapter_path | ||
| 91 | - | ||
| 92 | - mod = types.ModuleType("_sway_bad_mod") | ||
| 93 | - mod.Bad = _Bad # type: ignore[attr-defined] | ||
| 94 | - sys.modules["_sway_bad_mod"] = mod | ||
| 95 | - | ||
| 96 | - with pytest.raises(SpecValidationError, match="DifferentialBackend"): | ||
| 97 | - build( | ||
| 98 | - ModelSpec( | ||
| 99 | - base="x", | ||
| 100 | - kind="custom", | ||
| 101 | - entry_point="_sway_bad_mod:Bad", | ||
| 102 | - adapter=Path("/tmp/a"), | ||
| 103 | - ) | ||
| 104 | - ) | ||
| 105 | - | ||
| 106 | - def test_custom_dispatches_to_valid_backend(self) -> None: | ||
| 107 | - # Use the dummy backend via a custom entry point. The dummy class's | ||
| 108 | - # __init__ takes different args, so we write a thin adapter class. | ||
| 109 | - from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 110 | - | ||
| 111 | - class _AdapterBackend(DummyDifferentialBackend): | ||
| 112 | - def __init__(self, base_spec, adapter_path): # type: ignore[no-untyped-def] | ||
| 113 | - super().__init__(base=DummyResponses(), ft=DummyResponses()) | ||
| 114 | - | ||
| 115 | - # Register on a throwaway module we can find by name. | ||
| 116 | - import sys | ||
| 117 | - import types | ||
| 118 | - | ||
| 119 | - mod = types.ModuleType("_sway_custom_test_mod") | ||
| 120 | - mod.AdapterBackend = _AdapterBackend # type: ignore[attr-defined] | ||
| 121 | - sys.modules["_sway_custom_test_mod"] = mod | ||
| 122 | - | ||
| 123 | - backend = build( | ||
| 124 | - ModelSpec( | ||
| 125 | - base="x", | ||
| 126 | - kind="custom", | ||
| 127 | - entry_point="_sway_custom_test_mod:AdapterBackend", | ||
| 128 | - adapter=Path("/tmp/a"), | ||
| 129 | - ) | ||
| 130 | - ) | ||
| 131 | - from dlm_sway.core.scoring import DifferentialBackend | ||
| 132 | - | ||
| 133 | - assert isinstance(backend, DifferentialBackend) | ||
sway/tests/unit/test_cli.pydeleted@@ -1,92 +0,0 @@ | |||
| 1 | -"""Smoke tests for the dlm-sway CLI. | ||
| 2 | - | ||
| 3 | -We avoid exercising backends (they need real models) and instead test | ||
| 4 | -arg parsing, error paths, and the read-only commands (``doctor``, | ||
| 5 | -``report``, and the help surface). | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -import json | ||
| 11 | -from pathlib import Path | ||
| 12 | - | ||
| 13 | -from typer.testing import CliRunner | ||
| 14 | - | ||
| 15 | -from dlm_sway.cli.app import app | ||
| 16 | - | ||
| 17 | - | ||
| 18 | -def test_version_exits_zero() -> None: | ||
| 19 | - result = CliRunner().invoke(app, ["--version"]) | ||
| 20 | - assert result.exit_code == 0 | ||
| 21 | - assert "dlm-sway" in result.stdout | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def test_help_lists_all_commands() -> None: | ||
| 25 | - result = CliRunner().invoke(app, ["--help"]) | ||
| 26 | - assert result.exit_code == 0 | ||
| 27 | - for cmd in ("run", "gate", "check", "diff", "autogen", "doctor", "report"): | ||
| 28 | - assert cmd in result.stdout | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -def test_doctor_runs(capsys) -> None: # type: ignore[no-untyped-def] | ||
| 32 | - result = CliRunner().invoke(app, ["doctor"]) | ||
| 33 | - assert result.exit_code == 0 | ||
| 34 | - # Rich applies color codes by default; assert the bare product name appears. | ||
| 35 | - assert "dlm-sway" in result.stdout | ||
| 36 | - assert "backends" in result.stdout | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def test_run_without_file_errors(tmp_path: Path) -> None: | ||
| 40 | - missing = tmp_path / "nope.yaml" | ||
| 41 | - result = CliRunner().invoke(app, ["run", str(missing)]) | ||
| 42 | - # Exit code 2 = SwayError bubble-up; 1 = typer missing-arg; accept either. | ||
| 43 | - assert result.exit_code != 0 | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -def test_report_from_json(tmp_path: Path) -> None: | ||
| 47 | - sample = { | ||
| 48 | - "schema_version": 1, | ||
| 49 | - "sway_version": "0.1.0.dev0", | ||
| 50 | - "base_model_id": "base", | ||
| 51 | - "adapter_id": "adp", | ||
| 52 | - "score": {"overall": 0.7, "band": "healthy", "components": {}, "findings": []}, | ||
| 53 | - "probes": [ | ||
| 54 | - { | ||
| 55 | - "name": "p1", | ||
| 56 | - "kind": "delta_kl", | ||
| 57 | - "verdict": "pass", | ||
| 58 | - "score": 0.7, | ||
| 59 | - "message": "ok", | ||
| 60 | - }, | ||
| 61 | - ], | ||
| 62 | - } | ||
| 63 | - path = tmp_path / "result.json" | ||
| 64 | - path.write_text(json.dumps(sample), encoding="utf-8") | ||
| 65 | - | ||
| 66 | - terminal = CliRunner().invoke(app, ["report", str(path)]) | ||
| 67 | - assert terminal.exit_code == 0 | ||
| 68 | - assert "p1" in terminal.stdout | ||
| 69 | - | ||
| 70 | - md = CliRunner().invoke(app, ["report", str(path), "--format", "md"]) | ||
| 71 | - assert md.exit_code == 0 | ||
| 72 | - assert "dlm-sway report" in md.stdout | ||
| 73 | - | ||
| 74 | - junit = CliRunner().invoke(app, ["report", str(path), "--format", "junit"]) | ||
| 75 | - assert junit.exit_code == 0 | ||
| 76 | - assert "<testsuite" in junit.stdout | ||
| 77 | - | ||
| 78 | - | ||
| 79 | -def test_autogen_without_dlm_extra_exits_nonzero(tmp_path: Path, monkeypatch) -> None: # type: ignore[no-untyped-def] | ||
| 80 | - # Force the import path to fail so the CLI prints the extra hint. | ||
| 81 | - import builtins | ||
| 82 | - | ||
| 83 | - real_import = builtins.__import__ | ||
| 84 | - | ||
| 85 | - def fake_import(name: str, *args: object, **kwargs: object): # type: ignore[no-untyped-def] | ||
| 86 | - if name.startswith("dlm_sway.integrations.dlm"): | ||
| 87 | - raise ImportError("simulated missing extra") | ||
| 88 | - return real_import(name, *args, **kwargs) # type: ignore[no-untyped-call] | ||
| 89 | - | ||
| 90 | - monkeypatch.setattr(builtins, "__import__", fake_import) | ||
| 91 | - result = CliRunner().invoke(app, ["autogen", "any.dlm"]) | ||
| 92 | - assert result.exit_code != 0 | ||
sway/tests/unit/test_determinism.pydeleted@@ -1,47 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.determinism`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import os | ||
| 6 | -import random | ||
| 7 | - | ||
| 8 | -import numpy as np | ||
| 9 | - | ||
| 10 | -from dlm_sway.core.determinism import DeterminismSummary, seed_everything | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -class TestSeedEverything: | ||
| 14 | - def test_returns_summary(self) -> None: | ||
| 15 | - summary = seed_everything(0) | ||
| 16 | - assert isinstance(summary, DeterminismSummary) | ||
| 17 | - assert summary.seed == 0 | ||
| 18 | - assert summary.class_ in {"strict", "best_effort", "loose"} | ||
| 19 | - | ||
| 20 | - def test_idempotent_for_stdlib_random(self) -> None: | ||
| 21 | - seed_everything(42) | ||
| 22 | - a = [random.random() for _ in range(5)] | ||
| 23 | - seed_everything(42) | ||
| 24 | - b = [random.random() for _ in range(5)] | ||
| 25 | - assert a == b | ||
| 26 | - | ||
| 27 | - def test_idempotent_for_numpy(self) -> None: | ||
| 28 | - seed_everything(17) | ||
| 29 | - a = np.random.rand(5) | ||
| 30 | - seed_everything(17) | ||
| 31 | - b = np.random.rand(5) | ||
| 32 | - np.testing.assert_array_equal(a, b) | ||
| 33 | - | ||
| 34 | - def test_cublas_workspace_set_under_strict(self) -> None: | ||
| 35 | - os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) | ||
| 36 | - seed_everything(0, strict=True) | ||
| 37 | - assert os.environ.get("CUBLAS_WORKSPACE_CONFIG") == ":4096:8" | ||
| 38 | - | ||
| 39 | - def test_non_strict_does_not_set_cublas(self) -> None: | ||
| 40 | - os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) | ||
| 41 | - seed_everything(0, strict=False) | ||
| 42 | - # Non-strict mode must not leak the env var in either direction; | ||
| 43 | - # the host environment's prior value wins. | ||
| 44 | - assert ( | ||
| 45 | - "CUBLAS_WORKSPACE_CONFIG" not in os.environ | ||
| 46 | - or os.environ["CUBLAS_WORKSPACE_CONFIG"] != ":4096:8" | ||
| 47 | - ) | ||
sway/tests/unit/test_divergence.pydeleted@@ -1,73 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes._divergence`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import math | ||
| 6 | - | ||
| 7 | -import numpy as np | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.scoring import TokenDist | ||
| 10 | -from dlm_sway.probes._divergence import aligned_probs, divergence, js, kl | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -def _dist(ids: list[int], probs: list[float], vocab: int = 100) -> TokenDist: | ||
| 14 | - return TokenDist( | ||
| 15 | - token_ids=np.asarray(ids, dtype=np.int64), | ||
| 16 | - logprobs=np.log(np.asarray(probs, dtype=np.float32)), | ||
| 17 | - vocab_size=vocab, | ||
| 18 | - ) | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -class TestAligned: | ||
| 22 | - def test_identical_distributions(self) -> None: | ||
| 23 | - d = _dist([1, 2, 3], [0.5, 0.3, 0.2]) | ||
| 24 | - p, q = aligned_probs(d, d) | ||
| 25 | - np.testing.assert_allclose(p, q) | ||
| 26 | - | ||
| 27 | - def test_union_support_fills_missing(self) -> None: | ||
| 28 | - base = _dist([1, 2, 3], [0.5, 0.3, 0.2]) | ||
| 29 | - ft = _dist([2, 3, 4], [0.4, 0.4, 0.2]) | ||
| 30 | - p, q = aligned_probs(base, ft) | ||
| 31 | - assert p.shape == (4,) | ||
| 32 | - assert abs(p.sum() - 1.0) < 1e-9 | ||
| 33 | - assert abs(q.sum() - 1.0) < 1e-9 | ||
| 34 | - | ||
| 35 | - | ||
| 36 | -class TestKL: | ||
| 37 | - def test_zero_when_equal(self) -> None: | ||
| 38 | - p = np.array([0.5, 0.3, 0.2]) | ||
| 39 | - assert kl(p, p) == 0.0 | ||
| 40 | - | ||
| 41 | - def test_positive_when_different(self) -> None: | ||
| 42 | - p = np.array([0.7, 0.2, 0.1]) | ||
| 43 | - q = np.array([0.2, 0.3, 0.5]) | ||
| 44 | - assert kl(p, q) > 0.0 | ||
| 45 | - | ||
| 46 | - | ||
| 47 | -class TestJS: | ||
| 48 | - def test_zero_when_equal(self) -> None: | ||
| 49 | - p = np.array([0.5, 0.3, 0.2]) | ||
| 50 | - assert js(p, p) == 0.0 | ||
| 51 | - | ||
| 52 | - def test_symmetric(self) -> None: | ||
| 53 | - p = np.array([0.7, 0.2, 0.1]) | ||
| 54 | - q = np.array([0.2, 0.3, 0.5]) | ||
| 55 | - assert math.isclose(js(p, q), js(q, p), rel_tol=1e-9) | ||
| 56 | - | ||
| 57 | - def test_bounded_by_ln2(self) -> None: | ||
| 58 | - p = np.array([1.0, 0.0]) | ||
| 59 | - q = np.array([0.0, 1.0]) | ||
| 60 | - # With zeros handled as 0·log0 = 0 this approaches ln(2). | ||
| 61 | - assert js(p, q) <= math.log(2.0) + 1e-9 | ||
| 62 | - | ||
| 63 | - | ||
| 64 | -class TestDivergenceDispatch: | ||
| 65 | - def test_default_is_js(self) -> None: | ||
| 66 | - d1 = _dist([1, 2], [0.6, 0.4]) | ||
| 67 | - d2 = _dist([1, 2], [0.3, 0.7]) | ||
| 68 | - assert divergence(d1, d2) == divergence(d1, d2, kind="js") | ||
| 69 | - | ||
| 70 | - def test_kl_available(self) -> None: | ||
| 71 | - d1 = _dist([1, 2], [0.6, 0.4]) | ||
| 72 | - d2 = _dist([1, 2], [0.3, 0.7]) | ||
| 73 | - assert divergence(d1, d2, kind="kl") >= 0.0 | ||
sway/tests/unit/test_dlm_bridge.pydeleted@@ -1,252 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.integrations.dlm`. | ||
| 2 | - | ||
| 3 | -The bridge imports ``dlm.*`` modules lazily. We mock those via | ||
| 4 | -``sys.modules`` injection so the tests run without the ``dlm-sway[dlm]`` | ||
| 5 | -extra installed. A full end-to-end integration test against a real | ||
| 6 | -``.dlm`` lives under ``tests/integration/``. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import sys | ||
| 12 | -import types | ||
| 13 | -from dataclasses import dataclass | ||
| 14 | -from pathlib import Path | ||
| 15 | - | ||
| 16 | -import pytest | ||
| 17 | -import yaml | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -@pytest.fixture | ||
| 21 | -def fake_dlm(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path: | ||
| 22 | - """Install a fake ``dlm`` package so the resolver can import.""" | ||
| 23 | - | ||
| 24 | - # Build synthetic parsed .dlm structure. | ||
| 25 | - @dataclass | ||
| 26 | - class _Frontmatter: | ||
| 27 | - dlm_id: str = "01TESTULID" | ||
| 28 | - base_model: str = "smollm2-135m" | ||
| 29 | - | ||
| 30 | - @dataclass | ||
| 31 | - class _Section: | ||
| 32 | - section_id: str | ||
| 33 | - type: str | ||
| 34 | - content: str | ||
| 35 | - tag: str | None = None | ||
| 36 | - | ||
| 37 | - @dataclass | ||
| 38 | - class _Parsed: | ||
| 39 | - frontmatter: _Frontmatter | ||
| 40 | - sections: tuple[_Section, ...] | ||
| 41 | - | ||
| 42 | - def _parse_file(_path: Path): # type: ignore[no-untyped-def] | ||
| 43 | - return _Parsed( | ||
| 44 | - frontmatter=_Frontmatter(), | ||
| 45 | - sections=( | ||
| 46 | - _Section( | ||
| 47 | - section_id="prose-1", | ||
| 48 | - type="PROSE", | ||
| 49 | - content="This is a prose section with some information. Further detail follows.", | ||
| 50 | - ), | ||
| 51 | - _Section( | ||
| 52 | - section_id="instr-1", | ||
| 53 | - type="INSTRUCTION", | ||
| 54 | - content="### Q\nWhat is X?\n\n### A\nX is a concept\n", | ||
| 55 | - ), | ||
| 56 | - _Section( | ||
| 57 | - section_id="pref-1", | ||
| 58 | - type="PREFERENCE", | ||
| 59 | - content="chosen/rejected triple", | ||
| 60 | - ), | ||
| 61 | - ), | ||
| 62 | - ) | ||
| 63 | - | ||
| 64 | - # Fake ``dlm.doc.parser`` module. | ||
| 65 | - dlm_pkg = types.ModuleType("dlm") | ||
| 66 | - dlm_doc = types.ModuleType("dlm.doc") | ||
| 67 | - dlm_doc_parser = types.ModuleType("dlm.doc.parser") | ||
| 68 | - dlm_doc_parser.parse_file = _parse_file # type: ignore[attr-defined] | ||
| 69 | - | ||
| 70 | - # Fake ``dlm.store.paths`` that returns a resolvable path. | ||
| 71 | - dlm_store = types.ModuleType("dlm.store") | ||
| 72 | - dlm_store_paths = types.ModuleType("dlm.store.paths") | ||
| 73 | - | ||
| 74 | - adapter_dir = tmp_path / "adapter_v1" | ||
| 75 | - adapter_dir.mkdir() | ||
| 76 | - (adapter_dir / "adapter_config.json").write_text("{}", encoding="utf-8") | ||
| 77 | - | ||
| 78 | - class _StorePath: | ||
| 79 | - def __init__(self, path: Path) -> None: | ||
| 80 | - self._p = path | ||
| 81 | - | ||
| 82 | - def resolve_current_adapter(self) -> Path: | ||
| 83 | - return self._p | ||
| 84 | - | ||
| 85 | - def _for_dlm(_dlm_id: str) -> _StorePath: | ||
| 86 | - return _StorePath(adapter_dir) | ||
| 87 | - | ||
| 88 | - dlm_store_paths.StorePath = _StorePath # type: ignore[attr-defined] | ||
| 89 | - dlm_store_paths.for_dlm = _for_dlm # type: ignore[attr-defined] | ||
| 90 | - | ||
| 91 | - # Fake base-model resolver — returns a stub with an ``hf_id`` attribute. | ||
| 92 | - dlm_base = types.ModuleType("dlm.base_models") | ||
| 93 | - | ||
| 94 | - @dataclass | ||
| 95 | - class _BaseSpec: | ||
| 96 | - hf_id: str | ||
| 97 | - key: str | ||
| 98 | - | ||
| 99 | - def _resolve(key: str) -> _BaseSpec: | ||
| 100 | - return _BaseSpec(hf_id="HuggingFaceTB/SmolLM2-135M-Instruct", key=key) | ||
| 101 | - | ||
| 102 | - dlm_base.resolve = _resolve # type: ignore[attr-defined] | ||
| 103 | - | ||
| 104 | - # Fake instruction / preference parsers. | ||
| 105 | - dlm_data = types.ModuleType("dlm.data") | ||
| 106 | - dlm_data_instr = types.ModuleType("dlm.data.instruction_parser") | ||
| 107 | - dlm_data_pref = types.ModuleType("dlm.data.preference_parser") | ||
| 108 | - | ||
| 109 | - @dataclass | ||
| 110 | - class _QAPair: | ||
| 111 | - question: str | ||
| 112 | - answer: str | ||
| 113 | - | ||
| 114 | - @dataclass | ||
| 115 | - class _Triple: | ||
| 116 | - prompt: str | ||
| 117 | - chosen: str | ||
| 118 | - rejected: str | ||
| 119 | - | ||
| 120 | - def _parse_instr(body: str, *, section_id: str) -> list[_QAPair]: | ||
| 121 | - del section_id | ||
| 122 | - out: list[_QAPair] = [] | ||
| 123 | - parts = body.split("### Q") | ||
| 124 | - for part in parts[1:]: | ||
| 125 | - q_block, _, a_block = part.partition("### A") | ||
| 126 | - q = q_block.strip() | ||
| 127 | - a = a_block.strip() | ||
| 128 | - if q and a: | ||
| 129 | - out.append(_QAPair(question=q, answer=a)) | ||
| 130 | - return out | ||
| 131 | - | ||
| 132 | - def _parse_pref(body: str, *, section_id: str) -> list[_Triple]: | ||
| 133 | - del body, section_id | ||
| 134 | - return [_Triple(prompt="Which?", chosen="good answer", rejected="bad answer")] | ||
| 135 | - | ||
| 136 | - dlm_data_instr.parse_instruction_body = _parse_instr # type: ignore[attr-defined] | ||
| 137 | - dlm_data_pref.parse_preference_body = _parse_pref # type: ignore[attr-defined] | ||
| 138 | - | ||
| 139 | - monkeypatch.setitem(sys.modules, "dlm", dlm_pkg) | ||
| 140 | - monkeypatch.setitem(sys.modules, "dlm.doc", dlm_doc) | ||
| 141 | - monkeypatch.setitem(sys.modules, "dlm.doc.parser", dlm_doc_parser) | ||
| 142 | - monkeypatch.setitem(sys.modules, "dlm.store", dlm_store) | ||
| 143 | - monkeypatch.setitem(sys.modules, "dlm.store.paths", dlm_store_paths) | ||
| 144 | - monkeypatch.setitem(sys.modules, "dlm.base_models", dlm_base) | ||
| 145 | - monkeypatch.setitem(sys.modules, "dlm.data", dlm_data) | ||
| 146 | - monkeypatch.setitem(sys.modules, "dlm.data.instruction_parser", dlm_data_instr) | ||
| 147 | - monkeypatch.setitem(sys.modules, "dlm.data.preference_parser", dlm_data_pref) | ||
| 148 | - | ||
| 149 | - # Return a path to a fake .dlm file (the parser won't actually read it). | ||
| 150 | - dlm_file = tmp_path / "doc.dlm" | ||
| 151 | - dlm_file.write_text("---\ndlm_id: 01TEST\n---\n\nbody\n", encoding="utf-8") | ||
| 152 | - return dlm_file | ||
| 153 | - | ||
| 154 | - | ||
| 155 | -def test_resolve_dlm_maps_sections(fake_dlm: Path) -> None: | ||
| 156 | - from dlm_sway.integrations.dlm.resolver import resolve_dlm | ||
| 157 | - | ||
| 158 | - handle = resolve_dlm(fake_dlm) | ||
| 159 | - assert handle.dlm_id == "01TESTULID" | ||
| 160 | - assert handle.base_model == "HuggingFaceTB/SmolLM2-135M-Instruct" | ||
| 161 | - assert handle.adapter_path is not None | ||
| 162 | - assert handle.adapter_path.exists() | ||
| 163 | - assert len(handle.sections) == 3 | ||
| 164 | - # Kinds normalized from uppercase dlm enum values. | ||
| 165 | - assert {s.kind for s in handle.sections} == {"prose", "instruction", "preference"} | ||
| 166 | - # Instruction Q/A pair survived the translation. | ||
| 167 | - instr = next(s for s in handle.sections if s.kind == "instruction") | ||
| 168 | - assert instr.probes | ||
| 169 | - assert instr.probes[0].prompt == "What is X?" | ||
| 170 | - # Preference triple too. | ||
| 171 | - pref = next(s for s in handle.sections if s.kind == "preference") | ||
| 172 | - assert pref.preferences | ||
| 173 | - assert pref.preferences[0].chosen == "good answer" | ||
| 174 | - | ||
| 175 | - | ||
| 176 | -def test_resolve_without_dlm_installed(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: | ||
| 177 | - """resolve_dlm surfaces a SwayError when the dlm package is missing.""" | ||
| 178 | - # Wipe any cached dlm modules so the lazy import fails. | ||
| 179 | - for mod in list(sys.modules): | ||
| 180 | - if mod == "dlm" or mod.startswith("dlm."): | ||
| 181 | - monkeypatch.delitem(sys.modules, mod, raising=False) | ||
| 182 | - | ||
| 183 | - import builtins | ||
| 184 | - | ||
| 185 | - real_import = builtins.__import__ | ||
| 186 | - | ||
| 187 | - def fake_import(name: str, *args, **kwargs): # type: ignore[no-untyped-def] | ||
| 188 | - if name.startswith("dlm."): | ||
| 189 | - raise ImportError("missing extra") | ||
| 190 | - return real_import(name, *args, **kwargs) | ||
| 191 | - | ||
| 192 | - monkeypatch.setattr(builtins, "__import__", fake_import) | ||
| 193 | - | ||
| 194 | - from dlm_sway.core.errors import SwayError | ||
| 195 | - from dlm_sway.integrations.dlm.resolver import resolve_dlm | ||
| 196 | - | ||
| 197 | - with pytest.raises(SwayError, match="dlm package not installed"): | ||
| 198 | - resolve_dlm(tmp_path / "doc.dlm") | ||
| 199 | - | ||
| 200 | - | ||
| 201 | -def test_autogen_writes_complete_suite(fake_dlm: Path, tmp_path: Path) -> None: | ||
| 202 | - from dlm_sway.integrations.dlm.autogen import write_sway_yaml | ||
| 203 | - | ||
| 204 | - out = tmp_path / "sway.yaml" | ||
| 205 | - write_sway_yaml(fake_dlm, out) | ||
| 206 | - data = yaml.safe_load(out.read_text(encoding="utf-8")) | ||
| 207 | - | ||
| 208 | - assert data["version"] == 1 | ||
| 209 | - assert data["models"]["base"]["base"] == "HuggingFaceTB/SmolLM2-135M-Instruct" | ||
| 210 | - assert data["models"]["ft"]["adapter"] is not None | ||
| 211 | - assert data["dlm_source"] == str(fake_dlm.resolve()) | ||
| 212 | - | ||
| 213 | - kinds = {entry["kind"] for entry in data["suite"]} | ||
| 214 | - # The full 11-primitive battery minus nothing is present (some may | ||
| 215 | - # be skipped when data is absent, but here we have one of every | ||
| 216 | - # section type). | ||
| 217 | - expected = { | ||
| 218 | - "null_adapter", | ||
| 219 | - "delta_kl", | ||
| 220 | - "adapter_revert", | ||
| 221 | - "prompt_collapse", | ||
| 222 | - "section_internalization", | ||
| 223 | - "paraphrase_invariance", | ||
| 224 | - "preference_flip", | ||
| 225 | - "style_fingerprint", | ||
| 226 | - "calibration_drift", | ||
| 227 | - "leakage", | ||
| 228 | - "adapter_ablation", | ||
| 229 | - } | ||
| 230 | - assert expected <= kinds, f"missing: {expected - kinds}" | ||
| 231 | - | ||
| 232 | - | ||
| 233 | -def test_build_spec_dict_skips_preference_when_absent() -> None: | ||
| 234 | - from dlm_sway.core.sections import Section | ||
| 235 | - from dlm_sway.integrations.dlm.autogen import build_spec_dict | ||
| 236 | - from dlm_sway.integrations.dlm.resolver import DlmHandle | ||
| 237 | - | ||
| 238 | - sections = ( | ||
| 239 | - Section(id="a", kind="prose", content="A prose section. Second sentence."), | ||
| 240 | - Section(id="b", kind="prose", content="Another prose section."), | ||
| 241 | - ) | ||
| 242 | - handle = DlmHandle( | ||
| 243 | - dlm_id="x", | ||
| 244 | - base_model="base", | ||
| 245 | - adapter_path=Path("/tmp/adapter"), | ||
| 246 | - sections=sections, | ||
| 247 | - doc_text="whole document", | ||
| 248 | - ) | ||
| 249 | - spec = build_spec_dict(handle) | ||
| 250 | - kinds = {entry["kind"] for entry in spec["suite"]} | ||
| 251 | - assert "preference_flip" not in kinds | ||
| 252 | - assert "section_internalization" in kinds | ||
sway/tests/unit/test_errors.pydeleted@@ -1,55 +0,0 @@ | |||
| 1 | -"""Tests for the exception hierarchy.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import pytest | ||
| 6 | - | ||
| 7 | -from dlm_sway.core.errors import ( | ||
| 8 | - BackendNotAvailableError, | ||
| 9 | - ProbeError, | ||
| 10 | - SpecValidationError, | ||
| 11 | - SwayError, | ||
| 12 | -) | ||
| 13 | - | ||
| 14 | - | ||
| 15 | -class TestSwayError: | ||
| 16 | - def test_is_root_exception(self) -> None: | ||
| 17 | - assert issubclass(SpecValidationError, SwayError) | ||
| 18 | - assert issubclass(BackendNotAvailableError, SwayError) | ||
| 19 | - assert issubclass(ProbeError, SwayError) | ||
| 20 | - | ||
| 21 | - def test_raised_and_caught_as_sway_error(self) -> None: | ||
| 22 | - with pytest.raises(SwayError): | ||
| 23 | - raise ProbeError("delta_kl", "shape mismatch") | ||
| 24 | - | ||
| 25 | - | ||
| 26 | -class TestSpecValidationError: | ||
| 27 | - def test_format_without_source(self) -> None: | ||
| 28 | - err = SpecValidationError("unknown key 'topp'") | ||
| 29 | - assert str(err) == "unknown key 'topp'" | ||
| 30 | - assert err.source is None | ||
| 31 | - | ||
| 32 | - def test_format_with_source(self) -> None: | ||
| 33 | - err = SpecValidationError("unknown key 'topp'", source="sway.yaml") | ||
| 34 | - assert str(err) == "sway.yaml: unknown key 'topp'" | ||
| 35 | - assert err.source == "sway.yaml" | ||
| 36 | - | ||
| 37 | - | ||
| 38 | -class TestBackendNotAvailableError: | ||
| 39 | - def test_hint_rendered_in_message(self) -> None: | ||
| 40 | - err = BackendNotAvailableError("hf", extra="hf") | ||
| 41 | - assert "pip install 'dlm-sway[hf]'" in str(err) | ||
| 42 | - assert err.backend == "hf" | ||
| 43 | - assert err.extra == "hf" | ||
| 44 | - | ||
| 45 | - def test_appends_optional_hint(self) -> None: | ||
| 46 | - err = BackendNotAvailableError("mlx", extra="mlx", hint="Apple Silicon only.") | ||
| 47 | - assert "Apple Silicon only." in str(err) | ||
| 48 | - | ||
| 49 | - | ||
| 50 | -class TestProbeError: | ||
| 51 | - def test_includes_probe_name(self) -> None: | ||
| 52 | - err = ProbeError("delta_kl", "NaN logits") | ||
| 53 | - assert "delta_kl" in str(err) | ||
| 54 | - assert "NaN logits" in str(err) | ||
| 55 | - assert err.probe == "delta_kl" | ||
sway/tests/unit/test_model.pydeleted@@ -1,78 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.model`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from pathlib import Path | ||
| 6 | - | ||
| 7 | -import pytest | ||
| 8 | -from pydantic import ValidationError | ||
| 9 | - | ||
| 10 | -from dlm_sway.core.model import LoadedModel, Model, ModelSpec | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -class TestModelSpec: | ||
| 14 | - def test_defaults(self) -> None: | ||
| 15 | - spec = ModelSpec(base="HuggingFaceTB/SmolLM2-135M-Instruct") | ||
| 16 | - assert spec.kind == "hf" | ||
| 17 | - assert spec.adapter is None | ||
| 18 | - assert spec.dtype == "auto" | ||
| 19 | - assert spec.device == "auto" | ||
| 20 | - assert spec.trust_remote_code is False | ||
| 21 | - assert spec.entry_point is None | ||
| 22 | - | ||
| 23 | - def test_frozen(self) -> None: | ||
| 24 | - spec = ModelSpec(base="x") | ||
| 25 | - with pytest.raises(ValidationError): | ||
| 26 | - spec.base = "y" # type: ignore[misc] | ||
| 27 | - | ||
| 28 | - def test_extra_fields_forbidden(self) -> None: | ||
| 29 | - with pytest.raises(ValidationError) as exc_info: | ||
| 30 | - ModelSpec(base="x", bogus="y") # type: ignore[call-arg] | ||
| 31 | - assert "bogus" in str(exc_info.value).lower() | ||
| 32 | - | ||
| 33 | - def test_kind_enum(self) -> None: | ||
| 34 | - ModelSpec(base="x", kind="hf") | ||
| 35 | - ModelSpec(base="x", kind="mlx") | ||
| 36 | - ModelSpec(base="x", kind="dummy") | ||
| 37 | - ModelSpec(base="x", kind="custom", entry_point="pkg.mod:Backend") | ||
| 38 | - with pytest.raises(ValidationError): | ||
| 39 | - ModelSpec(base="x", kind="ollama") # type: ignore[arg-type] | ||
| 40 | - | ||
| 41 | - def test_adapter_coerced_to_path(self) -> None: | ||
| 42 | - spec = ModelSpec(base="x", adapter="/tmp/adapter") # type: ignore[arg-type] | ||
| 43 | - assert isinstance(spec.adapter, Path) | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -class TestLoadedModel: | ||
| 47 | - def test_frozen_dataclass(self) -> None: | ||
| 48 | - loaded = LoadedModel( | ||
| 49 | - id="base", | ||
| 50 | - spec=ModelSpec(base="x"), | ||
| 51 | - model=object(), | ||
| 52 | - tokenizer=object(), | ||
| 53 | - meta={"device": "cpu"}, | ||
| 54 | - ) | ||
| 55 | - assert loaded.id == "base" | ||
| 56 | - assert loaded.meta["device"] == "cpu" | ||
| 57 | - | ||
| 58 | - | ||
| 59 | -class TestModelProtocol: | ||
| 60 | - def test_runtime_checkable(self) -> None: | ||
| 61 | - class FakeModel: | ||
| 62 | - id = "x" | ||
| 63 | - | ||
| 64 | - def generate( | ||
| 65 | - self, | ||
| 66 | - prompt: str, | ||
| 67 | - *, | ||
| 68 | - max_new_tokens: int, | ||
| 69 | - temperature: float = 0.0, | ||
| 70 | - top_p: float = 1.0, | ||
| 71 | - seed: int = 0, | ||
| 72 | - ) -> str: | ||
| 73 | - return f"{prompt}|{max_new_tokens}" | ||
| 74 | - | ||
| 75 | - def close(self) -> None: | ||
| 76 | - return None | ||
| 77 | - | ||
| 78 | - assert isinstance(FakeModel(), Model) | ||
sway/tests/unit/test_null_calibration.pydeleted@@ -1,123 +0,0 @@ | |||
| 1 | -"""Tests for null-adapter calibration. | ||
| 2 | - | ||
| 3 | -Covers: dummy backend ``as_null_adapter`` yields a plausibly noisy | ||
| 4 | -view; ``NullAdapterProbe`` populates ``ctx.null_stats`` in a way | ||
| 5 | -downstream probes pick up end-to-end; missing-capability SKIP path. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -import numpy as np | ||
| 11 | - | ||
| 12 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 13 | -from dlm_sway.core.result import Verdict | ||
| 14 | -from dlm_sway.core.scoring import NullCalibratedBackend | ||
| 15 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 16 | -from dlm_sway.suite.runner import run as run_suite | ||
| 17 | -from dlm_sway.suite.spec import SwaySpec | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -def _diverging_backend() -> DummyDifferentialBackend: | ||
| 21 | - base = DummyResponses() | ||
| 22 | - ft = DummyResponses() | ||
| 23 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 24 | - | ||
| 25 | - | ||
| 26 | -class TestProtocolConformance: | ||
| 27 | - def test_dummy_is_null_calibrated(self) -> None: | ||
| 28 | - assert isinstance(_diverging_backend(), NullCalibratedBackend) | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -class TestAsNullAdapter: | ||
| 32 | - def test_yields_perturbed_view(self) -> None: | ||
| 33 | - backend = _diverging_backend() | ||
| 34 | - with backend.as_base() as base: | ||
| 35 | - base_dist = base.next_token_dist("hello") | ||
| 36 | - with backend.as_null_adapter(seed=0) as null: | ||
| 37 | - null_dist = null.next_token_dist("hello") | ||
| 38 | - # Some perturbation, but bounded. | ||
| 39 | - assert not np.allclose(base_dist.logprobs, null_dist.logprobs) | ||
| 40 | - | ||
| 41 | - def test_different_seeds_yield_different_views(self) -> None: | ||
| 42 | - backend = _diverging_backend() | ||
| 43 | - with backend.as_null_adapter(seed=1) as v1: | ||
| 44 | - d1 = v1.next_token_dist("hello") | ||
| 45 | - with backend.as_null_adapter(seed=2) as v2: | ||
| 46 | - d2 = v2.next_token_dist("hello") | ||
| 47 | - assert not np.allclose(d1.logprobs, d2.logprobs) | ||
| 48 | - | ||
| 49 | - def test_view_exclusion_enforced(self) -> None: | ||
| 50 | - import pytest | ||
| 51 | - | ||
| 52 | - backend = _diverging_backend() | ||
| 53 | - with backend.as_null_adapter(seed=0), pytest.raises(RuntimeError): | ||
| 54 | - with backend.as_base(): | ||
| 55 | - pass | ||
| 56 | - | ||
| 57 | - | ||
| 58 | -class TestProbe: | ||
| 59 | - def test_populates_null_stats(self) -> None: | ||
| 60 | - backend = _diverging_backend() | ||
| 61 | - probe, spec = build_probe( | ||
| 62 | - { | ||
| 63 | - "name": "null", | ||
| 64 | - "kind": "null_adapter", | ||
| 65 | - "runs": 3, | ||
| 66 | - "prompts": ["q1", "q2"], | ||
| 67 | - } | ||
| 68 | - ) | ||
| 69 | - ctx = RunContext(backend=backend) | ||
| 70 | - result = probe.run(spec, ctx) | ||
| 71 | - assert result.verdict == Verdict.PASS | ||
| 72 | - stats = result.evidence["null_stats"] | ||
| 73 | - assert "delta_kl" in stats | ||
| 74 | - assert stats["delta_kl"]["n"] == 3.0 | ||
| 75 | - assert stats["delta_kl"]["std"] > 0.0 # seeded perturbations produce variance | ||
| 76 | - | ||
| 77 | - def test_runner_threads_null_stats_to_subsequent_probes(self) -> None: | ||
| 78 | - """End-to-end: null_adapter first → delta_kl picks up z-score path.""" | ||
| 79 | - backend = _diverging_backend() | ||
| 80 | - raw_spec = SwaySpec.model_validate( | ||
| 81 | - { | ||
| 82 | - "version": 1, | ||
| 83 | - "models": {"base": {"base": "b"}, "ft": {"base": "b", "adapter": "/tmp/a"}}, | ||
| 84 | - "suite": [ | ||
| 85 | - { | ||
| 86 | - "name": "null", | ||
| 87 | - "kind": "null_adapter", | ||
| 88 | - "runs": 3, | ||
| 89 | - "prompts": ["p1", "p2"], | ||
| 90 | - }, | ||
| 91 | - { | ||
| 92 | - "name": "dk", | ||
| 93 | - "kind": "delta_kl", | ||
| 94 | - "prompts": ["p1", "p2"], | ||
| 95 | - "assert_z_gte": -10.0, # permissive so we pass regardless | ||
| 96 | - }, | ||
| 97 | - ], | ||
| 98 | - } | ||
| 99 | - ) | ||
| 100 | - result = run_suite(raw_spec, backend) | ||
| 101 | - assert len(result.probes) == 2 | ||
| 102 | - null_result = result.probes[0] | ||
| 103 | - dk_result = result.probes[1] | ||
| 104 | - assert null_result.verdict == Verdict.PASS | ||
| 105 | - # The delta_kl probe should have computed a z_score because null_stats was present. | ||
| 106 | - assert dk_result.z_score is not None, ( | ||
| 107 | - "delta_kl should have z-scored against null baseline, got " | ||
| 108 | - f"evidence={dk_result.evidence}, message={dk_result.message}" | ||
| 109 | - ) | ||
| 110 | - | ||
| 111 | - def test_skip_when_backend_not_null_calibrated(self) -> None: | ||
| 112 | - class _Bare: | ||
| 113 | - def as_base(self): # noqa: ANN202 | ||
| 114 | - raise NotImplementedError | ||
| 115 | - | ||
| 116 | - def as_finetuned(self): # noqa: ANN202 | ||
| 117 | - raise NotImplementedError | ||
| 118 | - | ||
| 119 | - probe, spec = build_probe({"name": "null", "kind": "null_adapter"}) | ||
| 120 | - ctx = RunContext(backend=_Bare()) # type: ignore[arg-type] | ||
| 121 | - result = probe.run(spec, ctx) | ||
| 122 | - assert result.verdict == Verdict.SKIP | ||
| 123 | - assert "NullCalibratedBackend" in result.message | ||
sway/tests/unit/test_probe_adapter_ablation.pydeleted@@ -1,135 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.adapter_ablation`. | ||
| 2 | - | ||
| 3 | -Uses the dummy backend's lam-interpolation implementation to exercise | ||
| 4 | -the full probe path without loading a real model. | ||
| 5 | -""" | ||
| 6 | - | ||
| 7 | -from __future__ import annotations | ||
| 8 | - | ||
| 9 | -import numpy as np | ||
| 10 | - | ||
| 11 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 12 | -from dlm_sway.core.result import Verdict | ||
| 13 | -from dlm_sway.core.scoring import ScalableDifferentialBackend, TokenDist | ||
| 14 | -from dlm_sway.probes.adapter_ablation import ( | ||
| 15 | - _overshoot, | ||
| 16 | - _r_squared, | ||
| 17 | - _saturation_lambda, | ||
| 18 | -) | ||
| 19 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 20 | - | ||
| 21 | - | ||
| 22 | -class TestShapeMetrics: | ||
| 23 | - def test_r_squared_perfect_linear(self) -> None: | ||
| 24 | - x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64) | ||
| 25 | - y = 2 * x + 0.1 | ||
| 26 | - assert _r_squared(x, y) > 0.99 | ||
| 27 | - | ||
| 28 | - def test_r_squared_zero_slope_defined(self) -> None: | ||
| 29 | - x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64) | ||
| 30 | - y = np.zeros_like(x) | ||
| 31 | - # Flat y → ss_tot = 0 → defined as 1.0 (perfect fit). | ||
| 32 | - assert _r_squared(x, y) == 1.0 | ||
| 33 | - | ||
| 34 | - def test_saturation_lambda_expected(self) -> None: | ||
| 35 | - lambdas = np.asarray([0.0, 0.25, 0.5, 0.75, 1.0], dtype=np.float64) | ||
| 36 | - divs = np.asarray([0.0, 0.5, 0.8, 0.95, 1.0], dtype=np.float64) | ||
| 37 | - sat = _saturation_lambda(lambdas, divs) | ||
| 38 | - assert sat == 0.75 # 0.95 / 1.0 = 0.95 ≥ 0.9 | ||
| 39 | - | ||
| 40 | - def test_overshoot_recovered(self) -> None: | ||
| 41 | - lambdas = np.asarray([0.0, 0.5, 1.0, 1.25], dtype=np.float64) | ||
| 42 | - divs = np.asarray([0.0, 0.5, 1.0, 1.15], dtype=np.float64) | ||
| 43 | - assert _overshoot(lambdas, divs) == 1.15 | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -def _diverging_backend() -> DummyDifferentialBackend: | ||
| 47 | - """Backend where base ≠ ft at a few prompts; distributions interpolate | ||
| 48 | - smoothly under lam-blending in DummyDifferentialBackend.as_scaled_adapter.""" | ||
| 49 | - base = DummyResponses( | ||
| 50 | - token_dists={ | ||
| 51 | - "q1": TokenDist( | ||
| 52 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 53 | - logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)), | ||
| 54 | - vocab_size=100, | ||
| 55 | - ), | ||
| 56 | - "q2": TokenDist( | ||
| 57 | - token_ids=np.array([5, 6], dtype=np.int64), | ||
| 58 | - logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)), | ||
| 59 | - vocab_size=100, | ||
| 60 | - ), | ||
| 61 | - } | ||
| 62 | - ) | ||
| 63 | - ft = DummyResponses( | ||
| 64 | - token_dists={ | ||
| 65 | - "q1": TokenDist( | ||
| 66 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 67 | - logprobs=np.log(np.array([0.2, 0.4, 0.4], dtype=np.float32)), | ||
| 68 | - vocab_size=100, | ||
| 69 | - ), | ||
| 70 | - "q2": TokenDist( | ||
| 71 | - token_ids=np.array([5, 6], dtype=np.int64), | ||
| 72 | - logprobs=np.log(np.array([0.3, 0.7], dtype=np.float32)), | ||
| 73 | - vocab_size=100, | ||
| 74 | - ), | ||
| 75 | - } | ||
| 76 | - ) | ||
| 77 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 78 | - | ||
| 79 | - | ||
| 80 | -class TestProbe: | ||
| 81 | - def test_backend_implements_scalable_protocol(self) -> None: | ||
| 82 | - backend = _diverging_backend() | ||
| 83 | - assert isinstance(backend, ScalableDifferentialBackend) | ||
| 84 | - | ||
| 85 | - def test_probe_runs_and_emits_shape_metrics(self) -> None: | ||
| 86 | - probe, spec = build_probe( | ||
| 87 | - { | ||
| 88 | - "name": "abl", | ||
| 89 | - "kind": "adapter_ablation", | ||
| 90 | - "prompts": ["q1", "q2"], | ||
| 91 | - "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25], | ||
| 92 | - # Very permissive to tolerate the log-space blend of a | ||
| 93 | - # tiny synthetic fixture. | ||
| 94 | - "assert_linearity_gte": 0.3, | ||
| 95 | - "assert_overshoot_gte": 1.0, | ||
| 96 | - } | ||
| 97 | - ) | ||
| 98 | - ctx = RunContext(backend=_diverging_backend()) | ||
| 99 | - result = probe.run(spec, ctx) | ||
| 100 | - assert result.verdict in (Verdict.PASS, Verdict.FAIL) | ||
| 101 | - assert "lambdas" in result.evidence | ||
| 102 | - assert "mean_divergence_per_lambda" in result.evidence | ||
| 103 | - assert len(result.evidence["mean_divergence_per_lambda"]) == 6 | ||
| 104 | - # Divergence should increase as λ grows from 0 toward ft. | ||
| 105 | - divs = result.evidence["mean_divergence_per_lambda"] | ||
| 106 | - # λ=0 → 0 divergence from itself. λ>0 should be non-decreasing | ||
| 107 | - # for the bulk of the curve. | ||
| 108 | - assert divs[-2] >= divs[0] | ||
| 109 | - | ||
| 110 | - def test_skip_when_backend_not_scalable(self) -> None: | ||
| 111 | - class _NonScalable: | ||
| 112 | - def as_base(self): # noqa: ANN202 | ||
| 113 | - raise NotImplementedError | ||
| 114 | - | ||
| 115 | - def as_finetuned(self): # noqa: ANN202 | ||
| 116 | - raise NotImplementedError | ||
| 117 | - | ||
| 118 | - probe, spec = build_probe( | ||
| 119 | - { | ||
| 120 | - "name": "abl", | ||
| 121 | - "kind": "adapter_ablation", | ||
| 122 | - "prompts": ["q1"], | ||
| 123 | - } | ||
| 124 | - ) | ||
| 125 | - ctx = RunContext(backend=_NonScalable()) # type: ignore[arg-type] | ||
| 126 | - result = probe.run(spec, ctx) | ||
| 127 | - assert result.verdict == Verdict.SKIP | ||
| 128 | - assert "ScalableDifferentialBackend" in result.message | ||
| 129 | - | ||
| 130 | - def test_error_on_empty_prompts(self) -> None: | ||
| 131 | - backend = _diverging_backend() | ||
| 132 | - probe, spec = build_probe({"name": "abl", "kind": "adapter_ablation", "prompts": []}) | ||
| 133 | - ctx = RunContext(backend=backend) | ||
| 134 | - result = probe.run(spec, ctx) | ||
| 135 | - assert result.verdict == Verdict.ERROR | ||
sway/tests/unit/test_probe_adapter_revert.pydeleted@@ -1,170 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.adapter_revert`. | ||
| 2 | - | ||
| 3 | -We stub out the embedder so these tests don't need sentence-transformers | ||
| 4 | -installed. The ``probe.py`` SKIP path for the missing-extra case is | ||
| 5 | -covered separately by monkeypatching the importer. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -from typing import Any | ||
| 11 | - | ||
| 12 | -import numpy as np | ||
| 13 | -import pytest | ||
| 14 | - | ||
| 15 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 16 | -from dlm_sway.core.result import Verdict | ||
| 17 | -from dlm_sway.probes.adapter_revert import AdapterRevertProbe | ||
| 18 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -def _backend(*, ft_like_base: bool = False) -> DummyDifferentialBackend: | ||
| 22 | - base = DummyResponses( | ||
| 23 | - generations={ | ||
| 24 | - "pp1": "cats are mammals", | ||
| 25 | - "pp2": "cats have fur", | ||
| 26 | - } | ||
| 27 | - ) | ||
| 28 | - if ft_like_base: | ||
| 29 | - ft_gens = dict(base.generations) | ||
| 30 | - else: | ||
| 31 | - ft_gens = { | ||
| 32 | - "pp1": "dolphins are mammals", | ||
| 33 | - "pp2": "dolphins are smart", | ||
| 34 | - } | ||
| 35 | - ft = DummyResponses(generations=ft_gens) | ||
| 36 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def _stub_embedder(text_to_vec: dict[str, np.ndarray]): # type: ignore[no-untyped-def] | ||
| 40 | - def _encode(texts: list[str]): # type: ignore[no-untyped-def] | ||
| 41 | - return np.stack([text_to_vec[t] for t in texts]) | ||
| 42 | - | ||
| 43 | - return _encode | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -@pytest.fixture | ||
| 47 | -def monkeyed_embed(monkeypatch: pytest.MonkeyPatch) -> dict[str, np.ndarray]: | ||
| 48 | - """Install a stub embedder with a controllable text→vec mapping. | ||
| 49 | - | ||
| 50 | - Tests populate the dict before calling ``probe.run()``. | ||
| 51 | - """ | ||
| 52 | - table: dict[str, np.ndarray] = {} | ||
| 53 | - monkeypatch.setattr( | ||
| 54 | - "dlm_sway.probes.adapter_revert._load_embedder", | ||
| 55 | - lambda _model_id: _stub_embedder(table), # type: ignore[arg-type] | ||
| 56 | - ) | ||
| 57 | - return table | ||
| 58 | - | ||
| 59 | - | ||
| 60 | -class TestAdapterRevert: | ||
| 61 | - def test_healthy_adapter_passes(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | ||
| 62 | - # gold and ft-outputs cluster together, base outputs cluster elsewhere. | ||
| 63 | - monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0]) | ||
| 64 | - monkeyed_embed["cats have fur"] = np.array([1.0, 0.0]) | ||
| 65 | - monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0]) | ||
| 66 | - monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0]) | ||
| 67 | - monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0]) # gold | ||
| 68 | - | ||
| 69 | - probe, spec = build_probe( | ||
| 70 | - { | ||
| 71 | - "name": "rev", | ||
| 72 | - "kind": "adapter_revert", | ||
| 73 | - "cases": [ | ||
| 74 | - { | ||
| 75 | - "prompt": "anything", | ||
| 76 | - "gold": "the answer is dolphins", | ||
| 77 | - "paraphrases": ["pp1", "pp2"], | ||
| 78 | - } | ||
| 79 | - ], | ||
| 80 | - "assert_revert_rate_lt": 0.25, | ||
| 81 | - } | ||
| 82 | - ) | ||
| 83 | - ctx = RunContext(backend=_backend(ft_like_base=False)) | ||
| 84 | - result = probe.run(spec, ctx) | ||
| 85 | - assert result.verdict == Verdict.PASS | ||
| 86 | - assert result.raw == 0.0 | ||
| 87 | - | ||
| 88 | - def test_reverting_adapter_fails(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | ||
| 89 | - # ft matches base (reverted), diverges from gold. | ||
| 90 | - monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0]) | ||
| 91 | - monkeyed_embed["cats have fur"] = np.array([1.0, 0.0]) | ||
| 92 | - monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0]) # gold | ||
| 93 | - | ||
| 94 | - probe, spec = build_probe( | ||
| 95 | - { | ||
| 96 | - "name": "rev", | ||
| 97 | - "kind": "adapter_revert", | ||
| 98 | - "cases": [ | ||
| 99 | - { | ||
| 100 | - "prompt": "anything", | ||
| 101 | - "gold": "the answer is dolphins", | ||
| 102 | - "paraphrases": ["pp1", "pp2"], | ||
| 103 | - } | ||
| 104 | - ], | ||
| 105 | - } | ||
| 106 | - ) | ||
| 107 | - ctx = RunContext(backend=_backend(ft_like_base=True)) | ||
| 108 | - result = probe.run(spec, ctx) | ||
| 109 | - assert result.verdict == Verdict.FAIL | ||
| 110 | - assert result.raw == 1.0 # 100% revert | ||
| 111 | - | ||
| 112 | - def test_trivially_similar_cases_dropped(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | ||
| 113 | - # base and gold are identical → drop. | ||
| 114 | - v = np.array([1.0, 0.0]) | ||
| 115 | - monkeyed_embed["cats are mammals"] = v | ||
| 116 | - monkeyed_embed["cats have fur"] = v | ||
| 117 | - monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0]) | ||
| 118 | - monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0]) | ||
| 119 | - monkeyed_embed["cats are mammals too"] = v # gold — matches base | ||
| 120 | - | ||
| 121 | - probe, spec = build_probe( | ||
| 122 | - { | ||
| 123 | - "name": "rev", | ||
| 124 | - "kind": "adapter_revert", | ||
| 125 | - "cases": [ | ||
| 126 | - { | ||
| 127 | - "prompt": "anything", | ||
| 128 | - "gold": "cats are mammals too", | ||
| 129 | - "paraphrases": ["pp1", "pp2"], | ||
| 130 | - } | ||
| 131 | - ], | ||
| 132 | - } | ||
| 133 | - ) | ||
| 134 | - ctx = RunContext(backend=_backend(ft_like_base=False)) | ||
| 135 | - result = probe.run(spec, ctx) | ||
| 136 | - # Both paraphrase pairs trivially similar → WARN (no separable signal). | ||
| 137 | - assert result.verdict == Verdict.WARN | ||
| 138 | - assert result.evidence["dropped_trivial"] == 2 | ||
| 139 | - | ||
| 140 | - def test_no_cases_errors(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | ||
| 141 | - probe, spec = build_probe({"name": "rev", "kind": "adapter_revert", "cases": []}) | ||
| 142 | - ctx = RunContext(backend=_backend()) | ||
| 143 | - result = probe.run(spec, ctx) | ||
| 144 | - assert result.verdict == Verdict.ERROR | ||
| 145 | - | ||
| 146 | - | ||
| 147 | -class TestMissingSemsim: | ||
| 148 | - def test_skip_when_sentence_transformers_missing(self, monkeypatch: pytest.MonkeyPatch) -> None: | ||
| 149 | - from dlm_sway.core.errors import BackendNotAvailableError | ||
| 150 | - | ||
| 151 | - def raiser(_model_id: Any) -> Any: # type: ignore[no-untyped-def] | ||
| 152 | - raise BackendNotAvailableError( | ||
| 153 | - "adapter_revert", | ||
| 154 | - extra="semsim", | ||
| 155 | - hint="adapter_revert relies on sentence embeddings.", | ||
| 156 | - ) | ||
| 157 | - | ||
| 158 | - monkeypatch.setattr( | ||
| 159 | - "dlm_sway.probes.adapter_revert._load_embedder", | ||
| 160 | - raiser, # type: ignore[arg-type] | ||
| 161 | - ) | ||
| 162 | - probe = AdapterRevertProbe() | ||
| 163 | - spec = probe.spec_cls( | ||
| 164 | - name="rev", | ||
| 165 | - cases=[{"prompt": "x", "gold": "y", "paraphrases": ["pp1"]}], # type: ignore[list-item] | ||
| 166 | - ) | ||
| 167 | - ctx = RunContext(backend=_backend()) | ||
| 168 | - result = probe.run(spec, ctx) | ||
| 169 | - assert result.verdict == Verdict.SKIP | ||
| 170 | - assert "semsim" in result.message | ||
sway/tests/unit/test_probe_base.pydeleted@@ -1,69 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.base`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from typing import Literal | ||
| 6 | - | ||
| 7 | -import pytest | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.errors import SpecValidationError | ||
| 10 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 11 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, build_probe, registry | ||
| 12 | - | ||
| 13 | - | ||
| 14 | -class _DummySpec(ProbeSpec): | ||
| 15 | - kind: Literal["__test_dummy"] = "__test_dummy" | ||
| 16 | - payload: str = "x" | ||
| 17 | - | ||
| 18 | - | ||
| 19 | -class _DummyProbe(Probe): | ||
| 20 | - kind = "__test_dummy" | ||
| 21 | - spec_cls = _DummySpec | ||
| 22 | - category = "adherence" | ||
| 23 | - | ||
| 24 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 25 | - assert isinstance(spec, _DummySpec) | ||
| 26 | - return ProbeResult( | ||
| 27 | - name=spec.name, | ||
| 28 | - kind=spec.kind, | ||
| 29 | - verdict=Verdict.PASS, | ||
| 30 | - score=1.0, | ||
| 31 | - message=spec.payload, | ||
| 32 | - ) | ||
| 33 | - | ||
| 34 | - | ||
| 35 | -class TestRegistry: | ||
| 36 | - def test_autoregister(self) -> None: | ||
| 37 | - assert "__test_dummy" in registry() | ||
| 38 | - assert registry()["__test_dummy"] is _DummyProbe | ||
| 39 | - | ||
| 40 | - def test_duplicate_kind_rejected(self) -> None: | ||
| 41 | - with pytest.raises(ValueError, match="duplicate probe kind"): | ||
| 42 | - | ||
| 43 | - class _Clash(Probe): | ||
| 44 | - kind = "__test_dummy" | ||
| 45 | - spec_cls = _DummySpec | ||
| 46 | - | ||
| 47 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 48 | - raise NotImplementedError | ||
| 49 | - | ||
| 50 | - | ||
| 51 | -class TestBuildProbe: | ||
| 52 | - def test_valid_entry(self) -> None: | ||
| 53 | - probe, spec = build_probe({"name": "t", "kind": "__test_dummy", "payload": "hi"}) | ||
| 54 | - assert isinstance(probe, _DummyProbe) | ||
| 55 | - assert isinstance(spec, _DummySpec) | ||
| 56 | - assert spec.payload == "hi" | ||
| 57 | - | ||
| 58 | - def test_unknown_kind(self) -> None: | ||
| 59 | - with pytest.raises(SpecValidationError, match="unknown probe kind"): | ||
| 60 | - build_probe({"name": "t", "kind": "no_such_kind"}) | ||
| 61 | - | ||
| 62 | - def test_missing_kind(self) -> None: | ||
| 63 | - with pytest.raises(SpecValidationError, match="missing string 'kind'"): | ||
| 64 | - build_probe({"name": "t"}) | ||
| 65 | - | ||
| 66 | - def test_extra_field_forbidden(self) -> None: | ||
| 67 | - with pytest.raises(SpecValidationError) as exc_info: | ||
| 68 | - build_probe({"name": "t", "kind": "__test_dummy", "bogus": "y"}) | ||
| 69 | - assert "bogus" in str(exc_info.value).lower() | ||
sway/tests/unit/test_probe_calibration_drift.pydeleted@@ -1,57 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.calibration_drift`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 6 | -from dlm_sway.core.result import Verdict | ||
| 7 | -from dlm_sway.probes._calibration_pack import BUILT_IN_PACK | ||
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 9 | - | ||
| 10 | - | ||
| 11 | -def _backend(delta_per_token: float) -> DummyDifferentialBackend: | ||
| 12 | - """Apply a uniform per-token logprob delta across every item.""" | ||
| 13 | - base_lp: dict[tuple[str, str], float] = {} | ||
| 14 | - ft_lp: dict[tuple[str, str], float] = {} | ||
| 15 | - for prompt, gold in BUILT_IN_PACK: | ||
| 16 | - base_lp[(prompt, gold)] = -5.0 * max(len(gold) // 4, 1) | ||
| 17 | - ft_lp[(prompt, gold)] = base_lp[(prompt, gold)] + delta_per_token * max(len(gold) // 4, 1) | ||
| 18 | - return DummyDifferentialBackend( | ||
| 19 | - base=DummyResponses(logprobs=base_lp), | ||
| 20 | - ft=DummyResponses(logprobs=ft_lp), | ||
| 21 | - ) | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -class TestCalibrationDrift: | ||
| 25 | - def test_healthy_when_no_regression(self) -> None: | ||
| 26 | - backend = _backend(delta_per_token=0.0) # no drift | ||
| 27 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | ||
| 28 | - ctx = RunContext(backend=backend) | ||
| 29 | - result = probe.run(spec, ctx) | ||
| 30 | - assert result.verdict == Verdict.PASS | ||
| 31 | - assert result.raw == 0.0 # zero fraction regressed | ||
| 32 | - | ||
| 33 | - def test_fail_on_uniform_large_regression(self) -> None: | ||
| 34 | - backend = _backend(delta_per_token=-2.0) # every item regresses | ||
| 35 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | ||
| 36 | - ctx = RunContext(backend=backend) | ||
| 37 | - result = probe.run(spec, ctx) | ||
| 38 | - assert result.verdict == Verdict.FAIL | ||
| 39 | - assert result.raw == 1.0 | ||
| 40 | - | ||
| 41 | - def test_respects_items_limit(self) -> None: | ||
| 42 | - backend = _backend(delta_per_token=0.0) | ||
| 43 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift", "items_limit": 5}) | ||
| 44 | - ctx = RunContext(backend=backend) | ||
| 45 | - result = probe.run(spec, ctx) | ||
| 46 | - assert result.evidence["total_items"] == 5 | ||
| 47 | - | ||
| 48 | - def test_worst_offenders_reported(self) -> None: | ||
| 49 | - backend = _backend(delta_per_token=-2.0) | ||
| 50 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | ||
| 51 | - ctx = RunContext(backend=backend) | ||
| 52 | - result = probe.run(spec, ctx) | ||
| 53 | - worst = result.evidence["worst_offenders"] | ||
| 54 | - assert len(worst) <= 5 | ||
| 55 | - # Each worst-offender record carries prompt/gold/delta fields. | ||
| 56 | - if worst: | ||
| 57 | - assert {"prompt", "gold", "delta"} <= set(worst[0].keys()) | ||
sway/tests/unit/test_probe_delta_kl.pydeleted@@ -1,124 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.delta_kl`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import numpy as np | ||
| 6 | - | ||
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 8 | -from dlm_sway.core.result import Verdict | ||
| 9 | -from dlm_sway.core.scoring import TokenDist | ||
| 10 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -def _diverging_backend() -> DummyDifferentialBackend: | ||
| 14 | - """Base peaks tightly on token 1; ft is broad uniform. Real divergence.""" | ||
| 15 | - base = DummyResponses( | ||
| 16 | - token_dists={ | ||
| 17 | - "q1": TokenDist( | ||
| 18 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 19 | - logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)), | ||
| 20 | - vocab_size=100, | ||
| 21 | - ), | ||
| 22 | - "q2": TokenDist( | ||
| 23 | - token_ids=np.array([5, 6], dtype=np.int64), | ||
| 24 | - logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)), | ||
| 25 | - vocab_size=100, | ||
| 26 | - ), | ||
| 27 | - } | ||
| 28 | - ) | ||
| 29 | - ft = DummyResponses( | ||
| 30 | - token_dists={ | ||
| 31 | - "q1": TokenDist( | ||
| 32 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 33 | - logprobs=np.log(np.array([0.3, 0.35, 0.35], dtype=np.float32)), | ||
| 34 | - vocab_size=100, | ||
| 35 | - ), | ||
| 36 | - "q2": TokenDist( | ||
| 37 | - token_ids=np.array([5, 6], dtype=np.int64), | ||
| 38 | - logprobs=np.log(np.array([0.4, 0.6], dtype=np.float32)), | ||
| 39 | - vocab_size=100, | ||
| 40 | - ), | ||
| 41 | - } | ||
| 42 | - ) | ||
| 43 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -def _identical_backend() -> DummyDifferentialBackend: | ||
| 47 | - dist = TokenDist( | ||
| 48 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 49 | - logprobs=np.log(np.array([0.5, 0.3, 0.2], dtype=np.float32)), | ||
| 50 | - vocab_size=100, | ||
| 51 | - ) | ||
| 52 | - base = DummyResponses(token_dists={"q1": dist}) | ||
| 53 | - ft = DummyResponses(token_dists={"q1": dist}) | ||
| 54 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 55 | - | ||
| 56 | - | ||
| 57 | -class TestDeltaKL: | ||
| 58 | - def test_passes_when_distributions_diverge(self) -> None: | ||
| 59 | - probe, spec = build_probe( | ||
| 60 | - { | ||
| 61 | - "name": "dk", | ||
| 62 | - "kind": "delta_kl", | ||
| 63 | - "prompts": ["q1", "q2"], | ||
| 64 | - "assert_mean_gte": 0.01, | ||
| 65 | - } | ||
| 66 | - ) | ||
| 67 | - ctx = RunContext(backend=_diverging_backend()) | ||
| 68 | - result = probe.run(spec, ctx) | ||
| 69 | - assert result.verdict == Verdict.PASS | ||
| 70 | - assert result.raw is not None | ||
| 71 | - assert result.raw > 0.01 | ||
| 72 | - assert result.evidence["num_prompts"] == 2 | ||
| 73 | - assert len(result.evidence["per_prompt"]) == 2 | ||
| 74 | - | ||
| 75 | - def test_fails_when_distributions_identical(self) -> None: | ||
| 76 | - probe, spec = build_probe( | ||
| 77 | - { | ||
| 78 | - "name": "dk", | ||
| 79 | - "kind": "delta_kl", | ||
| 80 | - "prompts": ["q1"], | ||
| 81 | - "assert_mean_gte": 0.01, | ||
| 82 | - } | ||
| 83 | - ) | ||
| 84 | - ctx = RunContext(backend=_identical_backend()) | ||
| 85 | - result = probe.run(spec, ctx) | ||
| 86 | - assert result.verdict == Verdict.FAIL | ||
| 87 | - assert result.raw == 0.0 | ||
| 88 | - | ||
| 89 | - def test_z_score_path_when_null_stats_present(self) -> None: | ||
| 90 | - probe, spec = build_probe( | ||
| 91 | - { | ||
| 92 | - "name": "dk", | ||
| 93 | - "kind": "delta_kl", | ||
| 94 | - "prompts": ["q1"], | ||
| 95 | - "assert_z_gte": 2.0, | ||
| 96 | - } | ||
| 97 | - ) | ||
| 98 | - null_stats = {"delta_kl": {"mean": 0.01, "std": 0.01, "n": 3.0}} | ||
| 99 | - ctx = RunContext(backend=_diverging_backend(), null_stats=null_stats) | ||
| 100 | - result = probe.run(spec, ctx) | ||
| 101 | - assert result.z_score is not None | ||
| 102 | - # Our synthetic ft diverges ~0.1+, far above μ=0.01, σ=0.01 → huge z. | ||
| 103 | - assert result.z_score > 2.0 | ||
| 104 | - assert result.verdict == Verdict.PASS | ||
| 105 | - | ||
| 106 | - def test_error_on_empty_prompts(self) -> None: | ||
| 107 | - probe, spec = build_probe({"name": "dk", "kind": "delta_kl", "prompts": []}) | ||
| 108 | - ctx = RunContext(backend=_identical_backend()) | ||
| 109 | - result = probe.run(spec, ctx) | ||
| 110 | - assert result.verdict == Verdict.ERROR | ||
| 111 | - | ||
| 112 | - def test_kl_kind_available(self) -> None: | ||
| 113 | - probe, spec = build_probe( | ||
| 114 | - { | ||
| 115 | - "name": "dk", | ||
| 116 | - "kind": "delta_kl", | ||
| 117 | - "prompts": ["q1"], | ||
| 118 | - "divergence": "kl", | ||
| 119 | - "assert_mean_gte": 0.0, | ||
| 120 | - } | ||
| 121 | - ) | ||
| 122 | - ctx = RunContext(backend=_diverging_backend()) | ||
| 123 | - result = probe.run(spec, ctx) | ||
| 124 | - assert result.evidence["divergence_kind"] == "kl" | ||
sway/tests/unit/test_probe_leakage.pydeleted@@ -1,109 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.leakage`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 6 | -from dlm_sway.core.result import Verdict | ||
| 7 | -from dlm_sway.core.sections import Section | ||
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 9 | -from dlm_sway.probes.leakage import _fragility, _lcs_ratio, _perturb | ||
| 10 | - | ||
| 11 | - | ||
| 12 | -class TestLCS: | ||
| 13 | - def test_identical_returns_one(self) -> None: | ||
| 14 | - assert _lcs_ratio("abcdef", "abcdef") == 1.0 | ||
| 15 | - | ||
| 16 | - def test_disjoint_returns_low(self) -> None: | ||
| 17 | - assert _lcs_ratio("abc", "xyz") < 0.3 | ||
| 18 | - | ||
| 19 | - def test_empty_returns_zero(self) -> None: | ||
| 20 | - assert _lcs_ratio("", "abc") == 0.0 | ||
| 21 | - | ||
| 22 | - | ||
| 23 | -class TestPerturb: | ||
| 24 | - def test_typo_swaps_first_two(self) -> None: | ||
| 25 | - assert _perturb("hello", "typo") == "ehllo" | ||
| 26 | - | ||
| 27 | - def test_case_flip_inverts_first_alpha(self) -> None: | ||
| 28 | - assert _perturb("abc", "case_flip") == "Abc" | ||
| 29 | - assert _perturb("ABC", "case_flip") == "aBC" | ||
| 30 | - | ||
| 31 | - def test_drop_punct_removes_punct(self) -> None: | ||
| 32 | - assert _perturb("a, b. c!", "drop_punct") == "a b c" | ||
| 33 | - | ||
| 34 | - | ||
| 35 | -class TestFragility: | ||
| 36 | - def test_zero_when_clean_zero(self) -> None: | ||
| 37 | - assert _fragility(0.0, 0.0) == 0.0 | ||
| 38 | - | ||
| 39 | - def test_expected_when_perturbed_dropped(self) -> None: | ||
| 40 | - import pytest | ||
| 41 | - | ||
| 42 | - assert _fragility(0.8, 0.2) == pytest.approx(0.75) | ||
| 43 | - | ||
| 44 | - | ||
| 45 | -def _prose_section(sid: str, content: str) -> Section: | ||
| 46 | - return Section(id=sid, kind="prose", content=content) | ||
| 47 | - | ||
| 48 | - | ||
| 49 | -def _backend(*, ft_recall: float, ft_perturbed_recall: float) -> DummyDifferentialBackend: | ||
| 50 | - """Build a backend whose ft generate() returns a controlled prefix of ``target``. | ||
| 51 | - | ||
| 52 | - The target is "aaa..." (200 chars) so we can measure LCS ratio | ||
| 53 | - against it deterministically. | ||
| 54 | - """ | ||
| 55 | - content = ("The capital of France is Paris. " * 30).strip() | ||
| 56 | - # Generate a fraction of the target to hit the desired recall. | ||
| 57 | - target = content[128 : 128 + 256] | ||
| 58 | - ft_full = target[: int(ft_recall * len(target))] | ||
| 59 | - ft_pert = target[: int(ft_perturbed_recall * len(target))] | ||
| 60 | - | ||
| 61 | - base = DummyResponses() | ||
| 62 | - ft = DummyResponses( | ||
| 63 | - generations={ | ||
| 64 | - content[:128]: ft_full, | ||
| 65 | - # perturbations of the first 128 chars hit these three: | ||
| 66 | - **{_perturb(content[:128], p): ft_pert for p in ("typo", "case_flip", "drop_punct")}, | ||
| 67 | - } | ||
| 68 | - ) | ||
| 69 | - return DummyDifferentialBackend(base=base, ft=ft), content | ||
| 70 | - | ||
| 71 | - | ||
| 72 | -class TestProbe: | ||
| 73 | - def test_skip_without_sections(self) -> None: | ||
| 74 | - backend, _ = _backend(ft_recall=0.0, ft_perturbed_recall=0.0) | ||
| 75 | - probe, spec = build_probe({"name": "c3", "kind": "leakage"}) | ||
| 76 | - ctx = RunContext(backend=backend) | ||
| 77 | - result = probe.run(spec, ctx) | ||
| 78 | - assert result.verdict == Verdict.SKIP | ||
| 79 | - | ||
| 80 | - def test_pass_when_no_leak(self) -> None: | ||
| 81 | - backend, content = _backend(ft_recall=0.0, ft_perturbed_recall=0.0) | ||
| 82 | - probe, spec = build_probe( | ||
| 83 | - { | ||
| 84 | - "name": "c3", | ||
| 85 | - "kind": "leakage", | ||
| 86 | - "prefix_chars": 128, | ||
| 87 | - "continuation_chars": 256, | ||
| 88 | - } | ||
| 89 | - ) | ||
| 90 | - ctx = RunContext(backend=backend, sections=(_prose_section("a", content),)) | ||
| 91 | - result = probe.run(spec, ctx) | ||
| 92 | - assert result.verdict == Verdict.PASS | ||
| 93 | - | ||
| 94 | - def test_fail_when_strong_low_fragility_leak(self) -> None: | ||
| 95 | - backend, content = _backend(ft_recall=0.95, ft_perturbed_recall=0.9) | ||
| 96 | - probe, spec = build_probe( | ||
| 97 | - { | ||
| 98 | - "name": "c3", | ||
| 99 | - "kind": "leakage", | ||
| 100 | - "prefix_chars": 128, | ||
| 101 | - "continuation_chars": 256, | ||
| 102 | - "assert_recall_lt": 0.5, | ||
| 103 | - "min_fragility": 0.3, | ||
| 104 | - } | ||
| 105 | - ) | ||
| 106 | - ctx = RunContext(backend=backend, sections=(_prose_section("a", content),)) | ||
| 107 | - result = probe.run(spec, ctx) | ||
| 108 | - # High recall + low fragility → fail. | ||
| 109 | - assert result.verdict == Verdict.FAIL | ||
sway/tests/unit/test_probe_paraphrase_invariance.pydeleted@@ -1,91 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.paraphrase_invariance`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 6 | -from dlm_sway.core.result import Verdict | ||
| 7 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 8 | - | ||
| 9 | - | ||
| 10 | -def _backend(*, par_lift_fraction: float, verb_lift: float = 10.0) -> DummyDifferentialBackend: | ||
| 11 | - """Return a backend with tunable verbatim/paraphrase lifts. | ||
| 12 | - | ||
| 13 | - The ft view adds ``verb_lift`` nats to the verbatim (Q,A) logprob | ||
| 14 | - and ``par_lift_fraction * verb_lift`` to paraphrase logprobs. | ||
| 15 | - """ | ||
| 16 | - base = DummyResponses( | ||
| 17 | - logprobs={ | ||
| 18 | - ("Q", "A"): -20.0, | ||
| 19 | - ("Q_par1", "A"): -20.0, | ||
| 20 | - ("Q_par2", "A"): -20.0, | ||
| 21 | - } | ||
| 22 | - ) | ||
| 23 | - ft = DummyResponses( | ||
| 24 | - logprobs={ | ||
| 25 | - ("Q", "A"): -20.0 + verb_lift, | ||
| 26 | - ("Q_par1", "A"): -20.0 + par_lift_fraction * verb_lift, | ||
| 27 | - ("Q_par2", "A"): -20.0 + par_lift_fraction * verb_lift, | ||
| 28 | - } | ||
| 29 | - ) | ||
| 30 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -def test_pass_when_generalizing() -> None: | ||
| 34 | - # High paraphrase lift + high verbatim → healthy generalization. | ||
| 35 | - backend = _backend(par_lift_fraction=0.9) | ||
| 36 | - probe, spec = build_probe( | ||
| 37 | - { | ||
| 38 | - "name": "pi", | ||
| 39 | - "kind": "paraphrase_invariance", | ||
| 40 | - "intent": "generalize", | ||
| 41 | - "min_verbatim_lift": 0.05, | ||
| 42 | - "min_generalization_ratio": 0.5, | ||
| 43 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1", "Q_par2"]}], | ||
| 44 | - } | ||
| 45 | - ) | ||
| 46 | - ctx = RunContext(backend=backend) | ||
| 47 | - result = probe.run(spec, ctx) | ||
| 48 | - assert result.verdict == Verdict.PASS | ||
| 49 | - assert result.raw is not None | ||
| 50 | - assert result.raw >= 0.5 | ||
| 51 | - | ||
| 52 | - | ||
| 53 | -def test_fails_when_only_memorized_but_intent_generalize() -> None: | ||
| 54 | - backend = _backend(par_lift_fraction=0.0) | ||
| 55 | - probe, spec = build_probe( | ||
| 56 | - { | ||
| 57 | - "name": "pi", | ||
| 58 | - "kind": "paraphrase_invariance", | ||
| 59 | - "intent": "generalize", | ||
| 60 | - "min_verbatim_lift": 0.05, | ||
| 61 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}], | ||
| 62 | - } | ||
| 63 | - ) | ||
| 64 | - ctx = RunContext(backend=backend) | ||
| 65 | - result = probe.run(spec, ctx) | ||
| 66 | - assert result.verdict == Verdict.FAIL | ||
| 67 | - | ||
| 68 | - | ||
| 69 | -def test_passes_memorize_intent_when_only_memorized() -> None: | ||
| 70 | - backend = _backend(par_lift_fraction=0.0) | ||
| 71 | - probe, spec = build_probe( | ||
| 72 | - { | ||
| 73 | - "name": "pi", | ||
| 74 | - "kind": "paraphrase_invariance", | ||
| 75 | - "intent": "memorize", | ||
| 76 | - "min_verbatim_lift": 0.05, | ||
| 77 | - "max_generalization_ratio_if_memorize": 0.3, | ||
| 78 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}], | ||
| 79 | - } | ||
| 80 | - ) | ||
| 81 | - ctx = RunContext(backend=backend) | ||
| 82 | - result = probe.run(spec, ctx) | ||
| 83 | - assert result.verdict == Verdict.PASS | ||
| 84 | - | ||
| 85 | - | ||
| 86 | -def test_error_on_empty_cases() -> None: | ||
| 87 | - probe, spec = build_probe({"name": "pi", "kind": "paraphrase_invariance", "cases": []}) | ||
| 88 | - backend = _backend(par_lift_fraction=0.9) | ||
| 89 | - ctx = RunContext(backend=backend) | ||
| 90 | - result = probe.run(spec, ctx) | ||
| 91 | - assert result.verdict == Verdict.ERROR | ||
sway/tests/unit/test_probe_preference_flip.pydeleted@@ -1,161 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.preference_flip`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 6 | -from dlm_sway.core.result import Verdict | ||
| 7 | -from dlm_sway.core.sections import Section, SectionPreference | ||
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 9 | - | ||
| 10 | - | ||
| 11 | -def _backend(pairs: list[tuple[str, str, str, float, float]]) -> DummyDifferentialBackend: | ||
| 12 | - """``pairs`` = list of (prompt, chosen, rejected, base_margin, ft_margin). | ||
| 13 | - | ||
| 14 | - We distribute the margin half to the chosen and half (negative) to | ||
| 15 | - the rejected, which is enough to make logprob_of(chosen)-logprob_of(rejected) | ||
| 16 | - equal the requested margin. | ||
| 17 | - """ | ||
| 18 | - base_lp: dict[tuple[str, str], float] = {} | ||
| 19 | - ft_lp: dict[tuple[str, str], float] = {} | ||
| 20 | - for prompt, chosen, rejected, base_m, ft_m in pairs: | ||
| 21 | - base_lp[(prompt, chosen)] = base_m / 2 | ||
| 22 | - base_lp[(prompt, rejected)] = -base_m / 2 | ||
| 23 | - ft_lp[(prompt, chosen)] = ft_m / 2 | ||
| 24 | - ft_lp[(prompt, rejected)] = -ft_m / 2 | ||
| 25 | - return DummyDifferentialBackend( | ||
| 26 | - base=DummyResponses(logprobs=base_lp), | ||
| 27 | - ft=DummyResponses(logprobs=ft_lp), | ||
| 28 | - ) | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -def test_pass_when_base_wrong_flipped() -> None: | ||
| 32 | - backend = _backend( | ||
| 33 | - [ | ||
| 34 | - ("p1", "good1", "bad1", -2.0, 2.0), # base wrong, ft flips | ||
| 35 | - ("p2", "good2", "bad2", -1.5, 1.0), # base wrong, ft flips | ||
| 36 | - ("p3", "good3", "bad3", -0.5, 0.8), # base wrong, ft flips | ||
| 37 | - ("p4", "good4", "bad4", 1.0, 2.0), # base already right (no contribution) | ||
| 38 | - ] | ||
| 39 | - ) | ||
| 40 | - triples = [ | ||
| 41 | - {"prompt": p, "chosen": c, "rejected": r} | ||
| 42 | - for (p, c, r, _, _) in [ | ||
| 43 | - ("p1", "good1", "bad1", 0, 0), | ||
| 44 | - ("p2", "good2", "bad2", 0, 0), | ||
| 45 | - ("p3", "good3", "bad3", 0, 0), | ||
| 46 | - ("p4", "good4", "bad4", 0, 0), | ||
| 47 | - ] | ||
| 48 | - ] | ||
| 49 | - probe, spec = build_probe( | ||
| 50 | - { | ||
| 51 | - "name": "pf", | ||
| 52 | - "kind": "preference_flip", | ||
| 53 | - "triples": triples, | ||
| 54 | - "assert_flip_rate_gte": 0.7, | ||
| 55 | - "min_triples_for_decision": 3, | ||
| 56 | - } | ||
| 57 | - ) | ||
| 58 | - ctx = RunContext(backend=backend) | ||
| 59 | - result = probe.run(spec, ctx) | ||
| 60 | - assert result.verdict == Verdict.PASS | ||
| 61 | - assert result.raw == 1.0 # 3/3 flipped | ||
| 62 | - | ||
| 63 | - | ||
| 64 | -def test_fail_when_base_wrong_not_flipped() -> None: | ||
| 65 | - backend = _backend( | ||
| 66 | - [ | ||
| 67 | - ("p1", "good1", "bad1", -2.0, -1.5), # base wrong, ft still wrong | ||
| 68 | - ("p2", "good2", "bad2", -1.5, -1.0), # base wrong, ft still wrong | ||
| 69 | - ("p3", "good3", "bad3", -0.5, 0.8), # base wrong, ft flips | ||
| 70 | - ] | ||
| 71 | - ) | ||
| 72 | - triples = [ | ||
| 73 | - {"prompt": p, "chosen": c, "rejected": r} | ||
| 74 | - for p, c, r in [ | ||
| 75 | - ("p1", "good1", "bad1"), | ||
| 76 | - ("p2", "good2", "bad2"), | ||
| 77 | - ("p3", "good3", "bad3"), | ||
| 78 | - ] | ||
| 79 | - ] | ||
| 80 | - probe, spec = build_probe( | ||
| 81 | - { | ||
| 82 | - "name": "pf", | ||
| 83 | - "kind": "preference_flip", | ||
| 84 | - "triples": triples, | ||
| 85 | - "assert_flip_rate_gte": 0.7, | ||
| 86 | - "min_triples_for_decision": 3, | ||
| 87 | - } | ||
| 88 | - ) | ||
| 89 | - ctx = RunContext(backend=backend) | ||
| 90 | - result = probe.run(spec, ctx) | ||
| 91 | - assert result.verdict == Verdict.FAIL | ||
| 92 | - assert result.raw is not None | ||
| 93 | - assert result.raw < 0.7 | ||
| 94 | - | ||
| 95 | - | ||
| 96 | -def test_skip_when_no_triples_anywhere() -> None: | ||
| 97 | - probe, spec = build_probe({"name": "pf", "kind": "preference_flip"}) | ||
| 98 | - backend = _backend([]) | ||
| 99 | - ctx = RunContext(backend=backend) | ||
| 100 | - result = probe.run(spec, ctx) | ||
| 101 | - assert result.verdict == Verdict.SKIP | ||
| 102 | - | ||
| 103 | - | ||
| 104 | -def test_warn_when_too_few_base_wrong() -> None: | ||
| 105 | - backend = _backend( | ||
| 106 | - [ | ||
| 107 | - ("p1", "good1", "bad1", 1.0, 2.0), # base right | ||
| 108 | - ("p2", "good2", "bad2", 0.5, 1.0), # base right | ||
| 109 | - ("p3", "good3", "bad3", -0.5, 0.5), # base wrong | ||
| 110 | - ] | ||
| 111 | - ) | ||
| 112 | - triples = [ | ||
| 113 | - {"prompt": p, "chosen": c, "rejected": r} | ||
| 114 | - for p, c, r in [ | ||
| 115 | - ("p1", "good1", "bad1"), | ||
| 116 | - ("p2", "good2", "bad2"), | ||
| 117 | - ("p3", "good3", "bad3"), | ||
| 118 | - ] | ||
| 119 | - ] | ||
| 120 | - probe, spec = build_probe( | ||
| 121 | - { | ||
| 122 | - "name": "pf", | ||
| 123 | - "kind": "preference_flip", | ||
| 124 | - "triples": triples, | ||
| 125 | - "min_triples_for_decision": 3, | ||
| 126 | - } | ||
| 127 | - ) | ||
| 128 | - ctx = RunContext(backend=backend) | ||
| 129 | - result = probe.run(spec, ctx) | ||
| 130 | - assert result.verdict == Verdict.WARN | ||
| 131 | - | ||
| 132 | - | ||
| 133 | -def test_triples_pulled_from_sections() -> None: | ||
| 134 | - pref_section = Section( | ||
| 135 | - id="p1", | ||
| 136 | - kind="preference", | ||
| 137 | - content="...", | ||
| 138 | - preferences=( | ||
| 139 | - SectionPreference(prompt="q1", chosen="good", rejected="bad"), | ||
| 140 | - SectionPreference(prompt="q2", chosen="good2", rejected="bad2"), | ||
| 141 | - SectionPreference(prompt="q3", chosen="good3", rejected="bad3"), | ||
| 142 | - ), | ||
| 143 | - ) | ||
| 144 | - backend = _backend( | ||
| 145 | - [ | ||
| 146 | - ("q1", "good", "bad", -1.0, 1.0), | ||
| 147 | - ("q2", "good2", "bad2", -1.0, 1.0), | ||
| 148 | - ("q3", "good3", "bad3", -1.0, 1.0), | ||
| 149 | - ] | ||
| 150 | - ) | ||
| 151 | - probe, spec = build_probe( | ||
| 152 | - { | ||
| 153 | - "name": "pf", | ||
| 154 | - "kind": "preference_flip", | ||
| 155 | - "assert_flip_rate_gte": 0.7, | ||
| 156 | - "min_triples_for_decision": 3, | ||
| 157 | - } | ||
| 158 | - ) | ||
| 159 | - ctx = RunContext(backend=backend, sections=(pref_section,)) | ||
| 160 | - result = probe.run(spec, ctx) | ||
| 161 | - assert result.verdict == Verdict.PASS | ||
sway/tests/unit/test_probe_prompt_collapse.pydeleted@@ -1,137 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.prompt_collapse`. | ||
| 2 | - | ||
| 3 | -Uses a programmable dummy backend that serves different token dists | ||
| 4 | -depending on whether the prompt contains the stuffing prefix. That's the | ||
| 5 | -cleanest way to simulate "divergence decays with context length" without | ||
| 6 | -a real model. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import numpy as np | ||
| 12 | - | ||
| 13 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 14 | -from dlm_sway.core.result import Verdict | ||
| 15 | -from dlm_sway.core.scoring import TokenDist | ||
| 16 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 17 | -from dlm_sway.probes.prompt_collapse import _fit_half_life | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -class TestFitHalfLife: | ||
| 21 | - def test_exponential_recovered(self) -> None: | ||
| 22 | - lengths = np.array([0.0, 100.0, 200.0, 300.0]) | ||
| 23 | - # y = 1.0 * exp(-x / 100) | ||
| 24 | - y = np.exp(-lengths / 100.0) | ||
| 25 | - h = _fit_half_life(lengths, y) | ||
| 26 | - assert h is not None | ||
| 27 | - import math | ||
| 28 | - | ||
| 29 | - # True half-life = ln(2) * 100 ≈ 69.3 | ||
| 30 | - assert abs(h - math.log(2.0) * 100.0) < 1e-6 | ||
| 31 | - | ||
| 32 | - def test_returns_none_for_flat(self) -> None: | ||
| 33 | - lengths = np.array([0.0, 100.0, 200.0]) | ||
| 34 | - y = np.array([1e-10, 1e-10, 1e-10]) | ||
| 35 | - assert _fit_half_life(lengths, y) is not None or _fit_half_life(lengths, y) is None | ||
| 36 | - # Either None or a huge half-life — both acceptable for flat input. | ||
| 37 | - | ||
| 38 | - def test_returns_none_for_increasing(self) -> None: | ||
| 39 | - lengths = np.array([0.0, 100.0, 200.0]) | ||
| 40 | - y = np.array([0.1, 0.3, 0.5]) | ||
| 41 | - assert _fit_half_life(lengths, y) is None | ||
| 42 | - | ||
| 43 | - | ||
| 44 | -def _programmed_backend(stuffing_sensitivity: float) -> DummyDifferentialBackend: | ||
| 45 | - """Return a backend whose divergence decays with prompt length. | ||
| 46 | - | ||
| 47 | - ``stuffing_sensitivity`` controls how quickly the ft distribution | ||
| 48 | - snaps back to base as prompt length grows; lower = healthier adapter. | ||
| 49 | - """ | ||
| 50 | - import numpy as np | ||
| 51 | - | ||
| 52 | - base_probs = np.array([0.5, 0.3, 0.2], dtype=np.float32) | ||
| 53 | - | ||
| 54 | - class _StuffedResponses(DummyResponses): | ||
| 55 | - def __init__(self, is_ft: bool): | ||
| 56 | - super().__init__() | ||
| 57 | - self._is_ft = is_ft | ||
| 58 | - | ||
| 59 | - # Override retrieval by subclassing the view's lookup path. | ||
| 60 | - | ||
| 61 | - # Simpler: use explicit prompts at each expected length to seed the dict. | ||
| 62 | - # The probe prefixes stuffing so the dummy sees the exact final prompt. | ||
| 63 | - # We pre-build dists for each prompt we expect to see. | ||
| 64 | - base = DummyResponses() | ||
| 65 | - ft = DummyResponses() | ||
| 66 | - | ||
| 67 | - # Pre-generate prompts the probe will query. The probe uses default | ||
| 68 | - # context_lengths=[0,256,512,1024] times _STUFFING ~4 chars/tok. | ||
| 69 | - from dlm_sway.probes.prompt_collapse import _stuffing | ||
| 70 | - | ||
| 71 | - for ctx_len in (0, 256, 512, 1024): | ||
| 72 | - prefix = _stuffing(ctx_len) | ||
| 73 | - for prompt in ("q1",): | ||
| 74 | - key = prefix + prompt | ||
| 75 | - # Base: always tight on token 1. | ||
| 76 | - base.token_dists[key] = TokenDist( | ||
| 77 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 78 | - logprobs=np.log(base_probs), | ||
| 79 | - vocab_size=100, | ||
| 80 | - ) | ||
| 81 | - # FT: diverges at ctx=0, decays toward base with length. | ||
| 82 | - decay = np.exp(-ctx_len * stuffing_sensitivity) | ||
| 83 | - ft_probs = base_probs * (1.0 - decay) + np.array([0.1, 0.45, 0.45]) * decay | ||
| 84 | - ft_probs = ft_probs / ft_probs.sum() | ||
| 85 | - ft.token_dists[key] = TokenDist( | ||
| 86 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 87 | - logprobs=np.log(ft_probs.astype(np.float32)), | ||
| 88 | - vocab_size=100, | ||
| 89 | - ) | ||
| 90 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 91 | - | ||
| 92 | - | ||
| 93 | -class TestPromptCollapse: | ||
| 94 | - def test_healthy_adapter_passes(self) -> None: | ||
| 95 | - probe, spec = build_probe( | ||
| 96 | - { | ||
| 97 | - "name": "pc", | ||
| 98 | - "kind": "prompt_collapse", | ||
| 99 | - "prompts": ["q1"], | ||
| 100 | - "context_lengths": [0, 256, 512, 1024], | ||
| 101 | - "assert_half_life_tokens": 100, | ||
| 102 | - } | ||
| 103 | - ) | ||
| 104 | - ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.001)) | ||
| 105 | - result = probe.run(spec, ctx) | ||
| 106 | - # Half-life should be well above 100 with slow decay. | ||
| 107 | - assert result.verdict == Verdict.PASS | ||
| 108 | - assert result.raw is not None | ||
| 109 | - assert result.raw > 100 | ||
| 110 | - | ||
| 111 | - def test_collapsing_adapter_fails(self) -> None: | ||
| 112 | - probe, spec = build_probe( | ||
| 113 | - { | ||
| 114 | - "name": "pc", | ||
| 115 | - "kind": "prompt_collapse", | ||
| 116 | - "prompts": ["q1"], | ||
| 117 | - "context_lengths": [0, 256, 512, 1024], | ||
| 118 | - "assert_half_life_tokens": 500, | ||
| 119 | - } | ||
| 120 | - ) | ||
| 121 | - ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.02)) | ||
| 122 | - result = probe.run(spec, ctx) | ||
| 123 | - # Fast decay → short half-life → fail against 500-token threshold. | ||
| 124 | - assert result.verdict == Verdict.FAIL | ||
| 125 | - | ||
| 126 | - def test_error_on_empty_prompts(self) -> None: | ||
| 127 | - probe, spec = build_probe( | ||
| 128 | - { | ||
| 129 | - "name": "pc", | ||
| 130 | - "kind": "prompt_collapse", | ||
| 131 | - "prompts": [], | ||
| 132 | - "context_lengths": [0, 256], | ||
| 133 | - } | ||
| 134 | - ) | ||
| 135 | - ctx = RunContext(backend=_programmed_backend(0.001)) | ||
| 136 | - result = probe.run(spec, ctx) | ||
| 137 | - assert result.verdict == Verdict.ERROR | ||
sway/tests/unit/test_probe_section_internalization.pydeleted@@ -1,94 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.section_internalization` (the flagship B1).""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import numpy as np | ||
| 6 | - | ||
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 8 | -from dlm_sway.core.result import Verdict | ||
| 9 | -from dlm_sway.core.scoring import RollingLogprob | ||
| 10 | -from dlm_sway.core.sections import Section, SectionProbe | ||
| 11 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 12 | - | ||
| 13 | - | ||
| 14 | -def _rolling(mean_lp: float, n: int = 10) -> RollingLogprob: | ||
| 15 | - lp = np.full(n - 1, mean_lp, dtype=np.float32) | ||
| 16 | - return RollingLogprob( | ||
| 17 | - token_ids=np.arange(n, dtype=np.int64), | ||
| 18 | - logprobs=lp, | ||
| 19 | - num_tokens=n, | ||
| 20 | - total_logprob=float(lp.sum()), | ||
| 21 | - ) | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def _section(sid: str, kind: str = "prose", content: str = "content", probes=()) -> Section: | ||
| 25 | - return Section(id=sid, kind=kind, content=content, probes=tuple(probes)) # type: ignore[arg-type] | ||
| 26 | - | ||
| 27 | - | ||
| 28 | -def test_skip_without_sections() -> None: | ||
| 29 | - probe, spec = build_probe({"name": "sis", "kind": "section_internalization"}) | ||
| 30 | - backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | ||
| 31 | - ctx = RunContext(backend=backend) | ||
| 32 | - result = probe.run(spec, ctx) | ||
| 33 | - assert result.verdict == Verdict.SKIP | ||
| 34 | - | ||
| 35 | - | ||
| 36 | -def test_skip_with_single_section() -> None: | ||
| 37 | - probe, spec = build_probe({"name": "sis", "kind": "section_internalization"}) | ||
| 38 | - backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | ||
| 39 | - ctx = RunContext(backend=backend, sections=(_section("a"),)) | ||
| 40 | - result = probe.run(spec, ctx) | ||
| 41 | - assert result.verdict == Verdict.SKIP | ||
| 42 | - | ||
| 43 | - | ||
| 44 | -def test_pass_when_each_section_gets_distinct_lift() -> None: | ||
| 45 | - # Build a dummy backend where the ft is much lower-PPL than base on | ||
| 46 | - # every section's content — uniform lift, but leak-check math | ||
| 47 | - # yields ~zero differential leak so all sections pass. | ||
| 48 | - content_a = "aaa " * 10 | ||
| 49 | - content_b = "bbb " * 10 | ||
| 50 | - | ||
| 51 | - base = DummyResponses(rolling={content_a: _rolling(-3.0), content_b: _rolling(-3.0)}) | ||
| 52 | - ft = DummyResponses(rolling={content_a: _rolling(-1.0), content_b: _rolling(-2.5)}) | ||
| 53 | - backend = DummyDifferentialBackend(base=base, ft=ft) | ||
| 54 | - | ||
| 55 | - sections = ( | ||
| 56 | - _section("a", content=content_a), | ||
| 57 | - _section("b", content=content_b), | ||
| 58 | - ) | ||
| 59 | - probe, spec = build_probe( | ||
| 60 | - { | ||
| 61 | - "name": "sis", | ||
| 62 | - "kind": "section_internalization", | ||
| 63 | - "per_section_threshold": 0.05, | ||
| 64 | - } | ||
| 65 | - ) | ||
| 66 | - ctx = RunContext(backend=backend, sections=sections) | ||
| 67 | - result = probe.run(spec, ctx) | ||
| 68 | - assert result.verdict in (Verdict.PASS, Verdict.FAIL) | ||
| 69 | - assert "per_section" in result.evidence | ||
| 70 | - assert len(result.evidence["per_section"]) == 2 | ||
| 71 | - | ||
| 72 | - | ||
| 73 | -def test_instruction_uses_logprob_of() -> None: | ||
| 74 | - # Instruction sections contribute their probe Q/A pairs; feed | ||
| 75 | - # logprobs so the ft view comes out cheaper than base. | ||
| 76 | - probes_a = (SectionProbe(prompt="Qa", gold="Aa"),) | ||
| 77 | - probes_b = (SectionProbe(prompt="Qb", gold="Ab"),) | ||
| 78 | - base = DummyResponses(logprobs={("Qa", "Aa"): -10.0, ("Qb", "Ab"): -10.0}) | ||
| 79 | - ft = DummyResponses(logprobs={("Qa", "Aa"): -3.0, ("Qb", "Ab"): -8.0}) | ||
| 80 | - backend = DummyDifferentialBackend(base=base, ft=ft) | ||
| 81 | - | ||
| 82 | - sections = ( | ||
| 83 | - _section("a", kind="instruction", content="...", probes=probes_a), | ||
| 84 | - _section("b", kind="instruction", content="...", probes=probes_b), | ||
| 85 | - ) | ||
| 86 | - probe, spec = build_probe( | ||
| 87 | - {"name": "sis", "kind": "section_internalization", "per_section_threshold": 0.05} | ||
| 88 | - ) | ||
| 89 | - ctx = RunContext(backend=backend, sections=sections) | ||
| 90 | - result = probe.run(spec, ctx) | ||
| 91 | - per = result.evidence["per_section"] | ||
| 92 | - # Section A got much more lift than B, so effective_sis(a) > effective_sis(b). | ||
| 93 | - sis_by_id = {row["section_id"]: row["effective_sis"] for row in per} | ||
| 94 | - assert sis_by_id["a"] > sis_by_id["b"] | ||
sway/tests/unit/test_probe_style_fingerprint.pydeleted@@ -1,115 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.style_fingerprint`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import numpy as np | ||
| 6 | - | ||
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 8 | -from dlm_sway.core.result import Verdict | ||
| 9 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 10 | -from dlm_sway.probes.style_fingerprint import fingerprint | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -class TestFingerprint: | ||
| 14 | - def test_zero_vector_for_empty(self) -> None: | ||
| 15 | - fp = fingerprint("") | ||
| 16 | - assert fp.shape == (6,) | ||
| 17 | - assert np.allclose(fp, 0.0) | ||
| 18 | - | ||
| 19 | - def test_non_zero_for_normal_text(self) -> None: | ||
| 20 | - fp = fingerprint("This is a sentence. This is another one. A third.") | ||
| 21 | - assert fp.shape == (6,) | ||
| 22 | - assert fp[0] > 0 # mean sentence length | ||
| 23 | - assert fp[2] > 0 # TTR | ||
| 24 | - assert fp[3] > 0 # avg word length | ||
| 25 | - | ||
| 26 | - def test_distinct_styles_distinct_fingerprints(self) -> None: | ||
| 27 | - terse = "Go. Now. Quick." | ||
| 28 | - verbose = ( | ||
| 29 | - "We must, with all deliberate speed and measured consideration, " | ||
| 30 | - "proceed expeditiously towards the elaborated and carefully " | ||
| 31 | - "constructed resolution of the foregoing matter." | ||
| 32 | - ) | ||
| 33 | - assert not np.allclose(fingerprint(terse), fingerprint(verbose)) | ||
| 34 | - | ||
| 35 | - | ||
| 36 | -def _backend_with_samples(base: list[str], ft: list[str]) -> DummyDifferentialBackend: | ||
| 37 | - return DummyDifferentialBackend( | ||
| 38 | - base=DummyResponses(generations={f"p{i}": s for i, s in enumerate(base)}), | ||
| 39 | - ft=DummyResponses(generations={f"p{i}": s for i, s in enumerate(ft)}), | ||
| 40 | - ) | ||
| 41 | - | ||
| 42 | - | ||
| 43 | -class TestProbe: | ||
| 44 | - def test_pass_when_ft_drifts_toward_doc(self) -> None: | ||
| 45 | - base_samples = ["Short. Plain. Words."] * 2 | ||
| 46 | - ft_samples = [ | ||
| 47 | - "Wherein many clauses conjoin themselves, through extended " | ||
| 48 | - "ruminations, unto a meandering whole of considerable length." | ||
| 49 | - ] * 2 | ||
| 50 | - doc = ( | ||
| 51 | - "Wherein many clauses conjoin themselves, through extended " | ||
| 52 | - "ruminations, unto a meandering whole of considerable length. " | ||
| 53 | - "Further elaboration, no less copious, follows apace." | ||
| 54 | - ) | ||
| 55 | - backend = _backend_with_samples(base_samples, ft_samples) | ||
| 56 | - probe, spec = build_probe( | ||
| 57 | - { | ||
| 58 | - "name": "c1", | ||
| 59 | - "kind": "style_fingerprint", | ||
| 60 | - "prompts": ["p0", "p1"], | ||
| 61 | - "doc_reference": doc, | ||
| 62 | - "max_new_tokens": 32, | ||
| 63 | - "assert_shift_gte": 0.2, | ||
| 64 | - } | ||
| 65 | - ) | ||
| 66 | - ctx = RunContext(backend=backend) | ||
| 67 | - result = probe.run(spec, ctx) | ||
| 68 | - assert result.verdict == Verdict.PASS | ||
| 69 | - assert result.raw is not None | ||
| 70 | - assert result.raw > 0.2 | ||
| 71 | - | ||
| 72 | - def test_fail_when_no_stylistic_shift(self) -> None: | ||
| 73 | - base_samples = ["Short. Plain. Words."] * 2 | ||
| 74 | - ft_samples = ["Short. Plain. Words."] * 2 | ||
| 75 | - doc = "Wherein clauses conjoin into meandering wholes of length." | ||
| 76 | - backend = _backend_with_samples(base_samples, ft_samples) | ||
| 77 | - probe, spec = build_probe( | ||
| 78 | - { | ||
| 79 | - "name": "c1", | ||
| 80 | - "kind": "style_fingerprint", | ||
| 81 | - "prompts": ["p0", "p1"], | ||
| 82 | - "doc_reference": doc, | ||
| 83 | - "assert_shift_gte": 0.25, | ||
| 84 | - } | ||
| 85 | - ) | ||
| 86 | - ctx = RunContext(backend=backend) | ||
| 87 | - result = probe.run(spec, ctx) | ||
| 88 | - assert result.verdict == Verdict.FAIL | ||
| 89 | - | ||
| 90 | - def test_skip_without_doc_reference(self) -> None: | ||
| 91 | - backend = _backend_with_samples(["x"], ["y"]) | ||
| 92 | - probe, spec = build_probe( | ||
| 93 | - { | ||
| 94 | - "name": "c1", | ||
| 95 | - "kind": "style_fingerprint", | ||
| 96 | - "prompts": ["p0"], | ||
| 97 | - } | ||
| 98 | - ) | ||
| 99 | - ctx = RunContext(backend=backend) | ||
| 100 | - result = probe.run(spec, ctx) | ||
| 101 | - assert result.verdict == Verdict.SKIP | ||
| 102 | - | ||
| 103 | - def test_error_on_empty_prompts(self) -> None: | ||
| 104 | - backend = _backend_with_samples([], []) | ||
| 105 | - probe, spec = build_probe( | ||
| 106 | - { | ||
| 107 | - "name": "c1", | ||
| 108 | - "kind": "style_fingerprint", | ||
| 109 | - "prompts": [], | ||
| 110 | - "doc_reference": "doc", | ||
| 111 | - } | ||
| 112 | - ) | ||
| 113 | - ctx = RunContext(backend=backend) | ||
| 114 | - result = probe.run(spec, ctx) | ||
| 115 | - assert result.verdict == Verdict.ERROR | ||
sway/tests/unit/test_result.pydeleted@@ -1,82 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.result`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dataclasses import FrozenInstanceError | ||
| 6 | - | ||
| 7 | -import pytest | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.result import ( | ||
| 10 | - DEFAULT_COMPONENT_WEIGHTS, | ||
| 11 | - ProbeResult, | ||
| 12 | - SuiteResult, | ||
| 13 | - SwayScore, | ||
| 14 | - Verdict, | ||
| 15 | - utcnow, | ||
| 16 | -) | ||
| 17 | - | ||
| 18 | - | ||
| 19 | -class TestVerdict: | ||
| 20 | - def test_is_str_enum(self) -> None: | ||
| 21 | - assert Verdict.PASS.value == "pass" | ||
| 22 | - assert str(Verdict.WARN.value) == "warn" | ||
| 23 | - | ||
| 24 | - def test_all_expected_members(self) -> None: | ||
| 25 | - assert {v.value for v in Verdict} == { | ||
| 26 | - "pass", | ||
| 27 | - "fail", | ||
| 28 | - "warn", | ||
| 29 | - "skip", | ||
| 30 | - "error", | ||
| 31 | - } | ||
| 32 | - | ||
| 33 | - | ||
| 34 | -class TestProbeResult: | ||
| 35 | - def test_minimum_construction(self) -> None: | ||
| 36 | - r = ProbeResult(name="t", kind="delta_kl", verdict=Verdict.PASS, score=0.82) | ||
| 37 | - assert r.raw is None | ||
| 38 | - assert r.evidence == {} | ||
| 39 | - assert r.message == "" | ||
| 40 | - assert r.duration_s == 0.0 | ||
| 41 | - | ||
| 42 | - def test_frozen(self) -> None: | ||
| 43 | - r = ProbeResult(name="t", kind="t", verdict=Verdict.PASS, score=0.5) | ||
| 44 | - with pytest.raises(FrozenInstanceError): | ||
| 45 | - r.score = 0.6 # type: ignore[misc] | ||
| 46 | - | ||
| 47 | - | ||
| 48 | -class TestSuiteResult: | ||
| 49 | - def test_wall_seconds(self) -> None: | ||
| 50 | - from datetime import timedelta | ||
| 51 | - | ||
| 52 | - started = utcnow() | ||
| 53 | - finished = started + timedelta(seconds=2, milliseconds=500) | ||
| 54 | - result = SuiteResult( | ||
| 55 | - spec_path="sway.yaml", | ||
| 56 | - started_at=started, | ||
| 57 | - finished_at=finished, | ||
| 58 | - base_model_id="b", | ||
| 59 | - adapter_id="a", | ||
| 60 | - sway_version="0.1.0.dev0", | ||
| 61 | - ) | ||
| 62 | - assert result.wall_seconds == pytest.approx(2.5, abs=1e-6) | ||
| 63 | - | ||
| 64 | - | ||
| 65 | -class TestSwayScore: | ||
| 66 | - def test_default_weights_sum_to_one(self) -> None: | ||
| 67 | - assert abs(sum(DEFAULT_COMPONENT_WEIGHTS.values()) - 1.0) < 1e-9 | ||
| 68 | - | ||
| 69 | - def test_band_boundaries(self) -> None: | ||
| 70 | - assert SwayScore.band_for(0.0) == "noise" | ||
| 71 | - assert SwayScore.band_for(0.29) == "noise" | ||
| 72 | - assert SwayScore.band_for(0.30) == "partial" | ||
| 73 | - assert SwayScore.band_for(0.59) == "partial" | ||
| 74 | - assert SwayScore.band_for(0.60) == "healthy" | ||
| 75 | - assert SwayScore.band_for(0.85) == "healthy" | ||
| 76 | - assert SwayScore.band_for(0.851) == "suspicious" | ||
| 77 | - assert SwayScore.band_for(0.99) == "suspicious" | ||
| 78 | - | ||
| 79 | - | ||
| 80 | -def test_utcnow_is_tz_aware() -> None: | ||
| 81 | - now = utcnow() | ||
| 82 | - assert now.tzinfo is not None | ||
sway/tests/unit/test_scoring.pydeleted@@ -1,84 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.scoring`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import math | ||
| 6 | - | ||
| 7 | -import numpy as np | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.scoring import ( | ||
| 10 | - DifferentialBackend, | ||
| 11 | - RollingLogprob, | ||
| 12 | - ScoringBackend, | ||
| 13 | - TokenDist, | ||
| 14 | -) | ||
| 15 | - | ||
| 16 | - | ||
| 17 | -class TestRollingLogprob: | ||
| 18 | - def test_empty_sequence(self) -> None: | ||
| 19 | - r = RollingLogprob( | ||
| 20 | - token_ids=np.array([42], dtype=np.int64), | ||
| 21 | - logprobs=np.array([], dtype=np.float32), | ||
| 22 | - num_tokens=1, | ||
| 23 | - total_logprob=0.0, | ||
| 24 | - ) | ||
| 25 | - assert r.mean_logprob == 0.0 | ||
| 26 | - assert r.perplexity == 1.0 | ||
| 27 | - | ||
| 28 | - def test_mean_and_perplexity(self) -> None: | ||
| 29 | - # Three tokens, two transition logprobs summing to -4.0 → mean -2.0. | ||
| 30 | - r = RollingLogprob( | ||
| 31 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 32 | - logprobs=np.array([-1.5, -2.5], dtype=np.float32), | ||
| 33 | - num_tokens=3, | ||
| 34 | - total_logprob=-4.0, | ||
| 35 | - ) | ||
| 36 | - assert math.isclose(r.mean_logprob, -2.0, rel_tol=1e-6) | ||
| 37 | - assert math.isclose(r.perplexity, math.exp(2.0), rel_tol=1e-6) | ||
| 38 | - | ||
| 39 | - | ||
| 40 | -class TestTokenDist: | ||
| 41 | - def test_construction_and_defaults(self) -> None: | ||
| 42 | - dist = TokenDist( | ||
| 43 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 44 | - logprobs=np.array([-0.1, -1.0, -3.0], dtype=np.float32), | ||
| 45 | - vocab_size=50_257, | ||
| 46 | - ) | ||
| 47 | - assert dist.tail_logprob == 0.0 | ||
| 48 | - assert dist.token_ids.shape == (3,) | ||
| 49 | - | ||
| 50 | - | ||
| 51 | -class TestProtocols: | ||
| 52 | - def test_scoring_backend_runtime_checkable(self) -> None: | ||
| 53 | - class FakeScoring: | ||
| 54 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 55 | - return 0.0 | ||
| 56 | - | ||
| 57 | - def rolling_logprob(self, text: str) -> RollingLogprob: | ||
| 58 | - return RollingLogprob( | ||
| 59 | - token_ids=np.array([0], dtype=np.int64), | ||
| 60 | - logprobs=np.array([], dtype=np.float32), | ||
| 61 | - num_tokens=1, | ||
| 62 | - total_logprob=0.0, | ||
| 63 | - ) | ||
| 64 | - | ||
| 65 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 66 | - return TokenDist( | ||
| 67 | - token_ids=np.array([0], dtype=np.int64), | ||
| 68 | - logprobs=np.array([0.0], dtype=np.float32), | ||
| 69 | - vocab_size=1, | ||
| 70 | - ) | ||
| 71 | - | ||
| 72 | - assert isinstance(FakeScoring(), ScoringBackend) | ||
| 73 | - | ||
| 74 | - def test_differential_backend_runtime_checkable(self) -> None: | ||
| 75 | - from contextlib import nullcontext | ||
| 76 | - | ||
| 77 | - class FakeDiff: | ||
| 78 | - def as_base(self): # type: ignore[no-untyped-def] | ||
| 79 | - return nullcontext(object()) | ||
| 80 | - | ||
| 81 | - def as_finetuned(self): # type: ignore[no-untyped-def] | ||
| 82 | - return nullcontext(object()) | ||
| 83 | - | ||
| 84 | - assert isinstance(FakeDiff(), DifferentialBackend) | ||
sway/tests/unit/test_sections.pydeleted@@ -1,35 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.sections`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.core.sections import ( | ||
| 6 | - Section, | ||
| 7 | - SectionPreference, | ||
| 8 | - SectionProbe, | ||
| 9 | - filter_kinds, | ||
| 10 | -) | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -def test_default_field_types() -> None: | ||
| 14 | - s = Section(id="abc", kind="prose", content="hello world") | ||
| 15 | - assert s.probes == () | ||
| 16 | - assert s.preferences == () | ||
| 17 | - assert s.tag is None | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -def test_filter_kinds() -> None: | ||
| 21 | - sections = ( | ||
| 22 | - Section(id="a", kind="prose", content="x"), | ||
| 23 | - Section(id="b", kind="instruction", content="y"), | ||
| 24 | - Section(id="c", kind="preference", content="z"), | ||
| 25 | - ) | ||
| 26 | - only_prose = filter_kinds(sections, ("prose",)) | ||
| 27 | - assert len(only_prose) == 1 | ||
| 28 | - assert only_prose[0].id == "a" | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -def test_section_probe_and_preference() -> None: | ||
| 32 | - p = SectionProbe(prompt="Q", gold="A") | ||
| 33 | - assert p.prompt == "Q" | ||
| 34 | - pref = SectionPreference(prompt="P", chosen="good", rejected="bad") | ||
| 35 | - assert pref.chosen == "good" | ||
sway/tests/unit/test_suite_runner.pydeleted@@ -1,134 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.suite.runner`. | ||
| 2 | - | ||
| 3 | -Uses the dummy backend + ad-hoc probe classes so nothing real is loaded. | ||
| 4 | -""" | ||
| 5 | - | ||
| 6 | -from __future__ import annotations | ||
| 7 | - | ||
| 8 | -from typing import Literal | ||
| 9 | - | ||
| 10 | -import pytest | ||
| 11 | - | ||
| 12 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 13 | -from dlm_sway.core.errors import ProbeError | ||
| 14 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 15 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 16 | -from dlm_sway.suite.runner import run | ||
| 17 | -from dlm_sway.suite.spec import SwaySpec | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -class _PassSpec(ProbeSpec): | ||
| 21 | - kind: Literal["__runner_pass"] = "__runner_pass" | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -class _PassProbe(Probe): | ||
| 25 | - kind = "__runner_pass" | ||
| 26 | - spec_cls = _PassSpec | ||
| 27 | - category = "adherence" | ||
| 28 | - | ||
| 29 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 30 | - return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.PASS, score=0.9) | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -class _FailSpec(ProbeSpec): | ||
| 34 | - kind: Literal["__runner_fail"] = "__runner_fail" | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class _FailProbe(Probe): | ||
| 38 | - kind = "__runner_fail" | ||
| 39 | - spec_cls = _FailSpec | ||
| 40 | - category = "attribution" | ||
| 41 | - | ||
| 42 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 43 | - return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.FAIL, score=0.1) | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -class _RaiseSpec(ProbeSpec): | ||
| 47 | - kind: Literal["__runner_raise"] = "__runner_raise" | ||
| 48 | - | ||
| 49 | - | ||
| 50 | -class _RaiseProbe(Probe): | ||
| 51 | - kind = "__runner_raise" | ||
| 52 | - spec_cls = _RaiseSpec | ||
| 53 | - | ||
| 54 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 55 | - raise ProbeError(spec.kind, "kaboom") | ||
| 56 | - | ||
| 57 | - | ||
| 58 | -class _UnexpectedSpec(ProbeSpec): | ||
| 59 | - kind: Literal["__runner_unexpected"] = "__runner_unexpected" | ||
| 60 | - | ||
| 61 | - | ||
| 62 | -class _UnexpectedProbe(Probe): | ||
| 63 | - kind = "__runner_unexpected" | ||
| 64 | - spec_cls = _UnexpectedSpec | ||
| 65 | - | ||
| 66 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 67 | - raise ValueError("surprise") | ||
| 68 | - | ||
| 69 | - | ||
| 70 | -@pytest.fixture | ||
| 71 | -def backend() -> DummyDifferentialBackend: | ||
| 72 | - return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | ||
| 73 | - | ||
| 74 | - | ||
| 75 | -def _spec(*entries: dict) -> SwaySpec: | ||
| 76 | - return SwaySpec.model_validate( | ||
| 77 | - { | ||
| 78 | - "version": 1, | ||
| 79 | - "models": { | ||
| 80 | - "base": {"base": "b"}, | ||
| 81 | - "ft": {"base": "b", "adapter": "/tmp/a"}, | ||
| 82 | - }, | ||
| 83 | - "suite": list(entries), | ||
| 84 | - } | ||
| 85 | - ) | ||
| 86 | - | ||
| 87 | - | ||
| 88 | -class TestRunner: | ||
| 89 | - def test_runs_each_probe_in_order(self, backend: DummyDifferentialBackend) -> None: | ||
| 90 | - spec = _spec( | ||
| 91 | - {"name": "p1", "kind": "__runner_pass"}, | ||
| 92 | - {"name": "p2", "kind": "__runner_fail"}, | ||
| 93 | - ) | ||
| 94 | - result = run(spec, backend) | ||
| 95 | - assert [r.name for r in result.probes] == ["p1", "p2"] | ||
| 96 | - assert result.probes[0].verdict == Verdict.PASS | ||
| 97 | - assert result.probes[1].verdict == Verdict.FAIL | ||
| 98 | - | ||
| 99 | - def test_disabled_probe_records_skip(self, backend: DummyDifferentialBackend) -> None: | ||
| 100 | - spec = _spec({"name": "p1", "kind": "__runner_pass", "enabled": False}) | ||
| 101 | - result = run(spec, backend) | ||
| 102 | - assert result.probes[0].verdict == Verdict.SKIP | ||
| 103 | - assert "disabled" in result.probes[0].message | ||
| 104 | - | ||
| 105 | - def test_probeerror_becomes_error_verdict(self, backend: DummyDifferentialBackend) -> None: | ||
| 106 | - spec = _spec({"name": "oops", "kind": "__runner_raise"}) | ||
| 107 | - result = run(spec, backend) | ||
| 108 | - assert result.probes[0].verdict == Verdict.ERROR | ||
| 109 | - assert "kaboom" in result.probes[0].message | ||
| 110 | - | ||
| 111 | - def test_unexpected_exception_becomes_error_verdict( | ||
| 112 | - self, backend: DummyDifferentialBackend | ||
| 113 | - ) -> None: | ||
| 114 | - spec = _spec({"name": "oops", "kind": "__runner_unexpected"}) | ||
| 115 | - result = run(spec, backend) | ||
| 116 | - assert result.probes[0].verdict == Verdict.ERROR | ||
| 117 | - assert "ValueError" in result.probes[0].message | ||
| 118 | - | ||
| 119 | - def test_wall_seconds_populated(self, backend: DummyDifferentialBackend) -> None: | ||
| 120 | - spec = _spec({"name": "p1", "kind": "__runner_pass"}) | ||
| 121 | - result = run(spec, backend) | ||
| 122 | - assert result.wall_seconds >= 0 | ||
| 123 | - assert result.probes[0].duration_s >= 0 | ||
| 124 | - | ||
| 125 | - def test_null_adapter_passes_on_null_calibrated_backend( | ||
| 126 | - self, backend: DummyDifferentialBackend | ||
| 127 | - ) -> None: | ||
| 128 | - # Dummy backend implements NullCalibratedBackend, so calibration runs. | ||
| 129 | - spec = _spec({"name": "null", "kind": "null_adapter", "runs": 2, "prompts": ["q1"]}) | ||
| 130 | - result = run(spec, backend) | ||
| 131 | - assert result.probes[0].kind == "null_adapter" | ||
| 132 | - assert result.probes[0].verdict == Verdict.PASS | ||
| 133 | - # And the suite's null_stats bubbles up onto the result. | ||
| 134 | - assert "delta_kl" in result.null_stats | ||
sway/tests/unit/test_suite_score_report.pydeleted@@ -1,217 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.suite.score` + :mod:`dlm_sway.suite.report`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import json | ||
| 6 | -from datetime import timedelta | ||
| 7 | -from typing import Literal | ||
| 8 | - | ||
| 9 | -import pytest | ||
| 10 | - | ||
| 11 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | ||
| 12 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 13 | -from dlm_sway.suite import report, score | ||
| 14 | -from dlm_sway.suite.spec import SwaySpec | ||
| 15 | - | ||
| 16 | - | ||
| 17 | -class _AdherenceSpec(ProbeSpec): | ||
| 18 | - kind: Literal["__score_adherence"] = "__score_adherence" | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -class _AdherenceProbe(Probe): | ||
| 22 | - kind = "__score_adherence" | ||
| 23 | - spec_cls = _AdherenceSpec | ||
| 24 | - category = "adherence" | ||
| 25 | - | ||
| 26 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 27 | - raise NotImplementedError # never executed; registered for category lookup | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -class _AttributionSpec(ProbeSpec): | ||
| 31 | - kind: Literal["__score_attribution"] = "__score_attribution" | ||
| 32 | - | ||
| 33 | - | ||
| 34 | -class _AttributionProbe(Probe): | ||
| 35 | - kind = "__score_attribution" | ||
| 36 | - spec_cls = _AttributionSpec | ||
| 37 | - category = "attribution" | ||
| 38 | - | ||
| 39 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 40 | - raise NotImplementedError | ||
| 41 | - | ||
| 42 | - | ||
| 43 | -def _synth_suite(*probes: ProbeResult) -> SuiteResult: | ||
| 44 | - started = utcnow() | ||
| 45 | - return SuiteResult( | ||
| 46 | - spec_path="sway.yaml", | ||
| 47 | - started_at=started, | ||
| 48 | - finished_at=started + timedelta(seconds=1), | ||
| 49 | - base_model_id="base", | ||
| 50 | - adapter_id="adapter", | ||
| 51 | - sway_version="0.1.0.dev0", | ||
| 52 | - probes=probes, | ||
| 53 | - ) | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -class TestCompute: | ||
| 57 | - def test_single_passing_probe(self) -> None: | ||
| 58 | - suite = _synth_suite( | ||
| 59 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.8) | ||
| 60 | - ) | ||
| 61 | - s = score.compute(suite) | ||
| 62 | - assert s.overall == pytest.approx(0.8) | ||
| 63 | - assert s.components["adherence"] == pytest.approx(0.8) | ||
| 64 | - assert s.band == "healthy" | ||
| 65 | - | ||
| 66 | - def test_mixed_categories_weighted(self) -> None: | ||
| 67 | - suite = _synth_suite( | ||
| 68 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9), | ||
| 69 | - ProbeResult(name="b", kind="__score_attribution", verdict=Verdict.PASS, score=0.3), | ||
| 70 | - ) | ||
| 71 | - s = score.compute(suite) | ||
| 72 | - # Active categories: adherence (0.30) + attribution (0.35). Normalized. | ||
| 73 | - expected = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35) | ||
| 74 | - assert s.overall == pytest.approx(expected) | ||
| 75 | - | ||
| 76 | - def test_errors_and_skips_excluded(self) -> None: | ||
| 77 | - suite = _synth_suite( | ||
| 78 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9), | ||
| 79 | - ProbeResult(name="b", kind="__score_adherence", verdict=Verdict.SKIP, score=None), | ||
| 80 | - ProbeResult(name="c", kind="__score_adherence", verdict=Verdict.ERROR, score=None), | ||
| 81 | - ) | ||
| 82 | - s = score.compute(suite) | ||
| 83 | - assert s.components["adherence"] == pytest.approx(0.9) | ||
| 84 | - | ||
| 85 | - def test_per_probe_weights_override_uniform(self) -> None: | ||
| 86 | - suite = _synth_suite( | ||
| 87 | - ProbeResult( | ||
| 88 | - name="a", | ||
| 89 | - kind="__score_adherence", | ||
| 90 | - verdict=Verdict.PASS, | ||
| 91 | - score=1.0, | ||
| 92 | - evidence={"weight": 3.0}, | ||
| 93 | - ), | ||
| 94 | - ProbeResult( | ||
| 95 | - name="b", | ||
| 96 | - kind="__score_adherence", | ||
| 97 | - verdict=Verdict.PASS, | ||
| 98 | - score=0.0, | ||
| 99 | - evidence={"weight": 1.0}, | ||
| 100 | - ), | ||
| 101 | - ) | ||
| 102 | - s = score.compute(suite) | ||
| 103 | - # Weighted mean: (3·1 + 1·0) / 4 = 0.75 | ||
| 104 | - assert s.components["adherence"] == pytest.approx(0.75) | ||
| 105 | - | ||
| 106 | - def test_failed_probe_surfaces_in_findings(self) -> None: | ||
| 107 | - suite = _synth_suite( | ||
| 108 | - ProbeResult( | ||
| 109 | - name="bad", | ||
| 110 | - kind="__score_adherence", | ||
| 111 | - verdict=Verdict.FAIL, | ||
| 112 | - score=0.1, | ||
| 113 | - message="nope", | ||
| 114 | - ) | ||
| 115 | - ) | ||
| 116 | - s = score.compute(suite) | ||
| 117 | - assert any("bad" in f for f in s.findings) | ||
| 118 | - | ||
| 119 | - | ||
| 120 | -class TestJsonReport: | ||
| 121 | - def test_schema_fields(self) -> None: | ||
| 122 | - suite = _synth_suite( | ||
| 123 | - ProbeResult( | ||
| 124 | - name="p1", | ||
| 125 | - kind="__score_adherence", | ||
| 126 | - verdict=Verdict.PASS, | ||
| 127 | - score=0.75, | ||
| 128 | - raw=0.12, | ||
| 129 | - z_score=3.1, | ||
| 130 | - ) | ||
| 131 | - ) | ||
| 132 | - s = score.compute(suite) | ||
| 133 | - out = json.loads(report.to_json(suite, s)) | ||
| 134 | - assert out["schema_version"] == 1 | ||
| 135 | - assert out["score"]["overall"] == pytest.approx(0.75) | ||
| 136 | - assert out["probes"][0]["verdict"] == "pass" | ||
| 137 | - assert out["probes"][0]["z_score"] == pytest.approx(3.1) | ||
| 138 | - | ||
| 139 | - | ||
| 140 | -class TestJunit: | ||
| 141 | - def test_counts_populated(self) -> None: | ||
| 142 | - suite = _synth_suite( | ||
| 143 | - ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=1.0), | ||
| 144 | - ProbeResult(name="p2", kind="__score_adherence", verdict=Verdict.FAIL, score=0.0), | ||
| 145 | - ProbeResult( | ||
| 146 | - name="p3", | ||
| 147 | - kind="__score_adherence", | ||
| 148 | - verdict=Verdict.ERROR, | ||
| 149 | - score=None, | ||
| 150 | - ), | ||
| 151 | - ) | ||
| 152 | - s = score.compute(suite) | ||
| 153 | - xml = report.to_junit(suite, s) | ||
| 154 | - assert 'tests="3"' in xml | ||
| 155 | - assert 'failures="1"' in xml | ||
| 156 | - assert 'errors="1"' in xml | ||
| 157 | - assert "<failure" in xml | ||
| 158 | - assert "<error" in xml | ||
| 159 | - | ||
| 160 | - | ||
| 161 | -class TestMarkdown: | ||
| 162 | - def test_contains_probe_table(self) -> None: | ||
| 163 | - suite = _synth_suite( | ||
| 164 | - ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=0.8) | ||
| 165 | - ) | ||
| 166 | - s = score.compute(suite) | ||
| 167 | - md = report.to_markdown(suite, s) | ||
| 168 | - assert "dlm-sway report" in md | ||
| 169 | - assert "| p1 | `__score_adherence`" in md | ||
| 170 | - | ||
| 171 | - | ||
| 172 | -class TestTerminal: | ||
| 173 | - def test_renders_without_error(self) -> None: | ||
| 174 | - import io | ||
| 175 | - | ||
| 176 | - from rich.console import Console | ||
| 177 | - | ||
| 178 | - suite = _synth_suite( | ||
| 179 | - ProbeResult( | ||
| 180 | - name="p1", | ||
| 181 | - kind="__score_adherence", | ||
| 182 | - verdict=Verdict.PASS, | ||
| 183 | - score=0.8, | ||
| 184 | - raw=0.12, | ||
| 185 | - z_score=3.1, | ||
| 186 | - message="looks fine", | ||
| 187 | - ), | ||
| 188 | - ProbeResult( | ||
| 189 | - name="p2", | ||
| 190 | - kind="__score_attribution", | ||
| 191 | - verdict=Verdict.FAIL, | ||
| 192 | - score=0.1, | ||
| 193 | - message="a very long message that will be truncated — " * 5, | ||
| 194 | - ), | ||
| 195 | - ProbeResult( | ||
| 196 | - name="p3", | ||
| 197 | - kind="__score_adherence", | ||
| 198 | - verdict=Verdict.SKIP, | ||
| 199 | - score=None, | ||
| 200 | - ), | ||
| 201 | - ) | ||
| 202 | - s = score.compute(suite) | ||
| 203 | - buf = io.StringIO() | ||
| 204 | - console = Console(file=buf, force_terminal=False, width=120) | ||
| 205 | - report.to_terminal(suite, s, console=console) | ||
| 206 | - out = buf.getvalue() | ||
| 207 | - assert "dlm-sway report" in out | ||
| 208 | - assert "overall:" in out | ||
| 209 | - assert "p1" in out | ||
| 210 | - assert "p2" in out | ||
| 211 | - # Top findings section kicks in because p2 failed. | ||
| 212 | - assert "top findings" in out | ||
| 213 | - | ||
| 214 | - | ||
| 215 | -# Force the SwaySpec model to stay reachable from tests (keeps mypy happy | ||
| 216 | -# on the eventual CLI path that calls into both). | ||
| 217 | -assert SwaySpec is not None | ||
sway/tests/unit/test_suite_spec.pydeleted@@ -1,85 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.suite.spec` + :mod:`dlm_sway.suite.loader`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from pathlib import Path | ||
| 6 | - | ||
| 7 | -import pytest | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.errors import SpecValidationError | ||
| 10 | -from dlm_sway.suite.loader import from_dict, load_spec | ||
| 11 | -from dlm_sway.suite.spec import SwaySpec | ||
| 12 | - | ||
| 13 | - | ||
| 14 | -def _minimum_valid() -> dict: | ||
| 15 | - return { | ||
| 16 | - "version": 1, | ||
| 17 | - "models": { | ||
| 18 | - "base": {"kind": "hf", "base": "HuggingFaceTB/SmolLM2-135M-Instruct"}, | ||
| 19 | - "ft": { | ||
| 20 | - "kind": "hf", | ||
| 21 | - "base": "HuggingFaceTB/SmolLM2-135M-Instruct", | ||
| 22 | - "adapter": "/tmp/adapter", | ||
| 23 | - }, | ||
| 24 | - }, | ||
| 25 | - "suite": [], | ||
| 26 | - } | ||
| 27 | - | ||
| 28 | - | ||
| 29 | -class TestSwaySpec: | ||
| 30 | - def test_minimum_valid(self) -> None: | ||
| 31 | - spec = from_dict(_minimum_valid()) | ||
| 32 | - assert isinstance(spec, SwaySpec) | ||
| 33 | - assert spec.version == 1 | ||
| 34 | - assert spec.defaults.seed == 0 | ||
| 35 | - assert spec.defaults.differential is True | ||
| 36 | - assert spec.suite == [] | ||
| 37 | - | ||
| 38 | - def test_rejects_unknown_top_level_keys(self) -> None: | ||
| 39 | - data = _minimum_valid() | ||
| 40 | - data["bogus"] = True | ||
| 41 | - with pytest.raises(SpecValidationError) as exc_info: | ||
| 42 | - from_dict(data) | ||
| 43 | - assert "bogus" in str(exc_info.value).lower() | ||
| 44 | - | ||
| 45 | - def test_rejects_future_version(self) -> None: | ||
| 46 | - data = _minimum_valid() | ||
| 47 | - data["version"] = 9 | ||
| 48 | - with pytest.raises(SpecValidationError, match="unsupported sway spec version"): | ||
| 49 | - from_dict(data) | ||
| 50 | - | ||
| 51 | - def test_defaults_frozen(self) -> None: | ||
| 52 | - spec = from_dict(_minimum_valid()) | ||
| 53 | - from pydantic import ValidationError | ||
| 54 | - | ||
| 55 | - with pytest.raises(ValidationError): | ||
| 56 | - spec.defaults.seed = 99 # type: ignore[misc] | ||
| 57 | - | ||
| 58 | - | ||
| 59 | -class TestLoader: | ||
| 60 | - def test_missing_file(self, tmp_path: Path) -> None: | ||
| 61 | - missing = tmp_path / "nope.yaml" | ||
| 62 | - with pytest.raises(SpecValidationError, match="not found"): | ||
| 63 | - load_spec(missing) | ||
| 64 | - | ||
| 65 | - def test_invalid_yaml(self, tmp_path: Path) -> None: | ||
| 66 | - bad = tmp_path / "bad.yaml" | ||
| 67 | - # An unmatched { triggers yaml.scanner; a structurally ambiguous | ||
| 68 | - # indent parses as a string value, which isn't a YAML error. | ||
| 69 | - bad.write_text("{ unmatched: [", encoding="utf-8") | ||
| 70 | - with pytest.raises(SpecValidationError, match="invalid YAML"): | ||
| 71 | - load_spec(bad) | ||
| 72 | - | ||
| 73 | - def test_non_mapping_top_level(self, tmp_path: Path) -> None: | ||
| 74 | - bad = tmp_path / "list.yaml" | ||
| 75 | - bad.write_text("- 1\n- 2\n", encoding="utf-8") | ||
| 76 | - with pytest.raises(SpecValidationError, match="must be a mapping"): | ||
| 77 | - load_spec(bad) | ||
| 78 | - | ||
| 79 | - def test_roundtrip_via_yaml(self, tmp_path: Path) -> None: | ||
| 80 | - import yaml | ||
| 81 | - | ||
| 82 | - path = tmp_path / "sway.yaml" | ||
| 83 | - path.write_text(yaml.safe_dump(_minimum_valid()), encoding="utf-8") | ||
| 84 | - spec = load_spec(path) | ||
| 85 | - assert spec.models.ft.adapter == Path("/tmp/adapter") | ||
sway/tests/unit/test_visualize.pydeleted@@ -1,202 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.visualize`. | ||
| 2 | - | ||
| 3 | -Exercises the error path (matplotlib missing) and the happy path when | ||
| 4 | -the module is present by stubbing ``matplotlib.pyplot`` via sys.modules. | ||
| 5 | -""" | ||
| 6 | - | ||
| 7 | -from __future__ import annotations | ||
| 8 | - | ||
| 9 | -import sys | ||
| 10 | -import types | ||
| 11 | -from datetime import timedelta | ||
| 12 | - | ||
| 13 | -import pytest | ||
| 14 | - | ||
| 15 | -from dlm_sway.core.errors import BackendNotAvailableError | ||
| 16 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | ||
| 17 | - | ||
| 18 | - | ||
| 19 | -def _suite_with(*probes: ProbeResult) -> SuiteResult: | ||
| 20 | - started = utcnow() | ||
| 21 | - return SuiteResult( | ||
| 22 | - spec_path="sway.yaml", | ||
| 23 | - started_at=started, | ||
| 24 | - finished_at=started + timedelta(seconds=1), | ||
| 25 | - base_model_id="b", | ||
| 26 | - adapter_id="a", | ||
| 27 | - sway_version="0.1.0.dev0", | ||
| 28 | - probes=probes, | ||
| 29 | - ) | ||
| 30 | - | ||
| 31 | - | ||
| 32 | -class _FakeFig: | ||
| 33 | - def tight_layout(self) -> None: # pragma: no cover — trivial | ||
| 34 | - return None | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class _FakeAx: | ||
| 38 | - def __init__(self) -> None: | ||
| 39 | - self.calls: list[str] = [] | ||
| 40 | - | ||
| 41 | - def bar(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 42 | - self.calls.append("bar") | ||
| 43 | - | ||
| 44 | - def plot(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 45 | - self.calls.append("plot") | ||
| 46 | - | ||
| 47 | - def hist(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 48 | - self.calls.append("hist") | ||
| 49 | - | ||
| 50 | - def axhline(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 51 | - return None | ||
| 52 | - | ||
| 53 | - def axvline(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 54 | - return None | ||
| 55 | - | ||
| 56 | - def set_xticks(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 57 | - return None | ||
| 58 | - | ||
| 59 | - def set_xticklabels(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 60 | - return None | ||
| 61 | - | ||
| 62 | - def set_xlabel(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 63 | - return None | ||
| 64 | - | ||
| 65 | - def set_ylabel(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 66 | - return None | ||
| 67 | - | ||
| 68 | - def set_title(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 69 | - return None | ||
| 70 | - | ||
| 71 | - def legend(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 72 | - return None | ||
| 73 | - | ||
| 74 | - | ||
| 75 | -@pytest.fixture | ||
| 76 | -def fake_mpl(monkeypatch: pytest.MonkeyPatch) -> _FakeAx: | ||
| 77 | - ax = _FakeAx() | ||
| 78 | - | ||
| 79 | - def _subplots(*a, **k): # type: ignore[no-untyped-def] | ||
| 80 | - return _FakeFig(), ax | ||
| 81 | - | ||
| 82 | - plt = types.ModuleType("matplotlib.pyplot") | ||
| 83 | - plt.subplots = _subplots # type: ignore[attr-defined] | ||
| 84 | - mpl_pkg = types.ModuleType("matplotlib") | ||
| 85 | - monkeypatch.setitem(sys.modules, "matplotlib", mpl_pkg) | ||
| 86 | - monkeypatch.setitem(sys.modules, "matplotlib.pyplot", plt) | ||
| 87 | - return ax | ||
| 88 | - | ||
| 89 | - | ||
| 90 | -def test_section_sis_plot_uses_per_section_evidence(fake_mpl: _FakeAx) -> None: | ||
| 91 | - from dlm_sway.visualize import plot_section_sis | ||
| 92 | - | ||
| 93 | - suite = _suite_with( | ||
| 94 | - ProbeResult( | ||
| 95 | - name="sis", | ||
| 96 | - kind="section_internalization", | ||
| 97 | - verdict=Verdict.PASS, | ||
| 98 | - score=0.75, | ||
| 99 | - raw=0.1, | ||
| 100 | - evidence={ | ||
| 101 | - "per_section": [ | ||
| 102 | - { | ||
| 103 | - "section_id": "a", | ||
| 104 | - "kind": "prose", | ||
| 105 | - "tag": None, | ||
| 106 | - "base_nll": 3.0, | ||
| 107 | - "ft_nll": 2.5, | ||
| 108 | - "own_lift": 0.17, | ||
| 109 | - "leak_lift": 0.02, | ||
| 110 | - "effective_sis": 0.15, | ||
| 111 | - "passed": True, | ||
| 112 | - }, | ||
| 113 | - { | ||
| 114 | - "section_id": "b", | ||
| 115 | - "kind": "instruction", | ||
| 116 | - "tag": "intro", | ||
| 117 | - "base_nll": 4.0, | ||
| 118 | - "ft_nll": 3.9, | ||
| 119 | - "own_lift": 0.025, | ||
| 120 | - "leak_lift": 0.03, | ||
| 121 | - "effective_sis": -0.005, | ||
| 122 | - "passed": False, | ||
| 123 | - }, | ||
| 124 | - ], | ||
| 125 | - "per_section_threshold": 0.05, | ||
| 126 | - }, | ||
| 127 | - ) | ||
| 128 | - ) | ||
| 129 | - plot_section_sis(suite) | ||
| 130 | - assert "bar" in fake_mpl.calls | ||
| 131 | - | ||
| 132 | - | ||
| 133 | -def test_adapter_ablation_plot(fake_mpl: _FakeAx) -> None: | ||
| 134 | - from dlm_sway.visualize import plot_adapter_ablation | ||
| 135 | - | ||
| 136 | - suite = _suite_with( | ||
| 137 | - ProbeResult( | ||
| 138 | - name="abl", | ||
| 139 | - kind="adapter_ablation", | ||
| 140 | - verdict=Verdict.PASS, | ||
| 141 | - score=0.8, | ||
| 142 | - raw=0.9, | ||
| 143 | - evidence={ | ||
| 144 | - "lambdas": [0.0, 0.5, 1.0, 1.25], | ||
| 145 | - "mean_divergence_per_lambda": [0.0, 0.5, 1.0, 1.1], | ||
| 146 | - "linearity": 0.91, | ||
| 147 | - "saturation_lambda": 0.75, | ||
| 148 | - "overshoot": 1.1, | ||
| 149 | - }, | ||
| 150 | - ) | ||
| 151 | - ) | ||
| 152 | - plot_adapter_ablation(suite) | ||
| 153 | - assert "plot" in fake_mpl.calls | ||
| 154 | - | ||
| 155 | - | ||
| 156 | -def test_kl_histogram_plot(fake_mpl: _FakeAx) -> None: | ||
| 157 | - from dlm_sway.visualize import plot_kl_histogram | ||
| 158 | - | ||
| 159 | - suite = _suite_with( | ||
| 160 | - ProbeResult( | ||
| 161 | - name="dk", | ||
| 162 | - kind="delta_kl", | ||
| 163 | - verdict=Verdict.PASS, | ||
| 164 | - score=0.7, | ||
| 165 | - raw=0.1, | ||
| 166 | - evidence={"per_prompt": [0.05, 0.1, 0.12, 0.09, 0.15], "divergence_kind": "js"}, | ||
| 167 | - ) | ||
| 168 | - ) | ||
| 169 | - plot_kl_histogram(suite) | ||
| 170 | - assert "hist" in fake_mpl.calls | ||
| 171 | - | ||
| 172 | - | ||
| 173 | -def test_raises_when_matplotlib_missing(monkeypatch: pytest.MonkeyPatch) -> None: | ||
| 174 | - # Purge matplotlib modules and block imports. | ||
| 175 | - for mod in list(sys.modules): | ||
| 176 | - if mod == "matplotlib" or mod.startswith("matplotlib."): | ||
| 177 | - monkeypatch.delitem(sys.modules, mod, raising=False) | ||
| 178 | - | ||
| 179 | - import builtins | ||
| 180 | - | ||
| 181 | - real_import = builtins.__import__ | ||
| 182 | - | ||
| 183 | - def fake_import(name: str, *a, **k): # type: ignore[no-untyped-def] | ||
| 184 | - if name == "matplotlib" or name.startswith("matplotlib."): | ||
| 185 | - raise ImportError("matplotlib missing in this venv") | ||
| 186 | - return real_import(name, *a, **k) | ||
| 187 | - | ||
| 188 | - monkeypatch.setattr(builtins, "__import__", fake_import) | ||
| 189 | - | ||
| 190 | - from dlm_sway.visualize import plot_section_sis | ||
| 191 | - | ||
| 192 | - suite = _suite_with() | ||
| 193 | - with pytest.raises(BackendNotAvailableError): | ||
| 194 | - plot_section_sis(suite) | ||
| 195 | - | ||
| 196 | - | ||
| 197 | -def test_raises_when_no_matching_probe(fake_mpl: _FakeAx) -> None: | ||
| 198 | - from dlm_sway.visualize import plot_section_sis | ||
| 199 | - | ||
| 200 | - suite = _suite_with() # empty — no section_internalization probe | ||
| 201 | - with pytest.raises(ValueError, match="section_internalization"): | ||
| 202 | - plot_section_sis(suite) | ||