sway: convert in-tree subproject to git submodule pointing at tenseleyFlow/sway
- SHA
72bb0030b72321dea3c66a2e6d7ce26e52c74550- Parents
-
9da4019 - Tree
e628ba5
72bb003
72bb0030b72321dea3c66a2e6d7ce26e52c745509da4019
e628ba5.gitmodulesmodified@@ -5,3 +5,6 @@ | |||
| 5 | # `scripts/bump-llama-cpp.sh build` writes under vendor/llama.cpp/build/ | 5 | # `scripts/bump-llama-cpp.sh build` writes under vendor/llama.cpp/build/ |
| 6 | # which the submodule's own .gitignore covers. | 6 | # which the submodule's own .gitignore covers. |
| 7 | ignore = untracked | 7 | ignore = untracked |
| 8 | +[submodule "sway"] | ||
| 9 | + path = sway | ||
| 10 | + url = https://github.com/tenseleyFlow/sway.git | ||
swayadded@@ -0,0 +1,1 @@ | |||
| 1 | +Subproject commit 98ad9417c94e1bbeb97cf5e553878d7953513f69 | ||
sway/CHANGELOG.mddeleted@@ -1,41 +0,0 @@ | |||
| 1 | -# Changelog | ||
| 2 | - | ||
| 3 | -## 0.1.0.dev0 — 2026-04-20 | ||
| 4 | - | ||
| 5 | -Initial pre-alpha. Full 11-primitive battery shipped. | ||
| 6 | - | ||
| 7 | -### Primitives | ||
| 8 | - | ||
| 9 | -- **Adherence** | ||
| 10 | - - `delta_kl` — mean JS/KL divergence between base and fine-tuned next-token distributions | ||
| 11 | - - `adapter_revert` — reversion under adversarial paraphrase (needs `sway-eval[semsim]`) | ||
| 12 | - - `prompt_collapse` — exponential-decay fit of divergence over context length | ||
| 13 | -- **Attribution** | ||
| 14 | - - `section_internalization` *(flagship)* — per-section `effective_sis` with leak check | ||
| 15 | - - `paraphrase_invariance` — memorization vs. generalization, intent-aware | ||
| 16 | - - `preference_flip` — DPO/ORPO chosen/rejected margin inversion | ||
| 17 | -- **Calibration** | ||
| 18 | - - `style_fingerprint` — 6-dim numpy-only stylistic shift vs. document | ||
| 19 | - - `calibration_drift` — general-knowledge regression on a packaged 30-item pack | ||
| 20 | - - `leakage` — greedy LCS recall + perturbation fragility | ||
| 21 | -- **Ablation** | ||
| 22 | - - `adapter_ablation` *(signature primitive)* — λ-scaled divergence curve with linearity, saturation, overshoot metrics | ||
| 23 | -- **Baseline** | ||
| 24 | - - `null_adapter` — stats scaffolding for z-score calibration (implementation pending) | ||
| 25 | - | ||
| 26 | -### Infrastructure | ||
| 27 | - | ||
| 28 | -- `DifferentialBackend` + `ScalableDifferentialBackend` protocols | ||
| 29 | -- HuggingFace + PEFT backend with `disable_adapter` / `set_adapter` toggling and LoRA-scale mutation | ||
| 30 | -- Dummy backend for unit tests (canned responses + linear-blend scalable mode) | ||
| 31 | -- YAML spec loader, composite score (four-category weighted), rich terminal + JSON + JUnit + Markdown reports | ||
| 32 | -- Typer CLI: `run`, `gate`, `check`, `diff`, `autogen`, `doctor`, `report` | ||
| 33 | -- `.dlm` bridge (`dlm-sway[dlm]`): resolver + full-battery autogen | ||
| 34 | -- Matplotlib visualizations (`dlm-sway[viz]`): SIS bar chart, ablation curve, KL histogram | ||
| 35 | - | ||
| 36 | -### Known gaps | ||
| 37 | - | ||
| 38 | -- Null-adapter baseline is scaffolded but its HF-level materialization (building random-init LoRAs at matched rank) is not yet wired — probes fall back to fixed thresholds until the next milestone. | ||
| 39 | -- Custom backend entry-point dispatch (`kind: custom`) is stubbed but not implemented. | ||
| 40 | -- MLX backend is registered as a future-milestone target; all MLX paths raise `BackendNotAvailableError`. | ||
| 41 | -- PyPI publication of the `dlm-sway` wheel is pending a clean CI release workflow. | ||
sway/LICENSEdeleted@@ -1,21 +0,0 @@ | |||
| 1 | -MIT License | ||
| 2 | - | ||
| 3 | -Copyright (c) 2026 Matt Wolffe | ||
| 4 | - | ||
| 5 | -Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 6 | -of this software and associated documentation files (the "Software"), to deal | ||
| 7 | -in the Software without restriction, including without limitation the rights | ||
| 8 | -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 9 | -copies of the Software, and to permit persons to whom the Software is | ||
| 10 | -furnished to do so, subject to the following conditions: | ||
| 11 | - | ||
| 12 | -The above copyright notice and this permission notice shall be included in all | ||
| 13 | -copies or substantial portions of the Software. | ||
| 14 | - | ||
| 15 | -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 16 | -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 17 | -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 18 | -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 19 | -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 20 | -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 21 | -SOFTWARE. | ||
sway/README.mddeleted@@ -1,101 +0,0 @@ | |||
| 1 | -# dlm-sway | ||
| 2 | - | ||
| 3 | -Differential testing for fine-tuned causal language models. | ||
| 4 | - | ||
| 5 | -**One question:** *did LoRA/QLoRA training actually change model behavior | ||
| 6 | -in a meaningful way, or is the model just defaulting to the pretrained | ||
| 7 | -base?* | ||
| 8 | - | ||
| 9 | -`dlm-sway` gives you a trustworthy, reproducible answer with eleven | ||
| 10 | -purpose-built primitives, each z-scored against a null-adapter baseline. | ||
| 11 | -No LLM judges. No external APIs. Deterministic on CPU where possible. | ||
| 12 | - | ||
| 13 | -## Install | ||
| 14 | - | ||
| 15 | -```bash | ||
| 16 | -pip install "dlm-sway[hf]" # HuggingFace + PEFT backend | ||
| 17 | -pip install "dlm-sway[hf,style,semsim]" # full primitive battery | ||
| 18 | -pip install "dlm-sway[all]" # everything including optional viz | ||
| 19 | -pip install "dlm-sway[dlm]" # auto-generate tests from a .dlm file | ||
| 20 | -``` | ||
| 21 | - | ||
| 22 | -## 90-second smoke test | ||
| 23 | - | ||
| 24 | -```bash | ||
| 25 | -dlm-sway check path/to/adapter --base HuggingFaceTB/SmolLM2-135M-Instruct | ||
| 26 | -``` | ||
| 27 | - | ||
| 28 | -Outputs a verdict in under a minute on CPU for small models: *your | ||
| 29 | -adapter is 4.2σ above noise* ✅ or *indistinguishable from a null | ||
| 30 | -adapter* ❌. | ||
| 31 | - | ||
| 32 | -## Full suite | ||
| 33 | - | ||
| 34 | -```yaml | ||
| 35 | -# sway.yaml | ||
| 36 | -version: 1 | ||
| 37 | -models: | ||
| 38 | - base: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct"} | ||
| 39 | - ft: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct", | ||
| 40 | - adapter: "./runs/adapter/v0003"} | ||
| 41 | -suite: | ||
| 42 | - - {name: knows_concept, kind: dir, | ||
| 43 | - prompt: "The Dunning-Kruger effect describes", | ||
| 44 | - target: " a cognitive bias where", | ||
| 45 | - distractor: " a programming language"} | ||
| 46 | - - {name: no_reversion, kind: adapter_revert, paraphrases: 4} | ||
| 47 | - - {name: section_attribution, kind: section_internalization} | ||
| 48 | -``` | ||
| 49 | - | ||
| 50 | -```bash | ||
| 51 | -dlm-sway run sway.yaml # full report to terminal + JSON | ||
| 52 | -dlm-sway gate sway.yaml --junit # CI-friendly; non-zero on fail | ||
| 53 | -``` | ||
| 54 | - | ||
| 55 | -## Why it exists | ||
| 56 | - | ||
| 57 | -Standard benchmarks (MMLU, HellaSwag) ask *"how good is this model?"* | ||
| 58 | -That's the wrong question after a targeted LoRA fine-tune on a small | ||
| 59 | -user-authored document. The right question is *"did the adapter actually | ||
| 60 | -move the model toward what I wrote?"* — and existing tools answer this | ||
| 61 | -poorly. | ||
| 62 | - | ||
| 63 | -`dlm-sway` answers it directly via eleven primitives across four | ||
| 64 | -categories: | ||
| 65 | - | ||
| 66 | -| Category | Primitives | | ||
| 67 | -|---------------|-------------------------------------------------------| | ||
| 68 | -| Adherence | `delta_kl`, `adapter_revert`, `prompt_collapse` | | ||
| 69 | -| Attribution | `section_internalization`, `paraphrase_invariance`, `preference_flip` | | ||
| 70 | -| Calibration | `style_fingerprint`, `calibration_drift`, `leakage` | | ||
| 71 | -| Ablation | `adapter_ablation` ← the signature primitive | | ||
| 72 | - | ||
| 73 | -**The signature primitive.** `adapter_ablation` scales the LoRA additive | ||
| 74 | -term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} and measures the divergence | ||
| 75 | -curve. A healthy fine-tune shows a smooth, monotonic, non-saturated | ||
| 76 | -response. A degenerate one shows a step function or an overshoot-then- | ||
| 77 | -crash. Nobody else does this because nobody else gets this close to the | ||
| 78 | -adapter math. | ||
| 79 | - | ||
| 80 | -## The `.dlm` integration | ||
| 81 | - | ||
| 82 | -If you trained your adapter via the [DocumentLanguageModel | ||
| 83 | -project](https://github.com/tenseleyFlow/DocumentLanguageModel), sway | ||
| 84 | -can auto-generate a test suite from your document's sections: | ||
| 85 | - | ||
| 86 | -```bash | ||
| 87 | -pip install "dlm-sway[hf,dlm]" | ||
| 88 | -dlm-sway autogen path/to/doc.dlm -o sway.yaml | ||
| 89 | -dlm-sway run sway.yaml | ||
| 90 | -``` | ||
| 91 | - | ||
| 92 | -Per-section attribution tells you *which* parts of your document | ||
| 93 | -actually moved the model — a kind of signal no other tool provides. | ||
| 94 | - | ||
| 95 | -## Status | ||
| 96 | - | ||
| 97 | -Pre-alpha. API will break. Version `0.1.0` is the first tag. | ||
| 98 | - | ||
| 99 | -## License | ||
| 100 | - | ||
| 101 | -MIT | ||
sway/pyproject.tomldeleted@@ -1,210 +0,0 @@ | |||
| 1 | -[project] | ||
| 2 | -name = "dlm-sway" | ||
| 3 | -version = "0.1.0.dev0" | ||
| 4 | -description = "Differential testing for fine-tuned causal LMs: did LoRA/QLoRA training actually change behavior, or is the model defaulting to the pretrained base?" | ||
| 5 | -readme = "README.md" | ||
| 6 | -requires-python = ">=3.11" | ||
| 7 | -license = { text = "MIT" } | ||
| 8 | -authors = [{ name = "Matt Wolffe", email = "mfwolffe@outlook.com" }] | ||
| 9 | -keywords = [ | ||
| 10 | - "lora", | ||
| 11 | - "qlora", | ||
| 12 | - "peft", | ||
| 13 | - "fine-tuning", | ||
| 14 | - "evaluation", | ||
| 15 | - "llm", | ||
| 16 | - "differential-testing", | ||
| 17 | -] | ||
| 18 | -classifiers = [ | ||
| 19 | - "Development Status :: 3 - Alpha", | ||
| 20 | - "Intended Audience :: Developers", | ||
| 21 | - "Intended Audience :: Science/Research", | ||
| 22 | - "License :: OSI Approved :: MIT License", | ||
| 23 | - "Programming Language :: Python :: 3", | ||
| 24 | - "Programming Language :: Python :: 3.11", | ||
| 25 | - "Programming Language :: Python :: 3.12", | ||
| 26 | - "Topic :: Scientific/Engineering :: Artificial Intelligence", | ||
| 27 | -] | ||
| 28 | - | ||
| 29 | -# Core deps: spec loading, orchestration, reporting. No torch — a user | ||
| 30 | -# who only defines specs or writes a custom backend shouldn't pull 3 GB | ||
| 31 | -# of CUDA wheels. | ||
| 32 | -dependencies = [ | ||
| 33 | - "pydantic>=2.9", | ||
| 34 | - "pyyaml>=6.0", | ||
| 35 | - "typer>=0.12", | ||
| 36 | - "rich>=13.7", | ||
| 37 | - "numpy>=1.26", | ||
| 38 | - "packaging>=24.0", | ||
| 39 | -] | ||
| 40 | - | ||
| 41 | -[project.optional-dependencies] | ||
| 42 | -# HuggingFace + PEFT scoring backend. The canonical path. | ||
| 43 | -hf = [ | ||
| 44 | - "torch>=2.4", | ||
| 45 | - "transformers>=4.45", | ||
| 46 | - "peft>=0.13", | ||
| 47 | - "safetensors>=0.4", | ||
| 48 | -] | ||
| 49 | -# Apple Silicon inference. Env markers keep `uv sync --extra mlx` a no-op | ||
| 50 | -# on non-Apple hosts so Linux/CUDA contributors' wheel resolution stays | ||
| 51 | -# sane. | ||
| 52 | -mlx = [ | ||
| 53 | - "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 54 | - "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 55 | -] | ||
| 56 | -# Stylistic fingerprinting (C1). spaCy models pull at runtime via | ||
| 57 | -# `python -m spacy download`. | ||
| 58 | -style = [ | ||
| 59 | - "spacy>=3.7", | ||
| 60 | - "textstat>=0.7", | ||
| 61 | - "nlpaug>=1.1", | ||
| 62 | -] | ||
| 63 | -# Semantic similarity (A2). MiniLM ~80 MB, CPU-friendly. | ||
| 64 | -semsim = [ | ||
| 65 | - "sentence-transformers>=3.0", | ||
| 66 | -] | ||
| 67 | -# Optional .dlm integration. Only imported inside dlm_sway.integrations.dlm. | ||
| 68 | -dlm = [ | ||
| 69 | - "dlm>=0.9", | ||
| 70 | -] | ||
| 71 | -# Visualization (P9). | ||
| 72 | -viz = [ | ||
| 73 | - "matplotlib>=3.8", | ||
| 74 | -] | ||
| 75 | -all = [ | ||
| 76 | - "torch>=2.4", | ||
| 77 | - "transformers>=4.45", | ||
| 78 | - "peft>=0.13", | ||
| 79 | - "safetensors>=0.4", | ||
| 80 | - "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 81 | - "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 82 | - "spacy>=3.7", | ||
| 83 | - "textstat>=0.7", | ||
| 84 | - "nlpaug>=1.1", | ||
| 85 | - "sentence-transformers>=3.0", | ||
| 86 | - "matplotlib>=3.8", | ||
| 87 | -] | ||
| 88 | - | ||
| 89 | -[project.scripts] | ||
| 90 | -dlm-sway = "dlm_sway.cli.app:main" | ||
| 91 | - | ||
| 92 | -[project.urls] | ||
| 93 | -Homepage = "https://github.com/tenseleyFlow/DocumentLanguageModel" | ||
| 94 | -Issues = "https://github.com/tenseleyFlow/DocumentLanguageModel/issues" | ||
| 95 | - | ||
| 96 | -[dependency-groups] | ||
| 97 | -dev = [ | ||
| 98 | - "pytest>=8.0", | ||
| 99 | - "pytest-cov>=5.0", | ||
| 100 | - "mypy>=1.11", | ||
| 101 | - "ruff>=0.6", | ||
| 102 | - "types-pyyaml>=6.0", | ||
| 103 | - "hypothesis>=6.152.1", | ||
| 104 | -] | ||
| 105 | - | ||
| 106 | -[build-system] | ||
| 107 | -requires = ["hatchling"] | ||
| 108 | -build-backend = "hatchling.build" | ||
| 109 | - | ||
| 110 | -[tool.hatch.build.targets.wheel] | ||
| 111 | -packages = ["src/dlm_sway"] | ||
| 112 | - | ||
| 113 | -# -------- ruff -------- | ||
| 114 | -[tool.ruff] | ||
| 115 | -line-length = 100 | ||
| 116 | -target-version = "py311" | ||
| 117 | -src = ["src", "tests"] | ||
| 118 | - | ||
| 119 | -[tool.ruff.lint] | ||
| 120 | -select = [ | ||
| 121 | - "E", # pycodestyle errors | ||
| 122 | - "F", # pyflakes | ||
| 123 | - "W", # pycodestyle warnings | ||
| 124 | - "I", # isort | ||
| 125 | - "UP", # pyupgrade | ||
| 126 | - "B", # bugbear | ||
| 127 | - "N", # pep8-naming | ||
| 128 | - "C4", # comprehensions | ||
| 129 | - "SIM", # simplify | ||
| 130 | - "PT", # pytest | ||
| 131 | - "RET", # return | ||
| 132 | - "ARG", # unused args | ||
| 133 | - "PTH", # use pathlib | ||
| 134 | - "TID", # tidy imports | ||
| 135 | -] | ||
| 136 | -ignore = [ | ||
| 137 | - "E501", # handled by formatter | ||
| 138 | -] | ||
| 139 | - | ||
| 140 | -[tool.ruff.lint.per-file-ignores] | ||
| 141 | -"tests/**/*.py" = ["ARG", "PT011", "SIM117"] | ||
| 142 | -# PyTorch's canonical `import torch.nn.functional as F` is universally | ||
| 143 | -# read, so we allow the naming exception in the HF backend only. | ||
| 144 | -"src/dlm_sway/backends/hf.py" = ["N812"] | ||
| 145 | -# The .dlm bridge is the one place allowed to import the ``dlm`` package. | ||
| 146 | -"src/dlm_sway/integrations/dlm/*.py" = ["TID251"] | ||
| 147 | - | ||
| 148 | -[tool.ruff.lint.flake8-tidy-imports.banned-api] | ||
| 149 | -# Hard architectural boundary: the `dlm` package is only importable | ||
| 150 | -# from inside the optional integration shim. This keeps dlm-sway | ||
| 151 | -# usable for anyone with just a HuggingFace base + PEFT adapter. | ||
| 152 | -"dlm".msg = "Import `dlm` only from dlm_sway.integrations.dlm (the optional extra)." | ||
| 153 | - | ||
| 154 | -[tool.ruff.format] | ||
| 155 | -quote-style = "double" | ||
| 156 | -indent-style = "space" | ||
| 157 | - | ||
| 158 | -# -------- mypy -------- | ||
| 159 | -[tool.mypy] | ||
| 160 | -strict = true | ||
| 161 | -python_version = "3.11" | ||
| 162 | -packages = ["dlm_sway"] | ||
| 163 | -mypy_path = "src" | ||
| 164 | -warn_return_any = true | ||
| 165 | -warn_unused_ignores = true | ||
| 166 | -warn_redundant_casts = true | ||
| 167 | -no_implicit_optional = true | ||
| 168 | -disallow_untyped_decorators = true | ||
| 169 | -plugins = ["pydantic.mypy"] | ||
| 170 | - | ||
| 171 | -[tool.pydantic-mypy] | ||
| 172 | -init_forbid_extra = true | ||
| 173 | -init_typed = true | ||
| 174 | -warn_required_dynamic_aliases = true | ||
| 175 | - | ||
| 176 | -# Stubless ML ecosystem packages. Narrow boundaries in backends/* import | ||
| 177 | -# them explicitly; the rest of the codebase stays strict. | ||
| 178 | -[[tool.mypy.overrides]] | ||
| 179 | -module = [ | ||
| 180 | - "torch", | ||
| 181 | - "torch.*", | ||
| 182 | - "transformers.*", | ||
| 183 | - "peft.*", | ||
| 184 | - "safetensors.*", | ||
| 185 | - "mlx.*", | ||
| 186 | - "mlx_lm.*", | ||
| 187 | - "sentence_transformers.*", | ||
| 188 | - "spacy.*", | ||
| 189 | - "textstat.*", | ||
| 190 | - "nlpaug.*", | ||
| 191 | - "matplotlib", | ||
| 192 | - "matplotlib.*", | ||
| 193 | - "huggingface_hub.*", | ||
| 194 | - "dlm.*", | ||
| 195 | -] | ||
| 196 | -ignore_missing_imports = true | ||
| 197 | -disable_error_code = ["no-untyped-call"] | ||
| 198 | - | ||
| 199 | -# -------- pytest -------- | ||
| 200 | -[tool.pytest.ini_options] | ||
| 201 | -testpaths = ["tests"] | ||
| 202 | -addopts = [ | ||
| 203 | - "-ra", | ||
| 204 | - "-m", "not slow and not gpu and not online", | ||
| 205 | -] | ||
| 206 | -markers = [ | ||
| 207 | - "slow: expensive; deselected by default", | ||
| 208 | - "gpu: requires CUDA; skipped on CPU/MPS runners", | ||
| 209 | - "online: touches the network; skipped in offline CI", | ||
| 210 | -] | ||
sway/src/dlm_sway/__init__.pydeleted@@ -1,42 +0,0 @@ | |||
| 1 | -"""dlm-sway — differential testing for fine-tuned causal language models.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.core.errors import ( | ||
| 6 | - BackendNotAvailableError, | ||
| 7 | - ProbeError, | ||
| 8 | - SpecValidationError, | ||
| 9 | - SwayError, | ||
| 10 | -) | ||
| 11 | -from dlm_sway.core.model import LoadedModel, Model, ModelSpec | ||
| 12 | -from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict | ||
| 13 | -from dlm_sway.core.scoring import ( | ||
| 14 | - DifferentialBackend, | ||
| 15 | - NullCalibratedBackend, | ||
| 16 | - RollingLogprob, | ||
| 17 | - ScalableDifferentialBackend, | ||
| 18 | - ScoringBackend, | ||
| 19 | - TokenDist, | ||
| 20 | -) | ||
| 21 | - | ||
| 22 | -__all__ = [ | ||
| 23 | - "BackendNotAvailableError", | ||
| 24 | - "DifferentialBackend", | ||
| 25 | - "LoadedModel", | ||
| 26 | - "Model", | ||
| 27 | - "ModelSpec", | ||
| 28 | - "NullCalibratedBackend", | ||
| 29 | - "ProbeError", | ||
| 30 | - "ProbeResult", | ||
| 31 | - "RollingLogprob", | ||
| 32 | - "ScalableDifferentialBackend", | ||
| 33 | - "ScoringBackend", | ||
| 34 | - "SpecValidationError", | ||
| 35 | - "SuiteResult", | ||
| 36 | - "SwayError", | ||
| 37 | - "SwayScore", | ||
| 38 | - "TokenDist", | ||
| 39 | - "Verdict", | ||
| 40 | -] | ||
| 41 | - | ||
| 42 | -__version__ = "0.1.0.dev0" | ||
sway/src/dlm_sway/backends/__init__.pydeleted@@ -1,118 +0,0 @@ | |||
| 1 | -"""Scoring backends: HuggingFace (``hf``), MLX (``mlx``), dummy, custom. | ||
| 2 | - | ||
| 3 | -Backends are constructed from a :class:`~dlm_sway.core.model.ModelSpec` | ||
| 4 | -via :func:`build`. Heavy backends (HF, MLX) import their framework only | ||
| 5 | -on construction so ``import dlm_sway`` stays cheap for users who only | ||
| 6 | -touch the dummy backend or the spec loader. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -from pathlib import Path | ||
| 12 | -from typing import TYPE_CHECKING | ||
| 13 | - | ||
| 14 | -from dlm_sway.core.errors import SpecValidationError | ||
| 15 | -from dlm_sway.core.model import ModelSpec | ||
| 16 | - | ||
| 17 | -if TYPE_CHECKING: | ||
| 18 | - from dlm_sway.core.scoring import DifferentialBackend | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -def build(base_spec: ModelSpec, *, adapter_path: Path | None = None) -> DifferentialBackend: | ||
| 22 | - """Materialize a differential backend from a model spec. | ||
| 23 | - | ||
| 24 | - The adapter path typically comes from ``ft.adapter`` in the spec — | ||
| 25 | - it's lifted to a keyword here so the same function can be used for | ||
| 26 | - "differential" (base + adapter on one loaded model) or future | ||
| 27 | - split-load paths. | ||
| 28 | - """ | ||
| 29 | - effective_adapter = adapter_path if adapter_path is not None else base_spec.adapter | ||
| 30 | - | ||
| 31 | - if base_spec.kind == "dummy": | ||
| 32 | - # Dummy backend isn't really about the spec — it's for tests | ||
| 33 | - # that pre-populate responses. Surface a loud error if someone | ||
| 34 | - # tries to build it through the normal path. | ||
| 35 | - raise SpecValidationError( | ||
| 36 | - "kind='dummy' backends must be constructed directly via " | ||
| 37 | - "DummyDifferentialBackend(base=..., ft=...); they cannot be " | ||
| 38 | - "materialized from a ModelSpec." | ||
| 39 | - ) | ||
| 40 | - | ||
| 41 | - if base_spec.kind == "hf": | ||
| 42 | - if effective_adapter is None: | ||
| 43 | - raise SpecValidationError( | ||
| 44 | - "hf backend requires an adapter path (set `adapter:` on the ft model)" | ||
| 45 | - ) | ||
| 46 | - from dlm_sway.backends.hf import HuggingFaceDifferentialBackend | ||
| 47 | - | ||
| 48 | - return HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter) | ||
| 49 | - | ||
| 50 | - if base_spec.kind == "mlx": | ||
| 51 | - if effective_adapter is None: | ||
| 52 | - raise SpecValidationError( | ||
| 53 | - "mlx backend requires an adapter path (set `adapter:` on the ft model; " | ||
| 54 | - "must be an MLX .npz adapter — use dlm's peft→mlx converter if needed)" | ||
| 55 | - ) | ||
| 56 | - from dlm_sway.backends.mlx import MLXDifferentialBackend | ||
| 57 | - | ||
| 58 | - return MLXDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter) | ||
| 59 | - | ||
| 60 | - if base_spec.kind == "custom": | ||
| 61 | - return _load_custom(base_spec, effective_adapter) | ||
| 62 | - | ||
| 63 | - raise SpecValidationError(f"unknown backend kind: {base_spec.kind!r}") | ||
| 64 | - | ||
| 65 | - | ||
| 66 | -def _load_custom(base_spec: ModelSpec, adapter: Path | None) -> DifferentialBackend: | ||
| 67 | - """Dispatch to a user-supplied backend via ``entry_point='pkg.mod:Name'``. | ||
| 68 | - | ||
| 69 | - The imported class is instantiated as ``Cls(base_spec=..., adapter_path=...)`` | ||
| 70 | - — the same signature as :class:`dlm_sway.backends.hf.HuggingFaceDifferentialBackend` | ||
| 71 | - so authors can model their implementation on the built-in. The | ||
| 72 | - result is runtime-checked against :class:`DifferentialBackend` so | ||
| 73 | - protocol violations fail at construction, not deep inside a probe. | ||
| 74 | - """ | ||
| 75 | - from dlm_sway.core.scoring import DifferentialBackend as DiffBackend | ||
| 76 | - | ||
| 77 | - entry = base_spec.entry_point | ||
| 78 | - if not entry: | ||
| 79 | - raise SpecValidationError( | ||
| 80 | - "kind='custom' requires an entry_point of the form 'pkg.module:ClassName'" | ||
| 81 | - ) | ||
| 82 | - if ":" not in entry: | ||
| 83 | - raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}") | ||
| 84 | - module_path, _, class_name = entry.partition(":") | ||
| 85 | - if not module_path or not class_name: | ||
| 86 | - raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}") | ||
| 87 | - | ||
| 88 | - import importlib | ||
| 89 | - | ||
| 90 | - try: | ||
| 91 | - module = importlib.import_module(module_path) | ||
| 92 | - except ImportError as exc: | ||
| 93 | - raise SpecValidationError( | ||
| 94 | - f"custom backend: cannot import module {module_path!r}: {exc}" | ||
| 95 | - ) from exc | ||
| 96 | - cls = getattr(module, class_name, None) | ||
| 97 | - if cls is None: | ||
| 98 | - raise SpecValidationError( | ||
| 99 | - f"custom backend: module {module_path!r} has no attribute {class_name!r}" | ||
| 100 | - ) | ||
| 101 | - | ||
| 102 | - try: | ||
| 103 | - instance = cls(base_spec=base_spec, adapter_path=adapter) | ||
| 104 | - except TypeError as exc: | ||
| 105 | - raise SpecValidationError( | ||
| 106 | - f"custom backend {entry!r} constructor signature mismatch: {exc}. " | ||
| 107 | - "Expected Cls(base_spec: ModelSpec, adapter_path: Path | None)" | ||
| 108 | - ) from exc | ||
| 109 | - | ||
| 110 | - if not isinstance(instance, DiffBackend): | ||
| 111 | - raise SpecValidationError( | ||
| 112 | - f"custom backend {entry!r} does not satisfy DifferentialBackend " | ||
| 113 | - "(needs as_base() and as_finetuned() context managers)" | ||
| 114 | - ) | ||
| 115 | - return instance | ||
| 116 | - | ||
| 117 | - | ||
| 118 | -__all__ = ["build"] | ||
sway/src/dlm_sway/backends/dummy.pydeleted@@ -1,257 +0,0 @@ | |||
| 1 | -"""In-memory backend for unit tests. | ||
| 2 | - | ||
| 3 | -Deterministic, torchless, and trivially fast. Tests pass canned responses | ||
| 4 | -and canned score tables keyed by ``(mode, prompt, completion)``. The same | ||
| 5 | -backend instance serves as both ``as_base`` and ``as_finetuned`` — it | ||
| 6 | -switches an internal mode flag. | ||
| 7 | - | ||
| 8 | -Use it to drive every probe's unit test without loading a real model. | ||
| 9 | -For integration tests against a real PEFT adapter, see | ||
| 10 | -:class:`~dlm_sway.backends.hf.HuggingFaceDifferentialBackend`. | ||
| 11 | -""" | ||
| 12 | - | ||
| 13 | -from __future__ import annotations | ||
| 14 | - | ||
| 15 | -import math | ||
| 16 | -from collections.abc import Iterator | ||
| 17 | -from contextlib import contextmanager | ||
| 18 | -from dataclasses import dataclass, field | ||
| 19 | -from typing import Literal | ||
| 20 | - | ||
| 21 | -import numpy as np | ||
| 22 | - | ||
| 23 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | ||
| 24 | - | ||
| 25 | -Mode = Literal["base", "ft"] | ||
| 26 | - | ||
| 27 | - | ||
| 28 | -@dataclass(slots=True) | ||
| 29 | -class DummyResponses: | ||
| 30 | - """Canned data for one mode (base or ft). | ||
| 31 | - | ||
| 32 | - Callers populate one of these per mode and hand both to | ||
| 33 | - :class:`DummyDifferentialBackend`. | ||
| 34 | - """ | ||
| 35 | - | ||
| 36 | - generations: dict[str, str] = field(default_factory=dict) | ||
| 37 | - """Prompt → canned completion. Lookup is exact-match.""" | ||
| 38 | - logprobs: dict[tuple[str, str], float] = field(default_factory=dict) | ||
| 39 | - """``(prompt, completion) → sum logprob``. Default ``-10.0`` if missing.""" | ||
| 40 | - rolling: dict[str, RollingLogprob] = field(default_factory=dict) | ||
| 41 | - """Text → canned :class:`RollingLogprob`.""" | ||
| 42 | - token_dists: dict[str, TokenDist] = field(default_factory=dict) | ||
| 43 | - """Prompt → canned :class:`TokenDist`.""" | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -class _DummyView: | ||
| 47 | - """The per-mode view yielded by ``as_base`` / ``as_finetuned``. | ||
| 48 | - | ||
| 49 | - Implements :class:`~dlm_sway.core.model.Model` *and* | ||
| 50 | - :class:`~dlm_sway.core.scoring.ScoringBackend` — i.e. the | ||
| 51 | - ``ScoringModel`` intersection. | ||
| 52 | - """ | ||
| 53 | - | ||
| 54 | - def __init__(self, mode: Mode, responses: DummyResponses) -> None: | ||
| 55 | - self.id = mode | ||
| 56 | - self._mode: Mode = mode | ||
| 57 | - self._r = responses | ||
| 58 | - | ||
| 59 | - # -- Model --------------------------------------------------------- | ||
| 60 | - def generate( | ||
| 61 | - self, | ||
| 62 | - prompt: str, | ||
| 63 | - *, | ||
| 64 | - max_new_tokens: int, | ||
| 65 | - temperature: float = 0.0, | ||
| 66 | - top_p: float = 1.0, | ||
| 67 | - seed: int = 0, | ||
| 68 | - ) -> str: | ||
| 69 | - del max_new_tokens, temperature, top_p, seed # canned; decoding is trivial. | ||
| 70 | - try: | ||
| 71 | - return self._r.generations[prompt] | ||
| 72 | - except KeyError as exc: | ||
| 73 | - raise KeyError( | ||
| 74 | - f"dummy backend ({self._mode}): no canned generation for prompt {prompt!r}" | ||
| 75 | - ) from exc | ||
| 76 | - | ||
| 77 | - def close(self) -> None: | ||
| 78 | - return None | ||
| 79 | - | ||
| 80 | - # -- ScoringBackend ------------------------------------------------ | ||
| 81 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 82 | - return self._r.logprobs.get((prompt, completion), -10.0) | ||
| 83 | - | ||
| 84 | - def rolling_logprob(self, text: str) -> RollingLogprob: | ||
| 85 | - if text in self._r.rolling: | ||
| 86 | - return self._r.rolling[text] | ||
| 87 | - # Synthesize a plausible rolling logprob so probes that just | ||
| 88 | - # want a non-trivial value work without per-text configuration. | ||
| 89 | - tokens = text.split() | ||
| 90 | - n = max(len(tokens), 1) | ||
| 91 | - per_tok = -2.0 if self._mode == "base" else -1.5 | ||
| 92 | - return RollingLogprob( | ||
| 93 | - token_ids=np.arange(n, dtype=np.int64), | ||
| 94 | - logprobs=np.full(max(n - 1, 0), per_tok, dtype=np.float32), | ||
| 95 | - num_tokens=n, | ||
| 96 | - total_logprob=per_tok * max(n - 1, 0), | ||
| 97 | - ) | ||
| 98 | - | ||
| 99 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 100 | - del top_k | ||
| 101 | - if prompt in self._r.token_dists: | ||
| 102 | - return self._r.token_dists[prompt] | ||
| 103 | - # Synthesize a sharp base / broad ft distribution so divergence | ||
| 104 | - # probes see a non-zero signal without hand-rolled data. | ||
| 105 | - vocab = 1000 | ||
| 106 | - k = 8 | ||
| 107 | - if self._mode == "base": | ||
| 108 | - lp = np.array([-0.1] + [-5.0] * (k - 1), dtype=np.float32) | ||
| 109 | - else: | ||
| 110 | - # More uniform mass across the top-k tokens. | ||
| 111 | - lp = np.full(k, -math.log(k), dtype=np.float32) | ||
| 112 | - return TokenDist( | ||
| 113 | - token_ids=np.arange(k, dtype=np.int64), | ||
| 114 | - logprobs=lp, | ||
| 115 | - vocab_size=vocab, | ||
| 116 | - tail_logprob=math.log1p(-float(np.exp(lp).sum())) if np.exp(lp).sum() < 1 else 0.0, | ||
| 117 | - ) | ||
| 118 | - | ||
| 119 | - | ||
| 120 | -class _NullView(_DummyView): | ||
| 121 | - """A dummy view that perturbs the base distribution with seeded noise. | ||
| 122 | - | ||
| 123 | - Used by :meth:`DummyDifferentialBackend.as_null_adapter`. The | ||
| 124 | - perturbation is small (matches an ``init_scale=0.02`` adapter) so | ||
| 125 | - the null-vs-base divergence stays well below real-adapter territory | ||
| 126 | - in probe tests. | ||
| 127 | - """ | ||
| 128 | - | ||
| 129 | - def __init__(self, base_responses: DummyResponses, seed: int, init_scale: float) -> None: | ||
| 130 | - super().__init__("base", base_responses) | ||
| 131 | - self._seed = seed | ||
| 132 | - self._init_scale = init_scale | ||
| 133 | - | ||
| 134 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 135 | - base_dist = super().next_token_dist(prompt, top_k=top_k) | ||
| 136 | - rng = np.random.default_rng(self._seed + hash(prompt) % 1_000_003) | ||
| 137 | - noise = rng.normal(0.0, self._init_scale, size=base_dist.logprobs.shape).astype(np.float32) | ||
| 138 | - new_lp = base_dist.logprobs + noise | ||
| 139 | - # Re-normalize (within the top-k slice) so a valid distribution comes back. | ||
| 140 | - max_lp = new_lp.max() | ||
| 141 | - new_probs = np.exp(new_lp - max_lp) | ||
| 142 | - new_probs /= new_probs.sum() | ||
| 143 | - return TokenDist( | ||
| 144 | - token_ids=base_dist.token_ids, | ||
| 145 | - logprobs=np.log(new_probs).astype(np.float32), | ||
| 146 | - vocab_size=base_dist.vocab_size, | ||
| 147 | - tail_logprob=base_dist.tail_logprob, | ||
| 148 | - ) | ||
| 149 | - | ||
| 150 | - | ||
| 151 | -class _InterpolatedView(_DummyView): | ||
| 152 | - """A dummy view where logits/dists are a lam-blend of base and ft. | ||
| 153 | - | ||
| 154 | - Used by :meth:`DummyDifferentialBackend.as_scaled_adapter`. | ||
| 155 | - Generation falls back to the ft view at lam>=0.5, base otherwise — | ||
| 156 | - rounded because the dummy backend's generations are canned strings | ||
| 157 | - with no notion of "how much". | ||
| 158 | - """ | ||
| 159 | - | ||
| 160 | - def __init__( | ||
| 161 | - self, | ||
| 162 | - base_responses: DummyResponses, | ||
| 163 | - ft_responses: DummyResponses, | ||
| 164 | - lam: float, | ||
| 165 | - ) -> None: | ||
| 166 | - super().__init__( | ||
| 167 | - "ft" if lam >= 0.5 else "base", ft_responses if lam >= 0.5 else base_responses | ||
| 168 | - ) | ||
| 169 | - self._base_r = base_responses | ||
| 170 | - self._ft_r = ft_responses | ||
| 171 | - self._lam = lam | ||
| 172 | - | ||
| 173 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 174 | - base_v = self._base_r.logprobs.get((prompt, completion), -10.0) | ||
| 175 | - ft_v = self._ft_r.logprobs.get((prompt, completion), -10.0) | ||
| 176 | - return (1 - self._lam) * base_v + self._lam * ft_v | ||
| 177 | - | ||
| 178 | - def next_token_dist(self, prompt: str, *, top_k: int = 256): # type: ignore[no-untyped-def] | ||
| 179 | - base_dist = _DummyView("base", self._base_r).next_token_dist(prompt, top_k=top_k) | ||
| 180 | - ft_dist = _DummyView("ft", self._ft_r).next_token_dist(prompt, top_k=top_k) | ||
| 181 | - # Both dists are on the same synthetic support when unseeded; blend | ||
| 182 | - # their logprobs via log-space linear interpolation, which is a | ||
| 183 | - # log-linear "tempered" mix and keeps normalization close enough. | ||
| 184 | - lam = self._lam | ||
| 185 | - blended_lp = (1 - lam) * base_dist.logprobs + lam * ft_dist.logprobs | ||
| 186 | - return type(base_dist)( | ||
| 187 | - token_ids=base_dist.token_ids, | ||
| 188 | - logprobs=blended_lp, | ||
| 189 | - vocab_size=base_dist.vocab_size, | ||
| 190 | - tail_logprob=base_dist.tail_logprob, | ||
| 191 | - ) | ||
| 192 | - | ||
| 193 | - | ||
| 194 | -class DummyDifferentialBackend: | ||
| 195 | - """Dummy implementation of | ||
| 196 | - :class:`~dlm_sway.core.scoring.DifferentialBackend`. | ||
| 197 | - | ||
| 198 | - Construction takes one :class:`DummyResponses` per mode. The two | ||
| 199 | - modes are mutually exclusive — the backend enforces that callers | ||
| 200 | - exit one view before entering the other, catching bugs in probes | ||
| 201 | - that hold a stale view across a toggle. | ||
| 202 | - | ||
| 203 | - Also implements | ||
| 204 | - :class:`~dlm_sway.core.scoring.ScalableDifferentialBackend` with a | ||
| 205 | - linear-blend between base and ft responses, so probes that need | ||
| 206 | - ``as_scaled_adapter`` (N2 AdapterAblation) are unit-testable. | ||
| 207 | - """ | ||
| 208 | - | ||
| 209 | - def __init__(self, *, base: DummyResponses, ft: DummyResponses) -> None: | ||
| 210 | - self._base_r = base | ||
| 211 | - self._ft_r = ft | ||
| 212 | - self._base = _DummyView("base", base) | ||
| 213 | - self._ft = _DummyView("ft", ft) | ||
| 214 | - self._active: str | None = None | ||
| 215 | - | ||
| 216 | - @contextmanager | ||
| 217 | - def as_base(self) -> Iterator[_DummyView]: | ||
| 218 | - self._enter("base") | ||
| 219 | - try: | ||
| 220 | - yield self._base | ||
| 221 | - finally: | ||
| 222 | - self._exit() | ||
| 223 | - | ||
| 224 | - @contextmanager | ||
| 225 | - def as_finetuned(self) -> Iterator[_DummyView]: | ||
| 226 | - self._enter("ft") | ||
| 227 | - try: | ||
| 228 | - yield self._ft | ||
| 229 | - finally: | ||
| 230 | - self._exit() | ||
| 231 | - | ||
| 232 | - @contextmanager | ||
| 233 | - def as_scaled_adapter(self, lam: float) -> Iterator[_DummyView]: | ||
| 234 | - self._enter(f"scaled({lam})") | ||
| 235 | - try: | ||
| 236 | - yield _InterpolatedView(self._base_r, self._ft_r, lam) | ||
| 237 | - finally: | ||
| 238 | - self._exit() | ||
| 239 | - | ||
| 240 | - @contextmanager | ||
| 241 | - def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_DummyView]: | ||
| 242 | - self._enter(f"null({seed})") | ||
| 243 | - try: | ||
| 244 | - yield _NullView(self._base_r, seed=seed, init_scale=init_scale) | ||
| 245 | - finally: | ||
| 246 | - self._exit() | ||
| 247 | - | ||
| 248 | - def _enter(self, mode: str) -> None: | ||
| 249 | - if self._active is not None: | ||
| 250 | - raise RuntimeError( | ||
| 251 | - f"DifferentialBackend view already active ({self._active!r}); " | ||
| 252 | - f"exit the current view before entering {mode!r}." | ||
| 253 | - ) | ||
| 254 | - self._active = mode | ||
| 255 | - | ||
| 256 | - def _exit(self) -> None: | ||
| 257 | - self._active = None | ||
sway/src/dlm_sway/backends/hf.pydeleted@@ -1,375 +0,0 @@ | |||
| 1 | -"""HuggingFace + PEFT differential backend. | ||
| 2 | - | ||
| 3 | -Loads the base once, attaches the LoRA adapter once, and toggles between | ||
| 4 | -"base" and "fine-tuned" views on the same module via PEFT's | ||
| 5 | -:meth:`~peft.PeftModel.disable_adapter` / :meth:`~peft.PeftModel.set_adapter`. | ||
| 6 | - | ||
| 7 | -This is the single most important backend in sway. Every numeric probe | ||
| 8 | -benefits from the shared-weights toggle — memory is halved compared to | ||
| 9 | -loading two copies, and KV-cache layouts stay aligned so pairwise KL math | ||
| 10 | -is straight-forward. | ||
| 11 | - | ||
| 12 | -Heavy imports (``torch``, ``transformers``, ``peft``) are deferred until | ||
| 13 | -``HuggingFaceDifferentialBackend`` is actually instantiated so | ||
| 14 | -``import dlm_sway`` stays light for users of the dummy backend or spec | ||
| 15 | -validation. | ||
| 16 | -""" | ||
| 17 | - | ||
| 18 | -from __future__ import annotations | ||
| 19 | - | ||
| 20 | -from collections.abc import Iterator | ||
| 21 | -from contextlib import contextmanager | ||
| 22 | -from dataclasses import dataclass | ||
| 23 | -from pathlib import Path | ||
| 24 | -from typing import TYPE_CHECKING, Any, Literal | ||
| 25 | - | ||
| 26 | -import numpy as np | ||
| 27 | - | ||
| 28 | -from dlm_sway.core.errors import BackendNotAvailableError, ProbeError | ||
| 29 | -from dlm_sway.core.model import ModelSpec | ||
| 30 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | ||
| 31 | - | ||
| 32 | -if TYPE_CHECKING: | ||
| 33 | - from transformers import PreTrainedModel, PreTrainedTokenizerBase | ||
| 34 | - | ||
| 35 | - | ||
| 36 | -Device = Literal["cuda", "mps", "cpu"] | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def _detect_device() -> Device: | ||
| 40 | - try: | ||
| 41 | - import torch | ||
| 42 | - except ImportError as exc: | ||
| 43 | - raise BackendNotAvailableError("hf", extra="hf") from exc | ||
| 44 | - if torch.cuda.is_available(): | ||
| 45 | - return "cuda" | ||
| 46 | - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): | ||
| 47 | - return "mps" | ||
| 48 | - return "cpu" | ||
| 49 | - | ||
| 50 | - | ||
| 51 | -def _resolve_dtype(requested: str, device: Device) -> Any: | ||
| 52 | - """Map the user's ``dtype`` preference to a torch dtype.""" | ||
| 53 | - import torch # noqa: PLC0415 — lazy | ||
| 54 | - | ||
| 55 | - if requested == "fp16": | ||
| 56 | - return torch.float16 | ||
| 57 | - if requested == "bf16": | ||
| 58 | - return torch.bfloat16 | ||
| 59 | - if requested == "fp32": | ||
| 60 | - return torch.float32 | ||
| 61 | - # auto: bf16 on CUDA (Ampere+) / MPS; fp32 on CPU for numerical stability. | ||
| 62 | - if device == "cuda" and torch.cuda.is_bf16_supported(): | ||
| 63 | - return torch.bfloat16 | ||
| 64 | - if device == "mps": | ||
| 65 | - return torch.float16 | ||
| 66 | - return torch.float32 | ||
| 67 | - | ||
| 68 | - | ||
| 69 | -def _require_hf() -> tuple[Any, Any, Any]: | ||
| 70 | - """Import torch + transformers + peft, raising a friendly error if missing.""" | ||
| 71 | - try: | ||
| 72 | - import torch | ||
| 73 | - import transformers | ||
| 74 | - except ImportError as exc: | ||
| 75 | - raise BackendNotAvailableError("hf", extra="hf") from exc | ||
| 76 | - try: | ||
| 77 | - import peft | ||
| 78 | - except ImportError as exc: | ||
| 79 | - raise BackendNotAvailableError( | ||
| 80 | - "hf", extra="hf", hint="peft is required for the adapter toggle." | ||
| 81 | - ) from exc | ||
| 82 | - return torch, transformers, peft | ||
| 83 | - | ||
| 84 | - | ||
| 85 | -# --- the view object ------------------------------------------------------ | ||
| 86 | - | ||
| 87 | - | ||
| 88 | -@dataclass(slots=True) | ||
| 89 | -class _HFView: | ||
| 90 | - """One side (base or ft) of a :class:`HuggingFaceDifferentialBackend`. | ||
| 91 | - | ||
| 92 | - Both sides reuse the same underlying module; the difference is | ||
| 93 | - whether the adapter is active. | ||
| 94 | - """ | ||
| 95 | - | ||
| 96 | - id: str | ||
| 97 | - _model: Any | ||
| 98 | - _tokenizer: Any | ||
| 99 | - _device: str | ||
| 100 | - _pad_token_id: int | ||
| 101 | - | ||
| 102 | - # -- Model --------------------------------------------------------- | ||
| 103 | - def generate( | ||
| 104 | - self, | ||
| 105 | - prompt: str, | ||
| 106 | - *, | ||
| 107 | - max_new_tokens: int, | ||
| 108 | - temperature: float = 0.0, | ||
| 109 | - top_p: float = 1.0, | ||
| 110 | - seed: int = 0, | ||
| 111 | - ) -> str: | ||
| 112 | - import torch | ||
| 113 | - | ||
| 114 | - torch.manual_seed(seed) | ||
| 115 | - inputs = self._tokenizer(prompt, return_tensors="pt").to(self._device) | ||
| 116 | - do_sample = temperature > 0.0 | ||
| 117 | - gen_kwargs: dict[str, Any] = { | ||
| 118 | - "max_new_tokens": max_new_tokens, | ||
| 119 | - "do_sample": do_sample, | ||
| 120 | - "pad_token_id": self._pad_token_id, | ||
| 121 | - } | ||
| 122 | - if do_sample: | ||
| 123 | - gen_kwargs["temperature"] = temperature | ||
| 124 | - gen_kwargs["top_p"] = top_p | ||
| 125 | - with torch.inference_mode(): | ||
| 126 | - out_ids = self._model.generate(**inputs, **gen_kwargs) | ||
| 127 | - new_tokens = out_ids[0, inputs["input_ids"].shape[1] :] | ||
| 128 | - return str(self._tokenizer.decode(new_tokens, skip_special_tokens=True)) | ||
| 129 | - | ||
| 130 | - def close(self) -> None: | ||
| 131 | - return None | ||
| 132 | - | ||
| 133 | - # -- ScoringBackend ------------------------------------------------ | ||
| 134 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 135 | - import torch | ||
| 136 | - import torch.nn.functional as F | ||
| 137 | - | ||
| 138 | - prompt_ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device) | ||
| 139 | - full_ids = self._tokenizer(prompt + completion, return_tensors="pt").input_ids.to( | ||
| 140 | - self._device | ||
| 141 | - ) | ||
| 142 | - if full_ids.shape[1] <= prompt_ids.shape[1]: | ||
| 143 | - raise ProbeError( | ||
| 144 | - "logprob_of", | ||
| 145 | - f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})", | ||
| 146 | - ) | ||
| 147 | - target_ids = full_ids[:, prompt_ids.shape[1] :] | ||
| 148 | - with torch.inference_mode(): | ||
| 149 | - logits = self._model(full_ids).logits # (1, T, V) | ||
| 150 | - # Align: logit at position t predicts token at t+1. We want | ||
| 151 | - # predictions for the completion slice. | ||
| 152 | - shift_logits = logits[:, prompt_ids.shape[1] - 1 : -1, :] # (1, C, V) | ||
| 153 | - log_probs = F.log_softmax(shift_logits.float(), dim=-1) | ||
| 154 | - gathered = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1) | ||
| 155 | - return float(gathered.sum().item()) | ||
| 156 | - | ||
| 157 | - def rolling_logprob(self, text: str) -> RollingLogprob: | ||
| 158 | - import torch | ||
| 159 | - import torch.nn.functional as F | ||
| 160 | - | ||
| 161 | - ids = self._tokenizer(text, return_tensors="pt").input_ids.to(self._device) | ||
| 162 | - if ids.shape[1] < 2: | ||
| 163 | - return RollingLogprob( | ||
| 164 | - token_ids=ids[0].cpu().numpy().astype(np.int64), | ||
| 165 | - logprobs=np.array([], dtype=np.float32), | ||
| 166 | - num_tokens=int(ids.shape[1]), | ||
| 167 | - total_logprob=0.0, | ||
| 168 | - ) | ||
| 169 | - with torch.inference_mode(): | ||
| 170 | - logits = self._model(ids).logits # (1, T, V) | ||
| 171 | - log_probs = F.log_softmax(logits[:, :-1].float(), dim=-1) # predicts tokens 1..T | ||
| 172 | - gathered = log_probs.gather(-1, ids[:, 1:].unsqueeze(-1)).squeeze(-1).squeeze(0) | ||
| 173 | - return RollingLogprob( | ||
| 174 | - token_ids=ids[0].cpu().numpy().astype(np.int64), | ||
| 175 | - logprobs=gathered.cpu().numpy().astype(np.float32), | ||
| 176 | - num_tokens=int(ids.shape[1]), | ||
| 177 | - total_logprob=float(gathered.sum().item()), | ||
| 178 | - ) | ||
| 179 | - | ||
| 180 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 181 | - import torch | ||
| 182 | - import torch.nn.functional as F | ||
| 183 | - | ||
| 184 | - ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device) | ||
| 185 | - with torch.inference_mode(): | ||
| 186 | - logits = self._model(ids).logits[:, -1, :] # (1, V) | ||
| 187 | - log_probs = F.log_softmax(logits.float(), dim=-1).squeeze(0) | ||
| 188 | - k = min(top_k, int(log_probs.shape[0])) | ||
| 189 | - top = torch.topk(log_probs, k=k) | ||
| 190 | - tail_mass = float(1.0 - torch.exp(top.values).sum().item()) | ||
| 191 | - tail_logprob = float(np.log(max(tail_mass, 1e-12))) if tail_mass > 1e-12 else 0.0 | ||
| 192 | - return TokenDist( | ||
| 193 | - token_ids=top.indices.cpu().numpy().astype(np.int64), | ||
| 194 | - logprobs=top.values.cpu().numpy().astype(np.float32), | ||
| 195 | - vocab_size=int(log_probs.shape[0]), | ||
| 196 | - tail_logprob=tail_logprob, | ||
| 197 | - ) | ||
| 198 | - | ||
| 199 | - | ||
| 200 | -# --- the backend ----------------------------------------------------------- | ||
| 201 | - | ||
| 202 | - | ||
| 203 | -class HuggingFaceDifferentialBackend: | ||
| 204 | - """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for HF+PEFT. | ||
| 205 | - | ||
| 206 | - The adapter toggle relies on | ||
| 207 | - :meth:`peft.PeftModel.disable_adapter` producing a context where the | ||
| 208 | - forward pass skips the LoRA deltas, and | ||
| 209 | - :meth:`peft.PeftModel.set_adapter` (or just exiting the disable | ||
| 210 | - context) re-enabling them. A dedicated sanity test asserts that | ||
| 211 | - these actually change logits on a fixture. | ||
| 212 | - """ | ||
| 213 | - | ||
| 214 | - def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None: | ||
| 215 | - torch, transformers, peft = _require_hf() | ||
| 216 | - self._torch = torch | ||
| 217 | - self._spec = base_spec | ||
| 218 | - self._adapter_path = Path(adapter_path).expanduser().resolve() | ||
| 219 | - | ||
| 220 | - device_str: Device = ( | ||
| 221 | - _detect_device() if base_spec.device == "auto" else base_spec.device # type: ignore[assignment] | ||
| 222 | - ) | ||
| 223 | - self._device: str = device_str | ||
| 224 | - dtype = _resolve_dtype(base_spec.dtype, device_str) | ||
| 225 | - | ||
| 226 | - tokenizer = transformers.AutoTokenizer.from_pretrained( | ||
| 227 | - str(self._adapter_path) | ||
| 228 | - if (self._adapter_path / "tokenizer_config.json").exists() | ||
| 229 | - else base_spec.base, | ||
| 230 | - trust_remote_code=base_spec.trust_remote_code, | ||
| 231 | - ) | ||
| 232 | - if tokenizer.pad_token_id is None: | ||
| 233 | - tokenizer.pad_token = tokenizer.eos_token | ||
| 234 | - | ||
| 235 | - base_model = transformers.AutoModelForCausalLM.from_pretrained( | ||
| 236 | - base_spec.base, | ||
| 237 | - torch_dtype=dtype, | ||
| 238 | - trust_remote_code=base_spec.trust_remote_code, | ||
| 239 | - ) | ||
| 240 | - base_model.to(self._device) | ||
| 241 | - peft_model = peft.PeftModel.from_pretrained( | ||
| 242 | - base_model, | ||
| 243 | - str(self._adapter_path), | ||
| 244 | - is_trainable=False, | ||
| 245 | - ) | ||
| 246 | - peft_model.eval() | ||
| 247 | - | ||
| 248 | - self._tokenizer: PreTrainedTokenizerBase = tokenizer | ||
| 249 | - self._peft_model: PreTrainedModel = peft_model | ||
| 250 | - self._pad_token_id: int = int(tokenizer.pad_token_id) | ||
| 251 | - self._active: str | None = None | ||
| 252 | - | ||
| 253 | - # -- DifferentialBackend ------------------------------------------- | ||
| 254 | - | ||
| 255 | - @contextmanager | ||
| 256 | - def as_base(self) -> Iterator[_HFView]: | ||
| 257 | - self._enter("base") | ||
| 258 | - try: | ||
| 259 | - # peft.PeftModel.disable_adapter is a context manager; mypy | ||
| 260 | - # mis-reads it as a Tensor on this transformers version. | ||
| 261 | - with self._peft_model.disable_adapter(): # type: ignore[operator] | ||
| 262 | - yield self._make_view("base") | ||
| 263 | - finally: | ||
| 264 | - self._exit() | ||
| 265 | - | ||
| 266 | - @contextmanager | ||
| 267 | - def as_finetuned(self) -> Iterator[_HFView]: | ||
| 268 | - self._enter("ft") | ||
| 269 | - try: | ||
| 270 | - yield self._make_view("ft") | ||
| 271 | - finally: | ||
| 272 | - self._exit() | ||
| 273 | - | ||
| 274 | - @contextmanager | ||
| 275 | - def as_scaled_adapter(self, lam: float) -> Iterator[_HFView]: | ||
| 276 | - """Temporarily multiply every LoRA layer's scaling factor by ``lam``. | ||
| 277 | - | ||
| 278 | - Works by walking the PEFT module tree and mutating each | ||
| 279 | - ``LoraLayer.scaling[adapter_name]`` in place. The original | ||
| 280 | - scalings are restored when the context exits — or when an | ||
| 281 | - exception propagates, to keep the model in a sane state. | ||
| 282 | - """ | ||
| 283 | - self._enter(f"scaled({lam})") | ||
| 284 | - # ``module`` is dynamic (peft LoraLayer subclass) — Any avoids | ||
| 285 | - # mypy treating its ``.scaling`` as a Tensor when peft is loaded. | ||
| 286 | - saved: list[tuple[Any, str, float]] = [] | ||
| 287 | - try: | ||
| 288 | - import peft # noqa: PLC0415 — already a hard dep of this backend | ||
| 289 | - | ||
| 290 | - lora_cls = getattr(peft.tuners.lora, "LoraLayer", None) | ||
| 291 | - if lora_cls is None: | ||
| 292 | - raise RuntimeError("peft.tuners.lora.LoraLayer not found; check peft>=0.13 pin") | ||
| 293 | - for module in self._peft_model.modules(): | ||
| 294 | - if not isinstance(module, lora_cls): | ||
| 295 | - continue | ||
| 296 | - scaling = getattr(module, "scaling", None) | ||
| 297 | - if not isinstance(scaling, dict): | ||
| 298 | - continue | ||
| 299 | - for key, original in scaling.items(): | ||
| 300 | - saved.append((module, key, float(original))) | ||
| 301 | - scaling[key] = float(original) * lam | ||
| 302 | - yield self._make_view(f"scaled_{lam:.2f}") | ||
| 303 | - finally: | ||
| 304 | - for module, key, original in saved: | ||
| 305 | - module.scaling[key] = original | ||
| 306 | - self._exit() | ||
| 307 | - | ||
| 308 | - @contextmanager | ||
| 309 | - def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_HFView]: | ||
| 310 | - """Temporarily replace every LoRA ``A``/``B`` tensor with random noise. | ||
| 311 | - | ||
| 312 | - Same rank, alpha, and target modules as the real adapter — only | ||
| 313 | - the weights differ. This is the denominator in every z-score | ||
| 314 | - path: "how much signal does structural noise produce?" | ||
| 315 | - | ||
| 316 | - Implementation walks the PEFT module tree for ``lora_A``/``lora_B`` | ||
| 317 | - parameters, saves a clone of each current value, overwrites in | ||
| 318 | - place with a zero-mean Gaussian at ``init_scale``, and restores | ||
| 319 | - on exit (including on exception). | ||
| 320 | - """ | ||
| 321 | - import torch | ||
| 322 | - | ||
| 323 | - self._enter(f"null({seed})") | ||
| 324 | - gen = torch.Generator(device="cpu").manual_seed(int(seed)) | ||
| 325 | - saved: list[tuple[torch.nn.Parameter, torch.Tensor]] = [] | ||
| 326 | - try: | ||
| 327 | - for pname, param in self._peft_model.named_parameters(): | ||
| 328 | - if not any(key in pname for key in ("lora_A", "lora_B")): | ||
| 329 | - continue | ||
| 330 | - saved.append((param, param.detach().clone())) | ||
| 331 | - with torch.no_grad(): | ||
| 332 | - noise = torch.randn( | ||
| 333 | - *param.shape, | ||
| 334 | - generator=gen, | ||
| 335 | - dtype=torch.float32, | ||
| 336 | - ).to(dtype=param.dtype, device=param.device) | ||
| 337 | - param.copy_(noise * init_scale) | ||
| 338 | - yield self._make_view(f"null_{seed}") | ||
| 339 | - finally: | ||
| 340 | - with torch.no_grad(): | ||
| 341 | - for param, original in saved: | ||
| 342 | - param.copy_(original) | ||
| 343 | - self._exit() | ||
| 344 | - | ||
| 345 | - def close(self) -> None: | ||
| 346 | - """Release GPU memory. Safe to call more than once.""" | ||
| 347 | - if getattr(self, "_peft_model", None) is not None: | ||
| 348 | - del self._peft_model | ||
| 349 | - if self._torch.cuda.is_available(): | ||
| 350 | - self._torch.cuda.empty_cache() | ||
| 351 | - | ||
| 352 | - # -- internals ----------------------------------------------------- | ||
| 353 | - | ||
| 354 | - def _make_view(self, mode: str) -> _HFView: | ||
| 355 | - return _HFView( | ||
| 356 | - id=mode, | ||
| 357 | - _model=self._peft_model, | ||
| 358 | - _tokenizer=self._tokenizer, | ||
| 359 | - _device=self._device, | ||
| 360 | - _pad_token_id=self._pad_token_id, | ||
| 361 | - ) | ||
| 362 | - | ||
| 363 | - def _enter(self, mode: str) -> None: | ||
| 364 | - if self._active is not None: | ||
| 365 | - raise RuntimeError( | ||
| 366 | - f"HuggingFaceDifferentialBackend view {self._active!r} already active; " | ||
| 367 | - f"exit it before entering {mode!r}." | ||
| 368 | - ) | ||
| 369 | - self._active = mode | ||
| 370 | - | ||
| 371 | - def _exit(self) -> None: | ||
| 372 | - self._active = None | ||
| 373 | - | ||
| 374 | - | ||
| 375 | -__all__ = ["HuggingFaceDifferentialBackend"] | ||
sway/src/dlm_sway/backends/mlx.pydeleted@@ -1,205 +0,0 @@ | |||
| 1 | -"""MLX backend for Apple Silicon (darwin-arm64). | ||
| 2 | - | ||
| 3 | -Partial implementation covering the common case: a PEFT adapter that's | ||
| 4 | -already been converted to MLX's ``.npz`` format. Unlike the HF backend, | ||
| 5 | -MLX has no runtime ``disable_adapter`` context — adapters get fused into | ||
| 6 | -the linear layers at load time — so this backend keeps **both** a base | ||
| 7 | -model and an adapted model in memory. Fine for the small (<3B) models | ||
| 8 | -MLX is typically used with on Apple Silicon; document the cost clearly. | ||
| 9 | - | ||
| 10 | -If users point this backend at raw PEFT safetensors, ``mlx_lm.load`` | ||
| 11 | -will refuse them with its own error. A future milestone can wire a | ||
| 12 | -PEFT-→-MLX converter; for now the contract is "bring your own .npz". | ||
| 13 | -""" | ||
| 14 | - | ||
| 15 | -from __future__ import annotations | ||
| 16 | - | ||
| 17 | -from collections.abc import Iterator | ||
| 18 | -from contextlib import contextmanager | ||
| 19 | -from dataclasses import dataclass | ||
| 20 | -from pathlib import Path | ||
| 21 | -from typing import TYPE_CHECKING, Any | ||
| 22 | - | ||
| 23 | -import numpy as np | ||
| 24 | - | ||
| 25 | -from dlm_sway.core.errors import BackendNotAvailableError, ProbeError | ||
| 26 | -from dlm_sway.core.model import ModelSpec | ||
| 27 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | ||
| 28 | - | ||
| 29 | -if TYPE_CHECKING: | ||
| 30 | - pass | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -def _require_mlx() -> tuple[Any, Any]: | ||
| 34 | - try: | ||
| 35 | - import mlx.core as mx | ||
| 36 | - import mlx_lm | ||
| 37 | - except ImportError as exc: | ||
| 38 | - raise BackendNotAvailableError( | ||
| 39 | - "mlx", | ||
| 40 | - extra="mlx", | ||
| 41 | - hint="MLX backend needs mlx + mlx-lm on darwin-arm64.", | ||
| 42 | - ) from exc | ||
| 43 | - return mx, mlx_lm | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -@dataclass(slots=True) | ||
| 47 | -class _MLXView: | ||
| 48 | - """One side (base or ft) of the MLX backend. | ||
| 49 | - | ||
| 50 | - Both sides carry the same tokenizer (MLX stores it alongside the | ||
| 51 | - converted model files, so sharing avoids double-loading). | ||
| 52 | - """ | ||
| 53 | - | ||
| 54 | - id: str | ||
| 55 | - _model: Any | ||
| 56 | - _tokenizer: Any | ||
| 57 | - | ||
| 58 | - def generate( | ||
| 59 | - self, | ||
| 60 | - prompt: str, | ||
| 61 | - *, | ||
| 62 | - max_new_tokens: int, | ||
| 63 | - temperature: float = 0.0, | ||
| 64 | - top_p: float = 1.0, | ||
| 65 | - seed: int = 0, | ||
| 66 | - ) -> str: | ||
| 67 | - del seed # mlx_lm.generate seeds via its own global state | ||
| 68 | - _, mlx_lm = _require_mlx() | ||
| 69 | - kwargs: dict[str, Any] = {"max_tokens": max_new_tokens, "verbose": False} | ||
| 70 | - if temperature > 0.0: | ||
| 71 | - kwargs["temp"] = temperature | ||
| 72 | - kwargs["top_p"] = top_p | ||
| 73 | - out = mlx_lm.generate(self._model, self._tokenizer, prompt=prompt, **kwargs) | ||
| 74 | - return str(out) | ||
| 75 | - | ||
| 76 | - def close(self) -> None: | ||
| 77 | - return None | ||
| 78 | - | ||
| 79 | - # -- ScoringBackend ------------------------------------------------ | ||
| 80 | - | ||
| 81 | - def _forward_logits(self, prompt: str) -> np.ndarray: | ||
| 82 | - """Run the model once and return ``(seq_len, vocab)`` logits.""" | ||
| 83 | - mx, _ = _require_mlx() | ||
| 84 | - input_ids = self._tokenizer.encode(prompt) | ||
| 85 | - tokens = mx.array(input_ids)[None, :] # (1, T) | ||
| 86 | - out = self._model(tokens) | ||
| 87 | - # mlx_lm models return an mx.array; convert to numpy for downstream math. | ||
| 88 | - return np.asarray(out[0]) | ||
| 89 | - | ||
| 90 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 91 | - input_ids = self._tokenizer.encode(prompt) | ||
| 92 | - full_ids = self._tokenizer.encode(prompt + completion) | ||
| 93 | - if len(full_ids) <= len(input_ids): | ||
| 94 | - raise ProbeError( | ||
| 95 | - "logprob_of", | ||
| 96 | - f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})", | ||
| 97 | - ) | ||
| 98 | - logits = self._forward_logits(prompt + completion) # (T, V) | ||
| 99 | - # Position t predicts token t+1 — slice off the last row and the prompt span. | ||
| 100 | - shift = logits[len(input_ids) - 1 : -1, :] | ||
| 101 | - target_ids = np.asarray(full_ids[len(input_ids) :], dtype=np.int64) | ||
| 102 | - log_probs = _log_softmax(shift.astype(np.float64), axis=-1) | ||
| 103 | - gathered = log_probs[np.arange(len(target_ids)), target_ids] | ||
| 104 | - return float(gathered.sum()) | ||
| 105 | - | ||
| 106 | - def rolling_logprob(self, text: str) -> RollingLogprob: | ||
| 107 | - ids = self._tokenizer.encode(text) | ||
| 108 | - if len(ids) < 2: | ||
| 109 | - return RollingLogprob( | ||
| 110 | - token_ids=np.asarray(ids, dtype=np.int64), | ||
| 111 | - logprobs=np.array([], dtype=np.float32), | ||
| 112 | - num_tokens=len(ids), | ||
| 113 | - total_logprob=0.0, | ||
| 114 | - ) | ||
| 115 | - logits = self._forward_logits(text) | ||
| 116 | - log_probs = _log_softmax(logits[:-1].astype(np.float64), axis=-1) | ||
| 117 | - ids_arr = np.asarray(ids, dtype=np.int64) | ||
| 118 | - gathered = log_probs[np.arange(len(ids) - 1), ids_arr[1:]] | ||
| 119 | - return RollingLogprob( | ||
| 120 | - token_ids=ids_arr, | ||
| 121 | - logprobs=gathered.astype(np.float32), | ||
| 122 | - num_tokens=len(ids), | ||
| 123 | - total_logprob=float(gathered.sum()), | ||
| 124 | - ) | ||
| 125 | - | ||
| 126 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 127 | - logits = self._forward_logits(prompt) | ||
| 128 | - last_logits = logits[-1].astype(np.float64) | ||
| 129 | - log_probs = _log_softmax(last_logits, axis=-1) | ||
| 130 | - k = min(top_k, log_probs.shape[0]) | ||
| 131 | - # np.argpartition for top-k then sort the partition. | ||
| 132 | - part = np.argpartition(log_probs, -k)[-k:] | ||
| 133 | - top_ids = part[np.argsort(log_probs[part])[::-1]] | ||
| 134 | - top_lp = log_probs[top_ids] | ||
| 135 | - tail_mass = float(1.0 - np.exp(top_lp).sum()) | ||
| 136 | - tail_logprob = float(np.log(max(tail_mass, 1e-12))) if tail_mass > 1e-12 else 0.0 | ||
| 137 | - return TokenDist( | ||
| 138 | - token_ids=top_ids.astype(np.int64), | ||
| 139 | - logprobs=top_lp.astype(np.float32), | ||
| 140 | - vocab_size=int(log_probs.shape[0]), | ||
| 141 | - tail_logprob=tail_logprob, | ||
| 142 | - ) | ||
| 143 | - | ||
| 144 | - | ||
| 145 | -class MLXDifferentialBackend: | ||
| 146 | - """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for MLX models. | ||
| 147 | - | ||
| 148 | - Loads two copies of the same base model — one bare, one with the | ||
| 149 | - adapter fused — because MLX has no runtime toggle. Memory cost: 2× | ||
| 150 | - base weights. On typical Apple Silicon workloads with ≤3B models | ||
| 151 | - this is acceptable. | ||
| 152 | - """ | ||
| 153 | - | ||
| 154 | - def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None: | ||
| 155 | - mx, mlx_lm = _require_mlx() | ||
| 156 | - self._mx = mx | ||
| 157 | - self._spec = base_spec | ||
| 158 | - self._adapter_path = Path(adapter_path).expanduser().resolve() | ||
| 159 | - | ||
| 160 | - # Load bare base (no adapter). | ||
| 161 | - self._base_model, self._tokenizer = mlx_lm.load(base_spec.base) | ||
| 162 | - # Load ft with adapter attached. ``adapter_path`` is mlx_lm's kwarg. | ||
| 163 | - self._ft_model, _ = mlx_lm.load(base_spec.base, adapter_path=str(self._adapter_path)) | ||
| 164 | - self._active: str | None = None | ||
| 165 | - | ||
| 166 | - @contextmanager | ||
| 167 | - def as_base(self) -> Iterator[_MLXView]: | ||
| 168 | - self._enter("base") | ||
| 169 | - try: | ||
| 170 | - yield _MLXView(id="base", _model=self._base_model, _tokenizer=self._tokenizer) | ||
| 171 | - finally: | ||
| 172 | - self._exit() | ||
| 173 | - | ||
| 174 | - @contextmanager | ||
| 175 | - def as_finetuned(self) -> Iterator[_MLXView]: | ||
| 176 | - self._enter("ft") | ||
| 177 | - try: | ||
| 178 | - yield _MLXView(id="ft", _model=self._ft_model, _tokenizer=self._tokenizer) | ||
| 179 | - finally: | ||
| 180 | - self._exit() | ||
| 181 | - | ||
| 182 | - def close(self) -> None: | ||
| 183 | - """MLX reclaims memory when references drop; nothing to do here.""" | ||
| 184 | - return | ||
| 185 | - | ||
| 186 | - def _enter(self, mode: str) -> None: | ||
| 187 | - if self._active is not None: | ||
| 188 | - raise RuntimeError( | ||
| 189 | - f"MLXDifferentialBackend view {self._active!r} already active; " | ||
| 190 | - f"exit it before entering {mode!r}." | ||
| 191 | - ) | ||
| 192 | - self._active = mode | ||
| 193 | - | ||
| 194 | - def _exit(self) -> None: | ||
| 195 | - self._active = None | ||
| 196 | - | ||
| 197 | - | ||
| 198 | -def _log_softmax(x: np.ndarray, *, axis: int) -> np.ndarray: | ||
| 199 | - x_max = np.max(x, axis=axis, keepdims=True) | ||
| 200 | - y = x - x_max | ||
| 201 | - log_sum = np.log(np.sum(np.exp(y), axis=axis, keepdims=True)) | ||
| 202 | - return np.asarray(y - log_sum, dtype=np.float64) | ||
| 203 | - | ||
| 204 | - | ||
| 205 | -__all__ = ["MLXDifferentialBackend"] | ||
sway/src/dlm_sway/cli/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""Command-line interface (entry point: ``dlm-sway``).""" | ||
sway/src/dlm_sway/cli/app.pydeleted@@ -1,59 +0,0 @@ | |||
| 1 | -"""dlm-sway CLI entry point. | ||
| 2 | - | ||
| 3 | -``pip install dlm-sway`` installs this module's :func:`main` as the | ||
| 4 | -``dlm-sway`` console script. Every subcommand is a thin wrapper around a | ||
| 5 | -library-level function so the CLI surface mirrors what programmatic | ||
| 6 | -callers get. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import typer | ||
| 12 | - | ||
| 13 | -from dlm_sway import __version__ | ||
| 14 | -from dlm_sway.cli import commands | ||
| 15 | - | ||
| 16 | -app = typer.Typer( | ||
| 17 | - name="dlm-sway", | ||
| 18 | - no_args_is_help=True, | ||
| 19 | - add_completion=False, | ||
| 20 | - help="Differential testing for fine-tuned causal language models.", | ||
| 21 | -) | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def _version_callback(value: bool) -> None: | ||
| 25 | - if value: | ||
| 26 | - typer.echo(f"dlm-sway {__version__}") | ||
| 27 | - raise typer.Exit() | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -@app.callback() | ||
| 31 | -def _root( | ||
| 32 | - version: bool = typer.Option( # noqa: B008 — typer pattern | ||
| 33 | - False, | ||
| 34 | - "--version", | ||
| 35 | - callback=_version_callback, | ||
| 36 | - is_eager=True, | ||
| 37 | - help="Print version and exit.", | ||
| 38 | - ), | ||
| 39 | -) -> None: | ||
| 40 | - """Root callback; accepts ``--version``.""" | ||
| 41 | - del version | ||
| 42 | - | ||
| 43 | - | ||
| 44 | -app.command("run")(commands.run_cmd) | ||
| 45 | -app.command("gate")(commands.gate_cmd) | ||
| 46 | -app.command("check")(commands.check_cmd) | ||
| 47 | -app.command("diff")(commands.diff_cmd) | ||
| 48 | -app.command("autogen")(commands.autogen_cmd) | ||
| 49 | -app.command("doctor")(commands.doctor_cmd) | ||
| 50 | -app.command("report")(commands.report_cmd) | ||
| 51 | - | ||
| 52 | - | ||
| 53 | -def main() -> None: | ||
| 54 | - """Script entry point registered in :file:`pyproject.toml`.""" | ||
| 55 | - app() | ||
| 56 | - | ||
| 57 | - | ||
| 58 | -if __name__ == "__main__": | ||
| 59 | - main() | ||
sway/src/dlm_sway/cli/commands.pydeleted@@ -1,396 +0,0 @@ | |||
| 1 | -"""Command implementations for the ``dlm-sway`` CLI. | ||
| 2 | - | ||
| 3 | -Each function here is wired to a subcommand in :mod:`dlm_sway.cli.app`. | ||
| 4 | -Commands deliberately do as little as possible themselves — the real | ||
| 5 | -work lives in :mod:`dlm_sway.suite`, :mod:`dlm_sway.backends`, and the | ||
| 6 | -probes package. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import json | ||
| 12 | -import sys | ||
| 13 | -from pathlib import Path | ||
| 14 | -from typing import Annotated, Any | ||
| 15 | - | ||
| 16 | -import typer | ||
| 17 | -from rich.console import Console | ||
| 18 | - | ||
| 19 | -from dlm_sway import __version__ | ||
| 20 | -from dlm_sway.core.errors import SwayError | ||
| 21 | -from dlm_sway.core.result import SuiteResult, SwayScore, Verdict | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def run_cmd( | ||
| 25 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | ||
| 26 | - json_out: Annotated[ | ||
| 27 | - Path | None, | ||
| 28 | - typer.Option( | ||
| 29 | - "--json", | ||
| 30 | - "-j", | ||
| 31 | - help="Write the JSON report to this path in addition to the terminal render.", | ||
| 32 | - ), | ||
| 33 | - ] = None, | ||
| 34 | - markdown_out: Annotated[ | ||
| 35 | - Path | None, | ||
| 36 | - typer.Option("--markdown", "-m", help="Write a markdown report to this path."), | ||
| 37 | - ] = None, | ||
| 38 | -) -> None: | ||
| 39 | - """Execute a suite and render a terminal report.""" | ||
| 40 | - try: | ||
| 41 | - result, score_obj = _execute_spec(spec) | ||
| 42 | - except SwayError as exc: | ||
| 43 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 44 | - raise typer.Exit(code=2) from exc | ||
| 45 | - | ||
| 46 | - from dlm_sway.suite import report | ||
| 47 | - | ||
| 48 | - console = Console() | ||
| 49 | - report.to_terminal(result, score_obj, console=console) | ||
| 50 | - | ||
| 51 | - if json_out is not None: | ||
| 52 | - json_out.write_text(report.to_json(result, score_obj), encoding="utf-8") | ||
| 53 | - console.print(f"\n[dim]wrote JSON → {json_out}[/dim]") | ||
| 54 | - if markdown_out is not None: | ||
| 55 | - markdown_out.write_text(report.to_markdown(result, score_obj), encoding="utf-8") | ||
| 56 | - console.print(f"[dim]wrote markdown → {markdown_out}[/dim]") | ||
| 57 | - | ||
| 58 | - | ||
| 59 | -def gate_cmd( | ||
| 60 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | ||
| 61 | - junit_out: Annotated[ | ||
| 62 | - Path | None, typer.Option("--junit", help="Write JUnit XML for CI ingestion.") | ||
| 63 | - ] = None, | ||
| 64 | - coverage_threshold: Annotated[ | ||
| 65 | - float | None, | ||
| 66 | - typer.Option( | ||
| 67 | - "--threshold", | ||
| 68 | - help="Override the spec's coverage_threshold. Exit non-zero below it.", | ||
| 69 | - ), | ||
| 70 | - ] = None, | ||
| 71 | -) -> None: | ||
| 72 | - """Execute a suite and exit non-zero on failure (CI gate).""" | ||
| 73 | - try: | ||
| 74 | - result, score_obj = _execute_spec(spec) | ||
| 75 | - except SwayError as exc: | ||
| 76 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 77 | - raise typer.Exit(code=2) from exc | ||
| 78 | - | ||
| 79 | - from dlm_sway.suite import report | ||
| 80 | - from dlm_sway.suite.loader import load_spec as _load_spec | ||
| 81 | - | ||
| 82 | - console = Console() | ||
| 83 | - report.to_terminal(result, score_obj, console=console) | ||
| 84 | - | ||
| 85 | - if junit_out is not None: | ||
| 86 | - junit_out.write_text(report.to_junit(result, score_obj), encoding="utf-8") | ||
| 87 | - console.print(f"[dim]wrote JUnit → {junit_out}[/dim]") | ||
| 88 | - | ||
| 89 | - threshold = ( | ||
| 90 | - coverage_threshold | ||
| 91 | - if coverage_threshold is not None | ||
| 92 | - else _load_spec(spec).defaults.coverage_threshold | ||
| 93 | - ) | ||
| 94 | - has_failures = any(p.verdict == Verdict.FAIL for p in result.probes) | ||
| 95 | - below_threshold = score_obj.overall < threshold | ||
| 96 | - if has_failures or below_threshold: | ||
| 97 | - console.print( | ||
| 98 | - f"\n[red]gate FAILED[/red] — overall={score_obj.overall:.2f} < {threshold:.2f}" | ||
| 99 | - if below_threshold | ||
| 100 | - else "\n[red]gate FAILED[/red] — at least one probe reported FAIL" | ||
| 101 | - ) | ||
| 102 | - raise typer.Exit(code=1) | ||
| 103 | - console.print(f"\n[green]gate passed[/green] — overall={score_obj.overall:.2f}") | ||
| 104 | - | ||
| 105 | - | ||
| 106 | -def check_cmd( | ||
| 107 | - adapter: Annotated[Path, typer.Argument(help="Path to a PEFT adapter directory.")], | ||
| 108 | - base: Annotated[str, typer.Option("--base", help="HuggingFace base model id or local path.")], | ||
| 109 | - prompts: Annotated[ | ||
| 110 | - Path | None, | ||
| 111 | - typer.Option( | ||
| 112 | - "--prompts", | ||
| 113 | - help="File with one prompt per line. Defaults to sway's built-in quick set.", | ||
| 114 | - ), | ||
| 115 | - ] = None, | ||
| 116 | -) -> None: | ||
| 117 | - """<60s smoke test: "is this adapter doing anything at all?". | ||
| 118 | - | ||
| 119 | - Runs A1 DeltaKL + C2 CalibrationDrift on a small prompt set. No | ||
| 120 | - spec file required. | ||
| 121 | - """ | ||
| 122 | - from dlm_sway.backends import build as build_backend | ||
| 123 | - from dlm_sway.core.model import ModelSpec | ||
| 124 | - from dlm_sway.suite import report | ||
| 125 | - from dlm_sway.suite.runner import run as run_suite | ||
| 126 | - from dlm_sway.suite.score import compute as compute_score | ||
| 127 | - from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec | ||
| 128 | - | ||
| 129 | - quick_prompts = _load_prompts(prompts) if prompts else _BUILTIN_QUICK_PROMPTS | ||
| 130 | - | ||
| 131 | - base_spec = ModelSpec(base=base, kind="hf") | ||
| 132 | - ft_spec = ModelSpec(base=base, kind="hf", adapter=adapter) | ||
| 133 | - spec = SwaySpec( | ||
| 134 | - version=1, | ||
| 135 | - models=SuiteModels(base=base_spec, ft=ft_spec), | ||
| 136 | - defaults=SuiteDefaults(seed=0), | ||
| 137 | - suite=[ | ||
| 138 | - { | ||
| 139 | - "name": "quick_delta_kl", | ||
| 140 | - "kind": "delta_kl", | ||
| 141 | - "prompts": list(quick_prompts), | ||
| 142 | - "assert_mean_gte": 0.01, | ||
| 143 | - }, | ||
| 144 | - { | ||
| 145 | - "name": "quick_calibration", | ||
| 146 | - "kind": "calibration_drift", | ||
| 147 | - "items_limit": 10, | ||
| 148 | - }, | ||
| 149 | - ], | ||
| 150 | - ) | ||
| 151 | - try: | ||
| 152 | - backend = build_backend(ft_spec) | ||
| 153 | - except SwayError as exc: | ||
| 154 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 155 | - raise typer.Exit(code=2) from exc | ||
| 156 | - | ||
| 157 | - try: | ||
| 158 | - result = run_suite(spec, backend, spec_path="<check>") | ||
| 159 | - finally: | ||
| 160 | - _close_if_possible(backend) | ||
| 161 | - score_obj = compute_score(result) | ||
| 162 | - report.to_terminal(result, score_obj, console=Console()) | ||
| 163 | - | ||
| 164 | - | ||
| 165 | -def diff_cmd( | ||
| 166 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | ||
| 167 | - adapter_a: Annotated[Path, typer.Option("--a", help="First adapter path.")], | ||
| 168 | - adapter_b: Annotated[Path, typer.Option("--b", help="Second adapter path.")], | ||
| 169 | -) -> None: | ||
| 170 | - """Run the same suite against two adapters and show per-probe deltas.""" | ||
| 171 | - from dlm_sway.backends import build as build_backend | ||
| 172 | - from dlm_sway.suite.loader import load_spec | ||
| 173 | - from dlm_sway.suite.runner import run as run_suite | ||
| 174 | - from dlm_sway.suite.score import compute as compute_score | ||
| 175 | - | ||
| 176 | - sway_spec = load_spec(spec) | ||
| 177 | - console = Console() | ||
| 178 | - | ||
| 179 | - def _score_for(adapter_path: Path) -> tuple[float, dict[str, float]]: | ||
| 180 | - ft_spec = sway_spec.models.ft.model_copy(update={"adapter": adapter_path}) | ||
| 181 | - backend = build_backend(ft_spec) | ||
| 182 | - try: | ||
| 183 | - result = run_suite(sway_spec, backend, spec_path=str(spec)) | ||
| 184 | - finally: | ||
| 185 | - _close_if_possible(backend) | ||
| 186 | - scored = compute_score(result) | ||
| 187 | - per_probe = {p.name: (p.score or 0.0) for p in result.probes} | ||
| 188 | - return scored.overall, per_probe | ||
| 189 | - | ||
| 190 | - try: | ||
| 191 | - overall_a, per_a = _score_for(adapter_a) | ||
| 192 | - overall_b, per_b = _score_for(adapter_b) | ||
| 193 | - except SwayError as exc: | ||
| 194 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 195 | - raise typer.Exit(code=2) from exc | ||
| 196 | - | ||
| 197 | - console.print(f"[bold]overall[/bold] A: {overall_a:.2f} B: {overall_b:.2f}") | ||
| 198 | - console.print() | ||
| 199 | - console.print("[bold]per-probe[/bold] (A → B, Δ):") | ||
| 200 | - for name in sorted(per_a.keys() | per_b.keys()): | ||
| 201 | - a = per_a.get(name, 0.0) | ||
| 202 | - b = per_b.get(name, 0.0) | ||
| 203 | - delta = b - a | ||
| 204 | - sign = "+" if delta >= 0 else "" | ||
| 205 | - console.print(f" {name:<30} {a:.2f} → {b:.2f} ({sign}{delta:+.2f})") | ||
| 206 | - | ||
| 207 | - | ||
| 208 | -def autogen_cmd( | ||
| 209 | - dlm_path: Annotated[Path, typer.Argument(help="Path to a .dlm file.")], | ||
| 210 | - out: Annotated[ | ||
| 211 | - Path, | ||
| 212 | - typer.Option("--out", "-o", help="Where to write the generated sway.yaml."), | ||
| 213 | - ] = Path("sway.yaml"), | ||
| 214 | -) -> None: | ||
| 215 | - """Generate a sway.yaml from a .dlm file (requires dlm-sway[dlm]).""" | ||
| 216 | - import importlib | ||
| 217 | - | ||
| 218 | - try: | ||
| 219 | - autogen_mod = importlib.import_module("dlm_sway.integrations.dlm.autogen") | ||
| 220 | - except ImportError as exc: | ||
| 221 | - typer.secho( | ||
| 222 | - "dlm integration not installed — run: pip install 'dlm-sway[dlm]'", | ||
| 223 | - err=True, | ||
| 224 | - fg=typer.colors.RED, | ||
| 225 | - ) | ||
| 226 | - raise typer.Exit(code=2) from exc | ||
| 227 | - | ||
| 228 | - try: | ||
| 229 | - autogen_mod.write_sway_yaml(dlm_path, out) | ||
| 230 | - except SwayError as exc: | ||
| 231 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | ||
| 232 | - raise typer.Exit(code=2) from exc | ||
| 233 | - | ||
| 234 | - typer.echo(f"wrote {out}") | ||
| 235 | - | ||
| 236 | - | ||
| 237 | -def doctor_cmd() -> None: | ||
| 238 | - """Print backend availability and version info.""" | ||
| 239 | - console = Console() | ||
| 240 | - console.print(f"[bold]dlm-sway[/bold] {__version__}") | ||
| 241 | - console.print(f" python: {sys.version.split()[0]}") | ||
| 242 | - console.print(f" platform: {sys.platform}") | ||
| 243 | - console.print() | ||
| 244 | - | ||
| 245 | - console.print("[bold]backends[/bold]") | ||
| 246 | - console.print( | ||
| 247 | - f" hf: {_probe_import('torch')} {_probe_import('transformers')} {_probe_import('peft')}" | ||
| 248 | - ) | ||
| 249 | - console.print(f" mlx: {_probe_import('mlx')} {_probe_import('mlx_lm')}") | ||
| 250 | - console.print(f" semsim: {_probe_import('sentence_transformers')}") | ||
| 251 | - console.print( | ||
| 252 | - f" style+: {_probe_import('spacy')} {_probe_import('textstat')} {_probe_import('nlpaug')}" | ||
| 253 | - ) | ||
| 254 | - console.print(f" dlm: {_probe_import('dlm')}") | ||
| 255 | - console.print(f" viz: {_probe_import('matplotlib')}") | ||
| 256 | - | ||
| 257 | - | ||
| 258 | -def report_cmd( | ||
| 259 | - result_json: Annotated[Path, typer.Argument(help="Path to a saved result JSON.")], | ||
| 260 | - format: Annotated[ | ||
| 261 | - str, typer.Option("--format", help="Output format: terminal, md, junit, json.") | ||
| 262 | - ] = "terminal", | ||
| 263 | -) -> None: | ||
| 264 | - """Re-render a previously saved run (for history tracking / dashboards).""" | ||
| 265 | - raw: dict[str, Any] = json.loads(result_json.read_text(encoding="utf-8")) | ||
| 266 | - fmt = format.lower() | ||
| 267 | - if fmt == "json": | ||
| 268 | - typer.echo(json.dumps(raw, indent=2, sort_keys=True)) | ||
| 269 | - return | ||
| 270 | - if fmt in {"md", "markdown"}: | ||
| 271 | - # A file-level re-render needs the dataclasses back; simplest is | ||
| 272 | - # to synthesize a minimal markdown from the JSON directly. | ||
| 273 | - typer.echo(_render_markdown_from_json(raw)) | ||
| 274 | - return | ||
| 275 | - if fmt == "junit": | ||
| 276 | - typer.echo(_render_junit_from_json(raw)) | ||
| 277 | - return | ||
| 278 | - # Default: terminal-ish one-liner summary. | ||
| 279 | - score: dict[str, Any] = raw.get("score", {}) | ||
| 280 | - typer.echo(f"overall: {score.get('overall', 0.0):.2f} [{score.get('band', '?')}]") | ||
| 281 | - probes: list[dict[str, Any]] = raw.get("probes", []) | ||
| 282 | - for p in probes: | ||
| 283 | - typer.echo( | ||
| 284 | - f" {p['name']:<30} {p['verdict']:<6} " | ||
| 285 | - f"{(p.get('score') or 0.0):.2f} {p.get('message', '')[:60]}" | ||
| 286 | - ) | ||
| 287 | - | ||
| 288 | - | ||
| 289 | -# -- helpers ----------------------------------------------------------- | ||
| 290 | - | ||
| 291 | - | ||
| 292 | -_BUILTIN_QUICK_PROMPTS: tuple[str, ...] = ( | ||
| 293 | - "The quick brown fox", | ||
| 294 | - "Once upon a time", | ||
| 295 | - "The answer to the question is", | ||
| 296 | - "One important lesson is", | ||
| 297 | - "In my opinion,", | ||
| 298 | - "The first step is to", | ||
| 299 | - "Remember that", | ||
| 300 | - "A common mistake is", | ||
| 301 | -) | ||
| 302 | - | ||
| 303 | - | ||
| 304 | -def _load_prompts(path: Path) -> tuple[str, ...]: | ||
| 305 | - return tuple( | ||
| 306 | - line.strip() for line in path.read_text(encoding="utf-8").splitlines() if line.strip() | ||
| 307 | - ) | ||
| 308 | - | ||
| 309 | - | ||
| 310 | -def _execute_spec(path: Path) -> tuple[SuiteResult, SwayScore]: | ||
| 311 | - """Load a spec, build a backend, run the suite, fold scores. Shared | ||
| 312 | - by ``run`` and ``gate``. Picks up .dlm-derived sections when the | ||
| 313 | - spec's ``dlm_source`` is set.""" | ||
| 314 | - from dlm_sway.backends import build as build_backend | ||
| 315 | - from dlm_sway.suite.loader import load_spec | ||
| 316 | - from dlm_sway.suite.runner import run as run_suite | ||
| 317 | - from dlm_sway.suite.score import compute as compute_score | ||
| 318 | - | ||
| 319 | - spec = load_spec(path) | ||
| 320 | - sections = None | ||
| 321 | - doc_text = None | ||
| 322 | - if spec.dlm_source is not None: | ||
| 323 | - import importlib | ||
| 324 | - | ||
| 325 | - try: | ||
| 326 | - resolver = importlib.import_module("dlm_sway.integrations.dlm.resolver") | ||
| 327 | - handle = resolver.resolve_dlm(Path(spec.dlm_source)) | ||
| 328 | - sections = handle.sections | ||
| 329 | - doc_text = handle.doc_text | ||
| 330 | - except ImportError: | ||
| 331 | - # Honoring dlm_source is best-effort — probes that need | ||
| 332 | - # sections will SKIP with a pointer at the extra. | ||
| 333 | - sections = None | ||
| 334 | - backend = build_backend(spec.models.ft) | ||
| 335 | - try: | ||
| 336 | - result = run_suite(spec, backend, spec_path=str(path), sections=sections, doc_text=doc_text) | ||
| 337 | - finally: | ||
| 338 | - _close_if_possible(backend) | ||
| 339 | - score_obj = compute_score(result) | ||
| 340 | - return result, score_obj | ||
| 341 | - | ||
| 342 | - | ||
| 343 | -def _close_if_possible(backend: object) -> None: | ||
| 344 | - close = getattr(backend, "close", None) | ||
| 345 | - if callable(close): | ||
| 346 | - close() | ||
| 347 | - | ||
| 348 | - | ||
| 349 | -def _probe_import(name: str) -> str: | ||
| 350 | - import importlib | ||
| 351 | - | ||
| 352 | - try: | ||
| 353 | - mod = importlib.import_module(name) | ||
| 354 | - except ImportError: | ||
| 355 | - return f"[red]{name}: missing[/red]" | ||
| 356 | - ver = getattr(mod, "__version__", "installed") | ||
| 357 | - return f"[green]{name}: {ver}[/green]" | ||
| 358 | - | ||
| 359 | - | ||
| 360 | -def _render_markdown_from_json(raw: dict[str, Any]) -> str: | ||
| 361 | - score: dict[str, Any] = raw.get("score", {}) | ||
| 362 | - lines: list[str] = [ | ||
| 363 | - "# dlm-sway report", | ||
| 364 | - "", | ||
| 365 | - f"**Overall:** {score.get('overall', 0.0):.2f} (`{score.get('band', '?')}`) ", | ||
| 366 | - f"**Base:** `{raw.get('base_model_id', '?')}` ", | ||
| 367 | - f"**Adapter:** `{raw.get('adapter_id', '?')}` ", | ||
| 368 | - "", | ||
| 369 | - "## Probes", | ||
| 370 | - "", | ||
| 371 | - "| name | kind | verdict | score |", | ||
| 372 | - "|---|---|---|---:|", | ||
| 373 | - ] | ||
| 374 | - probes: list[dict[str, Any]] = raw.get("probes", []) | ||
| 375 | - for p in probes: | ||
| 376 | - lines.append( | ||
| 377 | - f"| {p['name']} | `{p['kind']}` | {p['verdict']} | {(p.get('score') or 0.0):.2f} |" | ||
| 378 | - ) | ||
| 379 | - return "\n".join(lines) | ||
| 380 | - | ||
| 381 | - | ||
| 382 | -def _render_junit_from_json(raw: dict[str, Any]) -> str: | ||
| 383 | - """Minimal JUnit renderer from a saved JSON (useful for report --format junit).""" | ||
| 384 | - import xml.etree.ElementTree as ET | ||
| 385 | - | ||
| 386 | - probes: list[dict[str, Any]] = raw.get("probes", []) | ||
| 387 | - testsuite = ET.Element("testsuite", {"name": "dlm-sway", "tests": str(len(probes))}) | ||
| 388 | - for p in probes: | ||
| 389 | - tc = ET.SubElement(testsuite, "testcase", {"classname": p["kind"], "name": p["name"]}) | ||
| 390 | - if p["verdict"] == "fail": | ||
| 391 | - ET.SubElement(tc, "failure", {"message": p.get("message", "")}) | ||
| 392 | - elif p["verdict"] == "error": | ||
| 393 | - ET.SubElement(tc, "error", {"message": p.get("message", "")}) | ||
| 394 | - elif p["verdict"] == "skip": | ||
| 395 | - ET.SubElement(tc, "skipped", {"message": p.get("message", "")}) | ||
| 396 | - return ET.tostring(testsuite, encoding="unicode") | ||
sway/src/dlm_sway/core/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""Core abstractions: protocols, results, errors, determinism.""" | ||
sway/src/dlm_sway/core/determinism.pydeleted@@ -1,97 +0,0 @@ | |||
| 1 | -"""Deterministic-execution helper. | ||
| 2 | - | ||
| 3 | -Mirrors ``dlm.train.determinism.seed_everything`` so running the same | ||
| 4 | -suite twice on the same host produces the same :class:`ProbeResult` | ||
| 5 | -payloads. The dlm project treats determinism as a contract; sway takes | ||
| 6 | -the same posture for scoring operations. | ||
| 7 | - | ||
| 8 | -Generation is allowed to use non-deterministic attention kernels when | ||
| 9 | -``temperature > 0``, because a deterministic sampled generation is a | ||
| 10 | -contradiction. Scoring (logprobs, rolling logprobs, next-token dists) | ||
| 11 | -always runs under :func:`torch.use_deterministic_algorithms(True)`. | ||
| 12 | -""" | ||
| 13 | - | ||
| 14 | -from __future__ import annotations | ||
| 15 | - | ||
| 16 | -import os | ||
| 17 | -import random | ||
| 18 | -from dataclasses import dataclass | ||
| 19 | -from typing import Literal | ||
| 20 | - | ||
| 21 | -DeterminismClass = Literal["strict", "best_effort", "loose"] | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -@dataclass(frozen=True, slots=True) | ||
| 25 | -class DeterminismSummary: | ||
| 26 | - """What seeding actually accomplished, for logging in the report.""" | ||
| 27 | - | ||
| 28 | - class_: DeterminismClass | ||
| 29 | - seed: int | ||
| 30 | - notes: tuple[str, ...] = () | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -def seed_everything(seed: int, *, strict: bool = True) -> DeterminismSummary: | ||
| 34 | - """Seed every RNG sway's probes touch and flip backend flags. | ||
| 35 | - | ||
| 36 | - Idempotent — safe to call repeatedly with the same seed. | ||
| 37 | - | ||
| 38 | - Parameters | ||
| 39 | - ---------- | ||
| 40 | - seed: | ||
| 41 | - The seed. Callers typically use the value from ``sway.yaml``'s | ||
| 42 | - ``defaults.seed`` (default 0). | ||
| 43 | - strict: | ||
| 44 | - If ``True`` (the default), request deterministic CUDA algorithms | ||
| 45 | - and set ``CUBLAS_WORKSPACE_CONFIG``. Scoring probes need this; | ||
| 46 | - generation-only runs can set it ``False``. | ||
| 47 | - | ||
| 48 | - Returns | ||
| 49 | - ------- | ||
| 50 | - :class:`DeterminismSummary` with a classification: | ||
| 51 | - | ||
| 52 | - - ``"strict"`` — deterministic algorithms active, no warnings. | ||
| 53 | - - ``"best_effort"`` — platform doesn't support full determinism | ||
| 54 | - (MPS, some CPU kernels). | ||
| 55 | - - ``"loose"`` — seeded but deterministic algorithms refused. | ||
| 56 | - """ | ||
| 57 | - | ||
| 58 | - notes: list[str] = [] | ||
| 59 | - clazz: DeterminismClass = "best_effort" | ||
| 60 | - | ||
| 61 | - # Env vars must come first — torch reads them at cuBLAS init. | ||
| 62 | - if strict: | ||
| 63 | - os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8") | ||
| 64 | - | ||
| 65 | - random.seed(seed) | ||
| 66 | - | ||
| 67 | - # numpy is a hard dep; safe to seed unconditionally. | ||
| 68 | - import numpy as np | ||
| 69 | - | ||
| 70 | - np.random.seed(seed) | ||
| 71 | - | ||
| 72 | - try: | ||
| 73 | - import torch # noqa: PLC0415 — lazy: torch is an optional extra. | ||
| 74 | - except ModuleNotFoundError: | ||
| 75 | - notes.append("torch not installed; seeded python + numpy only") | ||
| 76 | - return DeterminismSummary(class_="best_effort", seed=seed, notes=tuple(notes)) | ||
| 77 | - | ||
| 78 | - torch.manual_seed(seed) | ||
| 79 | - if torch.cuda.is_available(): | ||
| 80 | - torch.cuda.manual_seed_all(seed) | ||
| 81 | - clazz = "strict" | ||
| 82 | - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): | ||
| 83 | - clazz = "best_effort" | ||
| 84 | - notes.append("MPS: bit-identical across runs is best-effort") | ||
| 85 | - else: | ||
| 86 | - clazz = "best_effort" | ||
| 87 | - notes.append("CPU-only backend: strict determinism depends on BLAS impl") | ||
| 88 | - | ||
| 89 | - if strict: | ||
| 90 | - try: | ||
| 91 | - torch.use_deterministic_algorithms(True, warn_only=True) | ||
| 92 | - torch.backends.cudnn.benchmark = False | ||
| 93 | - except Exception as exc: # noqa: BLE001 — torch raises a naked Exception | ||
| 94 | - clazz = "loose" | ||
| 95 | - notes.append(f"deterministic algorithms refused: {exc}") | ||
| 96 | - | ||
| 97 | - return DeterminismSummary(class_=clazz, seed=seed, notes=tuple(notes)) | ||
sway/src/dlm_sway/core/errors.pydeleted@@ -1,65 +0,0 @@ | |||
| 1 | -"""Exception hierarchy for dlm-sway. | ||
| 2 | - | ||
| 3 | -Every error sway raises inherits from :class:`SwayError` so callers can | ||
| 4 | -catch the whole family with a single ``except``. Subclasses carry enough | ||
| 5 | -context (spec paths, probe names, missing extras) for the CLI to render | ||
| 6 | -actionable messages without the caller having to introspect an exception | ||
| 7 | -chain. | ||
| 8 | -""" | ||
| 9 | - | ||
| 10 | -from __future__ import annotations | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -class SwayError(Exception): | ||
| 14 | - """Root of the dlm-sway exception hierarchy.""" | ||
| 15 | - | ||
| 16 | - | ||
| 17 | -class SpecValidationError(SwayError): | ||
| 18 | - """A ``sway.yaml`` (or equivalent) failed pydantic validation. | ||
| 19 | - | ||
| 20 | - Parameters | ||
| 21 | - ---------- | ||
| 22 | - message: | ||
| 23 | - Human-readable summary of what went wrong. | ||
| 24 | - source: | ||
| 25 | - Path or identifier of the spec being validated, if known. | ||
| 26 | - """ | ||
| 27 | - | ||
| 28 | - def __init__(self, message: str, *, source: str | None = None) -> None: | ||
| 29 | - super().__init__(message) | ||
| 30 | - self.source = source | ||
| 31 | - | ||
| 32 | - def __str__(self) -> str: | ||
| 33 | - base = super().__str__() | ||
| 34 | - return f"{self.source}: {base}" if self.source else base | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class BackendNotAvailableError(SwayError): | ||
| 38 | - """A requested backend's optional dependencies aren't installed. | ||
| 39 | - | ||
| 40 | - The CLI turns this into a pointed ``pip install dlm-sway[<extra>]`` | ||
| 41 | - hint; programmatic callers can read :attr:`extra` directly. | ||
| 42 | - """ | ||
| 43 | - | ||
| 44 | - def __init__(self, backend: str, *, extra: str, hint: str | None = None) -> None: | ||
| 45 | - message = ( | ||
| 46 | - f"backend {backend!r} unavailable — install the extra: pip install 'dlm-sway[{extra}]'" | ||
| 47 | - ) | ||
| 48 | - if hint: | ||
| 49 | - message = f"{message}\n{hint}" | ||
| 50 | - super().__init__(message) | ||
| 51 | - self.backend = backend | ||
| 52 | - self.extra = extra | ||
| 53 | - | ||
| 54 | - | ||
| 55 | -class ProbeError(SwayError): | ||
| 56 | - """A probe failed to *execute* (as opposed to failing its assertion). | ||
| 57 | - | ||
| 58 | - Distinct from a ``verdict=FAIL`` result — assertion failures are | ||
| 59 | - normal and reported via :class:`ProbeResult`. This is for genuine | ||
| 60 | - bugs: missing sections, mismatched tokenizers, NaN logits. | ||
| 61 | - """ | ||
| 62 | - | ||
| 63 | - def __init__(self, probe: str, message: str) -> None: | ||
| 64 | - super().__init__(f"probe {probe!r}: {message}") | ||
| 65 | - self.probe = probe | ||
sway/src/dlm_sway/core/model.pydeleted@@ -1,112 +0,0 @@ | |||
| 1 | -"""The :class:`Model` abstraction and :class:`ModelSpec` user-facing config. | ||
| 2 | - | ||
| 3 | -Probes operate on objects that satisfy :class:`Model` (for generation) | ||
| 4 | -and :class:`~dlm_sway.core.scoring.ScoringBackend` (for logit-level | ||
| 5 | -access). Backends return concrete instances of both — they are | ||
| 6 | -deliberately separate Protocols because not every backend exposes logits | ||
| 7 | -(e.g. an Ollama HTTP backend would implement ``Model`` but not | ||
| 8 | -``ScoringBackend``). | ||
| 9 | - | ||
| 10 | -The user-facing surface is :class:`ModelSpec`, a pydantic model that | ||
| 11 | -describes how to materialize a base + adapter pair. No ``.dlm`` | ||
| 12 | -concepts live at this layer — those belong in | ||
| 13 | -:mod:`dlm_sway.integrations.dlm`. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -from dataclasses import dataclass | ||
| 19 | -from pathlib import Path | ||
| 20 | -from typing import Any, Literal, Protocol, runtime_checkable | ||
| 21 | - | ||
| 22 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 23 | - | ||
| 24 | -BackendKind = Literal["hf", "mlx", "dummy", "custom"] | ||
| 25 | -"""Registered scoring-backend kinds. | ||
| 26 | - | ||
| 27 | -``custom`` is an escape hatch — the runner looks up an entry point when | ||
| 28 | -it sees ``custom`` in a spec. | ||
| 29 | -""" | ||
| 30 | - | ||
| 31 | - | ||
| 32 | -class ModelSpec(BaseModel): | ||
| 33 | - """How to materialize one model (base or fine-tuned).""" | ||
| 34 | - | ||
| 35 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 36 | - | ||
| 37 | - kind: BackendKind = "hf" | ||
| 38 | - base: str | ||
| 39 | - """HuggingFace repo id (``HuggingFaceTB/SmolLM2-135M-Instruct``) or | ||
| 40 | - a local path to a model directory.""" | ||
| 41 | - | ||
| 42 | - adapter: Path | None = None | ||
| 43 | - """Path to a PEFT adapter directory (containing ``adapter_config.json`` | ||
| 44 | - and ``adapter_model.safetensors``). ``None`` → base-only model.""" | ||
| 45 | - | ||
| 46 | - dtype: Literal["auto", "fp16", "bf16", "fp32"] = "auto" | ||
| 47 | - device: str = "auto" | ||
| 48 | - """``"auto"`` chooses CUDA → MPS → CPU in that order.""" | ||
| 49 | - | ||
| 50 | - trust_remote_code: bool = False | ||
| 51 | - """HuggingFace ``trust_remote_code`` passthrough. Off by default — | ||
| 52 | - the user must opt in explicitly, matching sway's no-surprises | ||
| 53 | - posture.""" | ||
| 54 | - | ||
| 55 | - entry_point: str | None = Field(default=None) | ||
| 56 | - """Required when ``kind='custom'``. Import path like | ||
| 57 | - ``mypkg.mybackend:MyBackend``.""" | ||
| 58 | - | ||
| 59 | - | ||
| 60 | -@dataclass(frozen=True, slots=True) | ||
| 61 | -class LoadedModel: | ||
| 62 | - """A materialized model plus the tokenizer that produced it. | ||
| 63 | - | ||
| 64 | - Returned by backend ``load()`` methods. Probes usually don't touch | ||
| 65 | - this directly — they go through the :class:`Model` / | ||
| 66 | - :class:`~dlm_sway.core.scoring.ScoringBackend` Protocols. | ||
| 67 | - """ | ||
| 68 | - | ||
| 69 | - id: str | ||
| 70 | - """Stable handle: ``"base"`` or ``"ft"`` typically.""" | ||
| 71 | - spec: ModelSpec | ||
| 72 | - model: Any | ||
| 73 | - """Framework-native handle (torch ``nn.Module``, MLX array module …). | ||
| 74 | - | ||
| 75 | - Typed as ``Any`` because the frameworks themselves ship unstubbed. | ||
| 76 | - Backend implementations narrow this at their boundary.""" | ||
| 77 | - tokenizer: Any | ||
| 78 | - meta: dict[str, Any] | ||
| 79 | - """Backend-captured metadata: device, dtype, adapter version, bytes | ||
| 80 | - on disk, num trainable params. Surfaced in the suite report.""" | ||
| 81 | - | ||
| 82 | - | ||
| 83 | -@runtime_checkable | ||
| 84 | -class Model(Protocol): | ||
| 85 | - """Minimum interface for text generation. | ||
| 86 | - | ||
| 87 | - Implemented by backend-wrapped model objects. Probes that need logits | ||
| 88 | - also require :class:`~dlm_sway.core.scoring.ScoringBackend`. | ||
| 89 | - """ | ||
| 90 | - | ||
| 91 | - id: str | ||
| 92 | - | ||
| 93 | - def generate( | ||
| 94 | - self, | ||
| 95 | - prompt: str, | ||
| 96 | - *, | ||
| 97 | - max_new_tokens: int, | ||
| 98 | - temperature: float = 0.0, | ||
| 99 | - top_p: float = 1.0, | ||
| 100 | - seed: int = 0, | ||
| 101 | - ) -> str: | ||
| 102 | - """Generate a completion. | ||
| 103 | - | ||
| 104 | - Defaults (``temperature=0``, ``top_p=1``) are greedy-decode for | ||
| 105 | - reproducibility. Callers wanting sampled output must pass | ||
| 106 | - non-defaults *and* a seed. | ||
| 107 | - """ | ||
| 108 | - ... | ||
| 109 | - | ||
| 110 | - def close(self) -> None: | ||
| 111 | - """Release any resources held by this model.""" | ||
| 112 | - ... | ||
sway/src/dlm_sway/core/result.pydeleted@@ -1,139 +0,0 @@ | |||
| 1 | -"""Probe and suite result types. | ||
| 2 | - | ||
| 3 | -Every numeric probe ultimately returns a :class:`ProbeResult`. The suite | ||
| 4 | -runner collects them into a :class:`SuiteResult` and the scorer folds | ||
| 5 | -that into a single :class:`SwayScore` with transparent per-component | ||
| 6 | -weights. | ||
| 7 | - | ||
| 8 | -These dataclasses are deliberately plain — no pydantic — because they | ||
| 9 | -cross probe/backend boundaries hundreds of times per run and a free | ||
| 10 | -``model_validate`` on every construction would dominate the runtime of | ||
| 11 | -cheap probes. | ||
| 12 | -""" | ||
| 13 | - | ||
| 14 | -from __future__ import annotations | ||
| 15 | - | ||
| 16 | -from dataclasses import dataclass, field | ||
| 17 | -from datetime import UTC, datetime | ||
| 18 | -from enum import StrEnum | ||
| 19 | -from typing import Any | ||
| 20 | - | ||
| 21 | - | ||
| 22 | -class Verdict(StrEnum): | ||
| 23 | - """Outcome of a single probe against its assertion.""" | ||
| 24 | - | ||
| 25 | - PASS = "pass" | ||
| 26 | - FAIL = "fail" | ||
| 27 | - WARN = "warn" | ||
| 28 | - SKIP = "skip" | ||
| 29 | - ERROR = "error" | ||
| 30 | - | ||
| 31 | - | ||
| 32 | -@dataclass(frozen=True, slots=True) | ||
| 33 | -class ProbeResult: | ||
| 34 | - """The result of running one probe. | ||
| 35 | - | ||
| 36 | - Attributes | ||
| 37 | - ---------- | ||
| 38 | - name: | ||
| 39 | - User-facing name from the spec (unique within a suite). | ||
| 40 | - kind: | ||
| 41 | - Probe discriminator (``delta_kl``, ``section_internalization`` …). | ||
| 42 | - verdict: | ||
| 43 | - Pass / fail / warn / skip / error. | ||
| 44 | - score: | ||
| 45 | - Normalized [0, 1] score. ``sigmoid(z_vs_null / 3)`` for numeric | ||
| 46 | - probes; 1.0 / 0.0 for binary ones. ``None`` for :attr:`Verdict.SKIP`. | ||
| 47 | - raw: | ||
| 48 | - The raw metric value (e.g. KL=0.083). Probe-specific units. | ||
| 49 | - z_score: | ||
| 50 | - Standard deviations above the null-adapter baseline. ``None`` | ||
| 51 | - when no null calibration was run. | ||
| 52 | - base_value: | ||
| 53 | - The metric evaluated on the base model, when meaningful. | ||
| 54 | - ft_value: | ||
| 55 | - The metric evaluated on the fine-tuned model, when meaningful. | ||
| 56 | - evidence: | ||
| 57 | - Small structured payload for the report — prompts, example | ||
| 58 | - completions, per-section breakdowns. Kept bounded (<10 KB) so | ||
| 59 | - suite JSON stays under a megabyte. | ||
| 60 | - message: | ||
| 61 | - One-line diagnostic. Surfaces in the terminal report. | ||
| 62 | - duration_s: | ||
| 63 | - Wall time to execute. | ||
| 64 | - """ | ||
| 65 | - | ||
| 66 | - name: str | ||
| 67 | - kind: str | ||
| 68 | - verdict: Verdict | ||
| 69 | - score: float | None | ||
| 70 | - raw: float | None = None | ||
| 71 | - z_score: float | None = None | ||
| 72 | - base_value: float | None = None | ||
| 73 | - ft_value: float | None = None | ||
| 74 | - evidence: dict[str, Any] = field(default_factory=dict) | ||
| 75 | - message: str = "" | ||
| 76 | - duration_s: float = 0.0 | ||
| 77 | - | ||
| 78 | - | ||
| 79 | -@dataclass(frozen=True, slots=True) | ||
| 80 | -class SuiteResult: | ||
| 81 | - """A full run of a sway.yaml suite.""" | ||
| 82 | - | ||
| 83 | - spec_path: str | ||
| 84 | - started_at: datetime | ||
| 85 | - finished_at: datetime | ||
| 86 | - base_model_id: str | ||
| 87 | - adapter_id: str | ||
| 88 | - sway_version: str | ||
| 89 | - probes: tuple[ProbeResult, ...] = () | ||
| 90 | - null_stats: dict[str, dict[str, float]] = field(default_factory=dict) | ||
| 91 | - """Per-primitive null-adapter baseline stats (mean, std, runs). Used | ||
| 92 | - to turn raw metrics into z-scores when rendering the report.""" | ||
| 93 | - | ||
| 94 | - @property | ||
| 95 | - def wall_seconds(self) -> float: | ||
| 96 | - return (self.finished_at - self.started_at).total_seconds() | ||
| 97 | - | ||
| 98 | - | ||
# Weights used to fold per-component scores into the composite score.
# Any of these may be overridden in sway.yaml.
DEFAULT_COMPONENT_WEIGHTS: dict[str, float] = {
    "adherence": 0.30,
    "attribution": 0.35,
    "calibration": 0.20,
    "ablation": 0.15,
}
| 106 | - | ||
| 107 | - | ||
| 108 | -@dataclass(frozen=True, slots=True) | ||
| 109 | -class SwayScore: | ||
| 110 | - """Composite score with a transparent per-component breakdown.""" | ||
| 111 | - | ||
| 112 | - overall: float | ||
| 113 | - components: dict[str, float] | ||
| 114 | - weights: dict[str, float] = field(default_factory=lambda: dict(DEFAULT_COMPONENT_WEIGHTS)) | ||
| 115 | - band: str = "" | ||
| 116 | - findings: tuple[str, ...] = () | ||
| 117 | - | ||
| 118 | - @staticmethod | ||
| 119 | - def band_for(overall: float) -> str: | ||
| 120 | - """Map a score to a human-readable band. | ||
| 121 | - | ||
| 122 | - Bands (from the plan): | ||
| 123 | - - <0.3 : indistinguishable from noise | ||
| 124 | - - 0.3–0.6 : partial fit | ||
| 125 | - - 0.6–0.85: healthy | ||
| 126 | - - >0.85 : suspiciously good (possible overfit / memorization) | ||
| 127 | - """ | ||
| 128 | - if overall < 0.3: | ||
| 129 | - return "noise" | ||
| 130 | - if overall < 0.6: | ||
| 131 | - return "partial" | ||
| 132 | - if overall <= 0.85: | ||
| 133 | - return "healthy" | ||
| 134 | - return "suspicious" | ||
| 135 | - | ||
| 136 | - | ||
| 137 | -def utcnow() -> datetime: | ||
| 138 | - """Timezone-aware UTC timestamp (used by the runner).""" | ||
| 139 | - return datetime.now(UTC) | ||
sway/src/dlm_sway/core/scoring.pydeleted@@ -1,203 +0,0 @@ | |||
| 1 | -"""Scoring protocols: logprobs, next-token distributions, differential toggling. | ||
| 2 | - | ||
| 3 | -Scoring is **separate** from generation because not every backend can | ||
| 4 | -provide logits. Every numeric sway probe depends on at least one of | ||
| 5 | -three operations: | ||
| 6 | - | ||
| 7 | -1. ``logprob_of(prompt, completion)`` — score a completion against a | ||
| 8 | - prompt (A1, B2, B3, C2, …). | ||
| 9 | -2. ``rolling_logprob(text)`` — perplexity over a piece of text (B1, | ||
| 10 | - C2). | ||
| 11 | -3. ``next_token_dist(prompt, top_k)`` — the raw next-token distribution | ||
| 12 | - at a single position (A1, N2). | ||
| 13 | - | ||
| 14 | -The :class:`DifferentialBackend` is the key performance primitive: | ||
| 15 | -both base and fine-tuned views share the same loaded weights and KV | ||
| 16 | -cache layout, toggled via PEFT's :meth:`set_adapter` / | ||
| 17 | -:meth:`disable_adapter`. A naive "load twice" implementation would | ||
| 18 | -double memory and halve throughput. | ||
| 19 | -""" | ||
| 20 | - | ||
| 21 | -from __future__ import annotations | ||
| 22 | - | ||
| 23 | -from contextlib import AbstractContextManager | ||
| 24 | -from dataclasses import dataclass, field | ||
| 25 | -from typing import Protocol, runtime_checkable | ||
| 26 | - | ||
| 27 | -import numpy as np | ||
| 28 | -from numpy.typing import NDArray | ||
| 29 | - | ||
| 30 | -from dlm_sway.core.model import Model | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -@dataclass(frozen=True, slots=True) | ||
| 34 | -class RollingLogprob: | ||
| 35 | - """Per-token logprobs over a piece of text, plus summary stats. | ||
| 36 | - | ||
| 37 | - Attributes | ||
| 38 | - ---------- | ||
| 39 | - token_ids: | ||
| 40 | - The tokenizer output for ``text``. Length ``N``. | ||
| 41 | - logprobs: | ||
| 42 | - ``log p(token_i | token_<i)`` for each position i ≥ 1. Length | ||
| 43 | - ``N-1``. | ||
| 44 | - num_tokens: | ||
| 45 | - ``N`` — included for convenience; ``len(token_ids)``. | ||
| 46 | - total_logprob: | ||
| 47 | - Sum of :attr:`logprobs`. | ||
| 48 | - """ | ||
| 49 | - | ||
| 50 | - token_ids: NDArray[np.int64] | ||
| 51 | - logprobs: NDArray[np.float32] | ||
| 52 | - num_tokens: int | ||
| 53 | - total_logprob: float | ||
| 54 | - | ||
| 55 | - @property | ||
| 56 | - def mean_logprob(self) -> float: | ||
| 57 | - n = self.logprobs.size | ||
| 58 | - return float(self.total_logprob / n) if n else 0.0 | ||
| 59 | - | ||
| 60 | - @property | ||
| 61 | - def perplexity(self) -> float: | ||
| 62 | - """``exp(-mean_logprob)``. Base-e, natural perplexity.""" | ||
| 63 | - return float(np.exp(-self.mean_logprob)) | ||
| 64 | - | ||
| 65 | - | ||
| 66 | -@dataclass(frozen=True, slots=True) | ||
| 67 | -class TokenDist: | ||
| 68 | - """A (possibly top-k truncated) next-token probability distribution. | ||
| 69 | - | ||
| 70 | - For KL / JS divergence probes sway needs matched distributions | ||
| 71 | - across base and fine-tuned views. The runner is responsible for | ||
| 72 | - aligning ``top_k`` token slices between two ``TokenDist`` objects | ||
| 73 | - before handing them to divergence math. | ||
| 74 | - """ | ||
| 75 | - | ||
| 76 | - token_ids: NDArray[np.int64] | ||
| 77 | - """Token ids, descending by probability. Length ``k``.""" | ||
| 78 | - logprobs: NDArray[np.float32] | ||
| 79 | - """Log-probabilities for :attr:`token_ids`. Length ``k``.""" | ||
| 80 | - vocab_size: int | ||
| 81 | - """Full vocab size — needed to renormalize top-k truncated slices.""" | ||
| 82 | - tail_logprob: float = field(default=0.0) | ||
| 83 | - """log of (1 - sum of exp(logprobs[:k])); 0 if top_k covers the full vocab.""" | ||
| 84 | - | ||
| 85 | - | ||
@runtime_checkable
class ScoringBackend(Protocol):
    """Logit-level access to a loaded model."""

    def logprob_of(self, prompt: str, completion: str) -> float:
        """Summed log-probability (nats) of ``completion`` given ``prompt``.

        Only the completion's tokens are scored; the prompt contributes
        nothing. Because the sum grows per token, longer completions are
        monotonically more negative — callers divide by length when they
        need a rate.
        """
        ...

    def rolling_logprob(self, text: str) -> RollingLogprob:
        """Per-token logprobs across the whole of ``text``.

        The analogue of lm-eval's ``loglikelihood_rolling``; feeds the
        held-out perplexity comparisons (B1 SIS, C2).
        """
        ...

    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
        """Distribution over the token that follows ``prompt``.

        Truncated to the ``top_k`` most likely tokens to bound memory;
        divergence math over the slice accepts the (typically
        negligible) error versus full-vocab KL.
        """
        ...
| 116 | - | ||
| 117 | - | ||
@runtime_checkable
class DifferentialBackend(Protocol):
    """Base + fine-tuned views over a single loaded model.

    Idiomatic usage::

        with backend.as_base() as base_view:
            p_base = base_view.next_token_dist(prompt)
        with backend.as_finetuned() as ft_view:
            p_ft = ft_view.next_token_dist(prompt)

    Implementations toggle PEFT adapters via
    :meth:`peft.PeftModel.set_adapter` / :meth:`disable_adapter`, so
    both views share the same loaded weights and KV cache layout.

    Invariant: the two views are mutually exclusive — a caller still
    holding a ``base_view`` after entering the ``as_finetuned`` context
    is a programmer error, and implementations MUST detect that and
    raise.
    """

    def as_base(self) -> AbstractContextManager[_ScoringModel]: ...

    def as_finetuned(self) -> AbstractContextManager[_ScoringModel]: ...
| 141 | - | ||
| 142 | - | ||
@runtime_checkable
class ScalableDifferentialBackend(DifferentialBackend, Protocol):
    """A differential backend that can also scale the LoRA additive term.

    LoRA adds ``(alpha/r) · B @ A`` on top of a base weight matrix; for
    everything inside the returned ``with`` block that additive term is
    multiplied by ``lam``:

    - ``lam = 0.0`` behaves like :meth:`as_base`,
    - ``lam = 1.0`` behaves like :meth:`as_finetuned`,
    - ``lam = 1.25`` overshoots — useful for N2 AdapterAblation's
      response-curve measurement.

    Only the HF backend ships an implementation in v0.1. Probes that
    need scaling check ``isinstance(backend, ScalableDifferentialBackend)``
    at runtime and SKIP gracefully when unavailable.
    """

    def as_scaled_adapter(self, lam: float) -> AbstractContextManager[_ScoringModel]: ...
| 162 | - | ||
| 163 | - | ||
@runtime_checkable
class NullCalibratedBackend(DifferentialBackend, Protocol):
    """A differential backend that can produce a "null adapter" view.

    A null adapter mirrors the real adapter's structure (rank, alpha,
    target modules) but draws its weights from a zero-mean Gaussian.
    Probing that view measures how much signal pure noise produces —
    the baseline distribution that serves as the denominator of every
    numeric probe's z-score.

    The context manager takes a ``seed`` so calibration runs are
    reproducible and multiple independent null samples can be drawn to
    estimate ``std``.

    Implementations MUST restore the real adapter on exit — including
    on exceptions — so callers can freely interleave null and real
    calibrations within the same backend lifetime.
    """

    def as_null_adapter(
        self, seed: int, *, init_scale: float = 0.02
    ) -> AbstractContextManager[_ScoringModel]: ...
| 186 | - | ||
| 187 | - | ||
# The context object yielded by differential backends must satisfy both
# Model and ScoringBackend. mypy has no intersection types, so the
# combination is spelled out explicitly as its own Protocol.
@runtime_checkable
class _ScoringModel(Model, ScoringBackend, Protocol):
    """A Model that also exposes ScoringBackend."""

    ...


ScoringModel = _ScoringModel
"""Public alias for the intersection ``Model & ScoringBackend``.

Backend and probe implementations import this to annotate variables of
the combined type.
"""
sway/src/dlm_sway/core/sections.pydeleted@@ -1,76 +0,0 @@ | |||
| 1 | -"""Minimal section contract for attribution probes. | ||
| 2 | - | ||
| 3 | -The flagship B1 ``section_internalization`` probe needs *structured* | ||
| 4 | -input — a section has an id, a kind, content text, and possibly some | ||
| 5 | -Q/A pairs or chosen/rejected triples. sway defines this shape here so | ||
| 6 | -the probes stay oblivious to the upstream (``.dlm`` parser, custom | ||
| 7 | -loaders, synthetic test fixtures). | ||
| 8 | - | ||
| 9 | -Field names are aligned with :mod:`dlm.doc.sections` but this module | ||
| 10 | -does not import ``dlm`` — the bridge at | ||
| 11 | -:mod:`dlm_sway.integrations.dlm` does the adaptation. | ||
| 12 | -""" | ||
| 13 | - | ||
| 14 | -from __future__ import annotations | ||
| 15 | - | ||
| 16 | -from dataclasses import dataclass, field | ||
| 17 | -from typing import Literal | ||
| 18 | - | ||
| 19 | -SectionKind = Literal["prose", "instruction", "preference"] | ||
| 20 | - | ||
| 21 | - | ||
| 22 | -@dataclass(frozen=True, slots=True) | ||
| 23 | -class SectionProbe: | ||
| 24 | - """A ``(prompt, gold)`` pair lifted from an INSTRUCTION section.""" | ||
| 25 | - | ||
| 26 | - prompt: str | ||
| 27 | - gold: str | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -@dataclass(frozen=True, slots=True) | ||
| 31 | -class SectionPreference: | ||
| 32 | - """A ``(prompt, chosen, rejected)`` triple from a PREFERENCE section.""" | ||
| 33 | - | ||
| 34 | - prompt: str | ||
| 35 | - chosen: str | ||
| 36 | - rejected: str | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -@dataclass(frozen=True, slots=True) | ||
| 40 | -class Section: | ||
| 41 | - """One typed chunk of a training document. | ||
| 42 | - | ||
| 43 | - Attributes | ||
| 44 | - ---------- | ||
| 45 | - id: | ||
| 46 | - Content-addressed identifier. ``.dlm`` uses a 16-hex-char | ||
| 47 | - sha256 prefix; sway doesn't enforce a format. | ||
| 48 | - kind: | ||
| 49 | - Discriminator for which of :attr:`probes` / | ||
| 50 | - :attr:`preferences` / :attr:`content` is the primary signal. | ||
| 51 | - content: | ||
| 52 | - Raw section text. Always populated; used by the rolling-PPL | ||
| 53 | - path for PROSE sections. | ||
| 54 | - probes: | ||
| 55 | - For INSTRUCTION: parsed Q/A pairs. Empty tuple for others. | ||
| 56 | - preferences: | ||
| 57 | - For PREFERENCE: parsed chosen/rejected triples. Empty otherwise. | ||
| 58 | - tag: | ||
| 59 | - Optional free-form label for the section (e.g., "intro", | ||
| 60 | - "api-reference"). Surfaces in per-section reports. | ||
| 61 | - """ | ||
| 62 | - | ||
| 63 | - id: str | ||
| 64 | - kind: SectionKind | ||
| 65 | - content: str | ||
| 66 | - probes: tuple[SectionProbe, ...] = field(default_factory=tuple) | ||
| 67 | - preferences: tuple[SectionPreference, ...] = field(default_factory=tuple) | ||
| 68 | - tag: str | None = None | ||
| 69 | - | ||
| 70 | - | ||
def filter_kinds(
    sections: tuple[Section, ...], kinds: tuple[SectionKind, ...]
) -> tuple[Section, ...]:
    """Keep only the sections whose ``kind`` appears in ``kinds``."""
    wanted = frozenset(kinds)
    return tuple(section for section in sections if section.kind in wanted)
sway/src/dlm_sway/integrations/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""Optional integrations with upstream fine-tuning tools.""" | ||
sway/src/dlm_sway/integrations/dlm/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""DLM project integration. Imports the ``dlm`` package; requires ``dlm-sway[dlm]``.""" | ||
sway/src/dlm_sway/integrations/dlm/autogen.pydeleted@@ -1,191 +0,0 @@ | |||
| 1 | -"""Auto-generate a ``sway.yaml`` from a ``.dlm`` document. | ||
| 2 | - | ||
| 3 | -Walks the parsed sections and emits one entry per primitive sway ships: | ||
| 4 | -the full 11-primitive battery wired up against the document's own | ||
| 5 | -content. The result is a YAML artifact the user commits alongside their | ||
| 6 | -``.dlm`` and diffs in PRs. | ||
| 7 | - | ||
| 8 | -The generated spec includes a ``dlm_source`` field that the suite loader | ||
| 9 | -uses to pick up :class:`~dlm_sway.core.sections.Section` data at run | ||
| 10 | -time — probes that need sections (B1, B3, C3) then work against the | ||
| 11 | -typed structure instead of re-parsing text. | ||
| 12 | -""" | ||
| 13 | - | ||
| 14 | -from __future__ import annotations | ||
| 15 | - | ||
| 16 | -from pathlib import Path | ||
| 17 | -from typing import Any | ||
| 18 | - | ||
| 19 | -import yaml | ||
| 20 | - | ||
| 21 | -from dlm_sway.core.errors import SwayError | ||
| 22 | -from dlm_sway.core.sections import Section | ||
| 23 | -from dlm_sway.integrations.dlm.resolver import DlmHandle, resolve_dlm | ||
| 24 | - | ||
| 25 | - | ||
def write_sway_yaml(dlm_path: Path, out: Path) -> None:
    """Resolve ``dlm_path``, assemble the suite spec, and dump it as YAML to ``out``."""
    resolved = resolve_dlm(dlm_path)
    # A suite spec is meaningless without a trained adapter to probe.
    if resolved.adapter_path is None:
        raise SwayError(
            f"{dlm_path}: no trained adapter found at ~/.dlm/store/{resolved.dlm_id}/adapter; "
            "train the document with `dlm train` before generating a sway suite."
        )
    spec = build_spec_dict(resolved, dlm_source=str(dlm_path.resolve()))
    dumped = yaml.safe_dump(spec, sort_keys=False)
    out.write_text(dumped, encoding="utf-8")
| 36 | - | ||
| 37 | - | ||
def build_spec_dict(handle: DlmHandle, *, dlm_source: str | None = None) -> dict[str, Any]:
    """Build a sway.yaml-shaped dict from a :class:`DlmHandle`."""
    adapter = str(handle.adapter_path) if handle.adapter_path else None
    spec: dict[str, Any] = {
        "version": 1,
        "models": {
            "base": {"kind": "hf", "base": handle.base_model},
            "ft": {"kind": "hf", "base": handle.base_model, "adapter": adapter},
        },
        "defaults": {"seed": 0, "differential": True},
        "suite": _build_suite(handle.sections),
    }
    # Recorded so the suite loader can pick up typed Section data at run time.
    if dlm_source is not None:
        spec["dlm_source"] = dlm_source
    return spec
| 55 | - | ||
| 56 | - | ||
def _build_suite(sections: tuple[Section, ...]) -> list[dict[str, Any]]:
    """Assemble the full probe battery for the given sections.

    The ordering matters: ``null_adapter`` first so every downstream
    probe's z-score threshold has stats to consult.

    Parameters
    ----------
    sections:
        Typed sections from the resolved document. Which probes get
        emitted depends on which section kinds (and Q/A / preference
        payloads) are present.

    Returns
    -------
    list[dict[str, Any]]
        YAML-ready probe entries, one dict per probe.
    """
    # All (prompt, gold) pairs from INSTRUCTION sections, in document order.
    instruction_probes: list[tuple[str, str]] = [
        (p.prompt, p.gold) for s in sections if s.kind == "instruction" for p in s.probes
    ]
    prose_prompts: list[str] = []
    for s in sections:
        if s.kind == "prose" and s.content.strip():
            # Use the section's leading sentence as a natural completion prompt.
            first_sentence = s.content.split(".")[0].strip()
            if first_sentence:
                prose_prompts.append(first_sentence + ".")

    # Prefer instruction prompts for KL (falling back to prose openers),
    # and prose openers for style (falling back to instruction prompts).
    kl_prompts = [q for q, _ in instruction_probes][:16] or prose_prompts[:16]
    style_prompts = prose_prompts[:8] or [q for q, _ in instruction_probes][:8]

    suite: list[dict[str, Any]] = []

    # Baseline calibration — always first.
    suite.append({"name": "null_baseline", "kind": "null_adapter", "runs": 3})

    # Adherence.
    if kl_prompts:
        suite.append(
            {
                "name": "delta_kl_doc",
                "kind": "delta_kl",
                "prompts": kl_prompts,
                "assert_mean_gte": 0.02,
            }
        )
    if instruction_probes:
        suite.append(
            {
                "name": "revert_check",
                "kind": "adapter_revert",
                "cases": [
                    {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)}
                    for q, a in instruction_probes[:8]
                ],
                "assert_revert_rate_lt": 0.3,
            }
        )
    if kl_prompts:
        suite.append(
            {
                "name": "prompt_collapse",
                "kind": "prompt_collapse",
                "prompts": kl_prompts[:4],
                "context_lengths": [0, 256, 512, 1024],
                "assert_half_life_tokens": 300,
            }
        )

    # Attribution. Per-section attribution needs at least two sections
    # to have anything to contrast.
    if len(sections) >= 2:
        suite.append(
            {
                "name": "section_attribution",
                "kind": "section_internalization",
                "per_section_threshold": 0.05,
            }
        )
    if instruction_probes:
        suite.append(
            {
                "name": "paraphrase_invariance",
                "kind": "paraphrase_invariance",
                "cases": [
                    {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)}
                    for q, a in instruction_probes[:6]
                ],
            }
        )
    has_preferences = any(s.kind == "preference" and s.preferences for s in sections)
    if has_preferences:
        suite.append(
            {
                "name": "preference_flip",
                "kind": "preference_flip",
                "assert_flip_rate_gte": 0.7,
            }
        )

    # Calibration. general_knowledge is unconditional — it needs no
    # document content.
    if style_prompts:
        suite.append(
            {
                "name": "style_shift",
                "kind": "style_fingerprint",
                "prompts": style_prompts,
            }
        )
    suite.append({"name": "general_knowledge", "kind": "calibration_drift"})
    if any(s.kind == "prose" for s in sections):
        suite.append(
            {
                "name": "verbatim_leak",
                "kind": "leakage",
                "prefix_chars": 128,
                "continuation_chars": 256,
            }
        )

    # Signature ablation — goes last because it's the most expensive.
    if kl_prompts:
        suite.append(
            {
                "name": "adapter_ablation",
                "kind": "adapter_ablation",
                "prompts": kl_prompts[:6],
                "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
            }
        )

    return suite
| 177 | - | ||
| 178 | - | ||
def _auto_paraphrases(prompt: str) -> list[str]:
    """Small, deterministic paraphrase set used when authors don't supply one.

    Purely heuristic — just enough to detect "did the model memorize
    the exact wording". Real paraphrase generation lives behind the
    ``semsim`` extra.
    """
    core = prompt.rstrip("?. ")
    return [
        f"Could you explain: {core}?",
        f"I'd like to know — {core}.",
        f"Please describe: {core}.",
    ]
sway/src/dlm_sway/integrations/dlm/resolver.pydeleted@@ -1,243 +0,0 @@ | |||
| 1 | -"""Resolve a ``.dlm`` file to the artifacts sway needs. | ||
| 2 | - | ||
| 3 | -Imports ``dlm.*`` — requires the ``dlm-sway[dlm]`` extra. Everything | ||
| 4 | -outside this package is oblivious to dlm's internal shape; the bridge | ||
| 5 | -is the only place that knows, e.g., that a dlm section carries a | ||
| 6 | -``kind`` field named ``type`` or that adapters live at | ||
| 7 | -``adapter/versions/vNNNN/``. | ||
| 8 | -""" | ||
| 9 | - | ||
| 10 | -from __future__ import annotations | ||
| 11 | - | ||
| 12 | -import hashlib | ||
| 13 | -from dataclasses import dataclass | ||
| 14 | -from pathlib import Path | ||
| 15 | - | ||
| 16 | -from dlm_sway.core.errors import SwayError | ||
| 17 | -from dlm_sway.core.sections import ( | ||
| 18 | - Section, | ||
| 19 | - SectionKind, | ||
| 20 | - SectionPreference, | ||
| 21 | - SectionProbe, | ||
| 22 | -) | ||
| 23 | - | ||
| 24 | - | ||
| 25 | -@dataclass(frozen=True, slots=True) | ||
| 26 | -class DlmHandle: | ||
| 27 | - """Everything the sway bridge pulls out of a ``.dlm`` file. | ||
| 28 | - | ||
| 29 | - Attributes | ||
| 30 | - ---------- | ||
| 31 | - dlm_id: | ||
| 32 | - Stable identifier from the frontmatter. | ||
| 33 | - base_model: | ||
| 34 | - Either a HF id (``qwen2.5-1.5b``) or an ``hf:org/name`` escape | ||
| 35 | - hatch, taken verbatim from the frontmatter. | ||
| 36 | - adapter_path: | ||
| 37 | - Directory containing the current trained PEFT adapter (resolved | ||
| 38 | - via dlm's own ``StorePath.for_dlm``). ``None`` if the document | ||
| 39 | - hasn't been trained yet. | ||
| 40 | - sections: | ||
| 41 | - Typed sections ready for sway's probes. | ||
| 42 | - doc_text: | ||
| 43 | - Concatenated raw content of all sections. Used by probes that | ||
| 44 | - need a whole-document stylistic reference (C1). | ||
| 45 | - """ | ||
| 46 | - | ||
| 47 | - dlm_id: str | ||
| 48 | - base_model: str | ||
| 49 | - adapter_path: Path | None | ||
| 50 | - sections: tuple[Section, ...] | ||
| 51 | - doc_text: str | ||
| 52 | - | ||
| 53 | - | ||
def resolve_dlm(dlm_path: Path) -> DlmHandle:
    """Parse ``dlm_path`` and return a :class:`DlmHandle`.

    Raises :class:`~dlm_sway.core.errors.SwayError` with a clear message
    when the file is malformed or when the resolved adapter path doesn't
    exist on disk.
    """
    # Deferred import: the dlm extra is optional, so fail with guidance.
    try:
        from dlm.doc.parser import parse_file as dlm_parse_file
    except ImportError as exc:
        raise SwayError("dlm package not installed — run: pip install 'dlm-sway[dlm]'") from exc

    parsed = dlm_parse_file(dlm_path)
    frontmatter = parsed.frontmatter
    translated = tuple(_translate_section(raw) for raw in parsed.sections)

    return DlmHandle(
        dlm_id=frontmatter.dlm_id,
        base_model=_resolve_base_model_to_hf_id(frontmatter.base_model),
        adapter_path=_resolve_adapter_path(frontmatter.dlm_id),
        sections=translated,
        doc_text="\n\n".join(section.content for section in translated),
    )
| 81 | - | ||
| 82 | - | ||
def _resolve_base_model_to_hf_id(base_model: str) -> str:
    """Translate dlm's base-model *key* to a HuggingFace repo id.

    dlm's frontmatter stores registry keys like ``smollm2-135m`` which
    resolve to ``HuggingFaceTB/SmolLM2-135M-Instruct``. sway's backends
    call ``AutoModelForCausalLM.from_pretrained`` directly and need the
    HF id. The ``hf:org/name`` escape hatch passes through unchanged;
    any resolution failure falls back to returning the raw key.
    """
    if base_model.startswith("hf:"):
        return base_model.removeprefix("hf:")
    try:
        from dlm.base_models import resolve as resolve_base
    except ImportError:
        # dlm not installed — let the backend try the key verbatim.
        return base_model
    try:
        spec = resolve_base(base_model)
    except Exception:  # noqa: BLE001 — unknown dlm errors
        return base_model
    hf_id = getattr(spec, "hf_id", None)
    return str(hf_id) if hf_id else base_model
| 103 | - | ||
| 104 | - | ||
def _resolve_adapter_path(dlm_id: str) -> Path | None:
    """Locate the current adapter directory for ``dlm_id``.

    Uses dlm's module-level ``for_dlm`` helper if available, else falls
    back to the canonical ``~/.dlm/store/<dlm_id>/adapter/current.txt``
    pointer. Returns ``None`` if no adapter has been trained yet.
    """
    # Primary path: use dlm's own store-path helpers.
    try:
        from dlm.store.paths import for_dlm as _for_dlm
    except ImportError:
        _for_dlm = None

    if _for_dlm is not None:
        try:
            store = _for_dlm(dlm_id)
        except Exception:  # noqa: BLE001 — unknown dlm exception shapes
            store = None
        if store is not None:
            try:
                resolved = store.resolve_current_adapter()
            except (AttributeError, FileNotFoundError):
                resolved = None
            # Re-check on disk: dlm may hand back a stale pointer.
            if resolved is not None and Path(resolved).exists():
                return Path(resolved)

    # Manual fallback. The ``current.txt`` pointer is relative to the
    # **store root**, not to current.txt's parent dir — so go up one level.
    import os

    home = Path(os.environ.get("DLM_HOME", "~/.dlm")).expanduser()
    store_root = home / "store" / dlm_id
    current_file = store_root / "adapter" / "current.txt"
    if current_file.exists():
        pointer = current_file.read_text(encoding="utf-8").strip()
        candidate = (store_root / pointer).resolve()
        if candidate.exists():
            return candidate
    # Nothing found anywhere: the document has no trained adapter yet.
    return None
| 144 | - | ||
| 145 | - | ||
def _translate_section(dlm_section: object) -> Section:
    """Adapt a ``dlm.doc.sections.Section`` to sway's section type.

    dlm's Section dataclass names the discriminator ``type`` (not
    ``kind``) and keeps instruction/preference content as raw markdown;
    dlm's dedicated parsers (``parse_instruction_body``,
    ``parse_preference_body``) are reused downstream so future dlm
    syntax additions land in sway for free.
    """
    # Current dlm spells the attribute ``type``; older revisions said ``kind``.
    kind = _normalize_kind(getattr(dlm_section, "type", getattr(dlm_section, "kind", None)))
    content = str(getattr(dlm_section, "content", ""))
    explicit_id = getattr(dlm_section, "section_id", None) or getattr(dlm_section, "id", None)
    section_id = str(explicit_id or _content_hash(content))
    tag = getattr(dlm_section, "tag", None)

    probes: tuple[SectionProbe, ...] = ()
    preferences: tuple[SectionPreference, ...] = ()
    if kind == "instruction":
        probes = tuple(_parse_instruction(content, section_id=section_id))
    elif kind == "preference":
        preferences = tuple(_parse_preference(content, section_id=section_id))

    return Section(
        id=section_id,
        kind=kind,
        content=content,
        probes=probes,
        preferences=preferences,
        tag=tag if isinstance(tag, str) else None,
    )
| 181 | - | ||
| 182 | - | ||
| 183 | -def _normalize_kind(raw: object) -> SectionKind: | ||
| 184 | - """Map dlm's SectionType/str to sway's lowercase kind.""" | ||
| 185 | - if raw is None: | ||
| 186 | - return "prose" | ||
| 187 | - value = str(raw).lower() | ||
| 188 | - # dlm uses uppercase StrEnum values like "PROSE"; normalize. | ||
| 189 | - if value.endswith("prose") or "prose" in value: | ||
| 190 | - return "prose" | ||
| 191 | - if "instruction" in value: | ||
| 192 | - return "instruction" | ||
| 193 | - if "preference" in value: | ||
| 194 | - return "preference" | ||
| 195 | - return "prose" | ||
| 196 | - | ||
| 197 | - | ||
| 198 | -def _parse_instruction(content: str, *, section_id: str) -> list[SectionProbe]: | ||
| 199 | - """Pull (Q, A) pairs out of a dlm INSTRUCTION section body. | ||
| 200 | - | ||
| 201 | - Delegates to dlm's own ``parse_instruction_body`` so syntax additions | ||
| 202 | - land in sway without code changes here. Falls back to an empty list | ||
| 203 | - on parse errors — the probe will fail gracefully. | ||
| 204 | - """ | ||
| 205 | - try: | ||
| 206 | - from dlm.data.instruction_parser import parse_instruction_body | ||
| 207 | - except ImportError: | ||
| 208 | - return [] | ||
| 209 | - try: | ||
| 210 | - pairs = parse_instruction_body(content, section_id=section_id) | ||
| 211 | - except Exception: # noqa: BLE001 — dlm raises InstructionParseError | ||
| 212 | - return [] | ||
| 213 | - out: list[SectionProbe] = [] | ||
| 214 | - for p in pairs: | ||
| 215 | - q = getattr(p, "question", getattr(p, "prompt", "")) | ||
| 216 | - a = getattr(p, "answer", getattr(p, "gold", "")) | ||
| 217 | - if q and a: | ||
| 218 | - out.append(SectionProbe(prompt=str(q), gold=str(a))) | ||
| 219 | - return out | ||
| 220 | - | ||
| 221 | - | ||
| 222 | -def _parse_preference(content: str, *, section_id: str) -> list[SectionPreference]: | ||
| 223 | - """Pull (prompt, chosen, rejected) triples out of a PREFERENCE body.""" | ||
| 224 | - try: | ||
| 225 | - from dlm.data.preference_parser import parse_preference_body | ||
| 226 | - except ImportError: | ||
| 227 | - return [] | ||
| 228 | - try: | ||
| 229 | - triples = parse_preference_body(content, section_id=section_id) | ||
| 230 | - except Exception: # noqa: BLE001 — dlm raises PreferenceParseError | ||
| 231 | - return [] | ||
| 232 | - out: list[SectionPreference] = [] | ||
| 233 | - for t in triples: | ||
| 234 | - p = str(getattr(t, "prompt", "")) | ||
| 235 | - c = str(getattr(t, "chosen", "")) | ||
| 236 | - rej = str(getattr(t, "rejected", "")) | ||
| 237 | - if p and c and rej: | ||
| 238 | - out.append(SectionPreference(prompt=p, chosen=c, rejected=rej)) | ||
| 239 | - return out | ||
| 240 | - | ||
| 241 | - | ||
| 242 | -def _content_hash(content: str) -> str: | ||
| 243 | - return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16] | ||
sway/src/dlm_sway/probes/__init__.pydeleted@@ -1,27 +0,0 @@ | |||
| 1 | -"""Probe primitives. Each module in this package implements one primitive. | ||
| 2 | - | ||
| 3 | -Importing this package eagerly imports every probe module so their | ||
| 4 | -``__init_subclass__`` hooks populate the registry. If you're hitting | ||
| 5 | -"unknown probe kind" from :func:`dlm_sway.probes.base.build_probe`, the | ||
| 6 | -fix is to ``import dlm_sway.probes`` before building the probe — which | ||
| 7 | -this ``__init__`` does for you. | ||
| 8 | -""" | ||
| 9 | - | ||
| 10 | -from __future__ import annotations | ||
| 11 | - | ||
| 12 | -# Register every shipped probe with the central registry by importing | ||
| 13 | -# its module. Order is not load-bearing for registration but matches the | ||
| 14 | -# categorical grouping in :mod:`dlm_sway.core.result`. | ||
| 15 | -from dlm_sway.probes import ( # noqa: F401 — imports register the probes | ||
| 16 | - adapter_ablation, | ||
| 17 | - adapter_revert, | ||
| 18 | - calibration_drift, | ||
| 19 | - delta_kl, | ||
| 20 | - leakage, | ||
| 21 | - null_adapter, | ||
| 22 | - paraphrase_invariance, | ||
| 23 | - preference_flip, | ||
| 24 | - prompt_collapse, | ||
| 25 | - section_internalization, | ||
| 26 | - style_fingerprint, | ||
| 27 | -) | ||
sway/src/dlm_sway/probes/_calibration_pack.pydeleted@@ -1,63 +0,0 @@ | |||
| 1 | -"""A small, built-in general-knowledge probe pack for C2. | ||
| 2 | - | ||
| 3 | -Each item is a ``(prompt, gold)`` pair where ``gold`` is the next few | ||
| 4 | -tokens a competent base model should assign high probability to. The | ||
| 5 | -items are deliberately *factually trivial* — the point isn't "does the | ||
| 6 | -model know this?" but "did the fine-tune forget this?" — so the pack | ||
| 7 | -skews toward grade-school geography, chemistry, arithmetic, and | ||
| 8 | -high-frequency idiom. | ||
| 9 | - | ||
| 10 | -A real v1.0 will ship a 200-item pack sliced from TriviaQA + SQuAD + | ||
| 11 | -OpenBookQA. This 30-item seed lets the probe ship today and catches the | ||
| 12 | -most egregious over-fit cases. | ||
| 13 | -""" | ||
| 14 | - | ||
| 15 | -from __future__ import annotations | ||
| 16 | - | ||
| 17 | -from typing import Final | ||
| 18 | - | ||
# One evaluation item: (prompt, gold continuation). Note every gold
# string deliberately begins with a leading space — presumably so it
# aligns with tokenizer word boundaries; confirm against the scorer.
CalibrationItem = tuple[str, str]

BUILT_IN_PACK: Final[tuple[CalibrationItem, ...]] = (
    # Geography
    ("The capital of France is", " Paris"),
    ("The capital of Japan is", " Tokyo"),
    ("The largest ocean on Earth is the", " Pacific"),
    ("Mount Everest is located on the border of Nepal and", " China"),
    ("The longest river in South America is the", " Amazon"),
    # Natural sciences
    ("Water freezes at zero degrees", " Celsius"),
    ("The chemical symbol for gold is", " Au"),
    ("Light travels faster than", " sound"),
    ("Plants convert sunlight into energy through", " photosynthesis"),
    ("The Earth orbits around the", " Sun"),
    # Arithmetic
    ("Two plus two equals", " four"),
    ("Ten times ten equals", " one hundred"),
    ("Half of one hundred is", " fifty"),
    ("A dozen means", " twelve"),
    # Language and idiom
    ("A rose by any other name would smell as", " sweet"),
    ("To be or not to be, that is the", " question"),
    ("The early bird catches the", " worm"),
    ("Actions speak louder than", " words"),
    ("A picture is worth a thousand", " words"),
    # History
    ("World War II ended in the year", " 1945"),
    ("The first president of the United States was", " George Washington"),
    ("The Berlin Wall fell in", " 1989"),
    # Biology
    ("Humans have twenty", " fingers and toes"),
    ("The human body has two", " lungs"),
    ("Blood is pumped through the body by the", " heart"),
    # Technology
    ("HTML stands for HyperText", " Markup Language"),
    ("The World Wide Web was invented by Tim", " Berners-Lee"),
    # Miscellaneous
    ("One year has", " 365 days"),
    ("A week has seven", " days"),
    ("There are seven colors in a", " rainbow"),
)
"""30 items covering geography, science, arithmetic, language, history,
biology, and technology. Pulled from public-domain grade-school facts so
there's no licensing concern about shipping with the wheel."""
sway/src/dlm_sway/probes/_divergence.pydeleted@@ -1,102 +0,0 @@ | |||
| 1 | -"""Shared math for divergence-based probes. | ||
| 2 | - | ||
| 3 | -Extracted so :mod:`delta_kl`, :mod:`adapter_ablation`, and any future | ||
| 4 | -probe operating on next-token distributions reuse the same aligned- | ||
| 5 | -top-k KL / JS computation. Having one implementation keeps the numerical | ||
| 6 | -treatment consistent across the report. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import math | ||
| 12 | -from typing import Literal | ||
| 13 | - | ||
| 14 | -import numpy as np | ||
| 15 | -from numpy.typing import NDArray | ||
| 16 | - | ||
| 17 | -from dlm_sway.core.scoring import TokenDist | ||
| 18 | - | ||
| 19 | -Divergence = Literal["kl", "js"] | ||
| 20 | - | ||
| 21 | - | ||
| 22 | -def aligned_probs( | ||
| 23 | - base: TokenDist, ft: TokenDist | ||
| 24 | -) -> tuple[NDArray[np.float64], NDArray[np.float64]]: | ||
| 25 | - """Return aligned probability vectors over the union of top-k tokens. | ||
| 26 | - | ||
| 27 | - Two ``TokenDist`` objects may surface different top-k indices if | ||
| 28 | - the two models disagree about the hot tokens. We build a shared | ||
| 29 | - support — ``union(base.token_ids, ft.token_ids)`` — and slot the | ||
| 30 | - known probabilities in. Unknown entries fall back to the | ||
| 31 | - per-distribution tail mass divided across the missing tokens, | ||
| 32 | - which is the maximum-entropy completion under the truncation. | ||
| 33 | - """ | ||
| 34 | - union_ids = np.union1d(base.token_ids, ft.token_ids) | ||
| 35 | - k = int(union_ids.size) | ||
| 36 | - | ||
| 37 | - base_probs = _to_support(base, union_ids, k) | ||
| 38 | - ft_probs = _to_support(ft, union_ids, k) | ||
| 39 | - | ||
| 40 | - # Normalize in case of floating noise from the fill-in. | ||
| 41 | - base_probs /= base_probs.sum() | ||
| 42 | - ft_probs /= ft_probs.sum() | ||
| 43 | - return base_probs, ft_probs | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -def _to_support(dist: TokenDist, support: NDArray[np.int64], k: int) -> NDArray[np.float64]: | ||
| 47 | - probs = np.exp(dist.logprobs.astype(np.float64)) | ||
| 48 | - out = np.zeros(k, dtype=np.float64) | ||
| 49 | - known_mass = float(probs.sum()) | ||
| 50 | - tail_mass = max(0.0, 1.0 - known_mass) | ||
| 51 | - | ||
| 52 | - id_to_idx = {int(tok): idx for idx, tok in enumerate(support.tolist())} | ||
| 53 | - missing = 0 | ||
| 54 | - for tok, p in zip(dist.token_ids.tolist(), probs.tolist(), strict=True): | ||
| 55 | - i = id_to_idx.get(int(tok)) | ||
| 56 | - if i is None: | ||
| 57 | - # Shouldn't happen given union construction. | ||
| 58 | - missing += 1 | ||
| 59 | - continue | ||
| 60 | - out[i] = float(p) | ||
| 61 | - | ||
| 62 | - # Spread the tail mass over the support entries that this dist | ||
| 63 | - # doesn't explicitly provide. Size of that set: | ||
| 64 | - n_unknown = int((out == 0.0).sum()) - missing | ||
| 65 | - if n_unknown > 0 and tail_mass > 0.0: | ||
| 66 | - per = tail_mass / n_unknown | ||
| 67 | - out[out == 0.0] = per | ||
| 68 | - | ||
| 69 | - return out | ||
| 70 | - | ||
| 71 | - | ||
def kl(p: NDArray[np.float64], q: NDArray[np.float64]) -> float:
    """KL(p || q) in nats. Zeros in ``p`` contribute nothing (0·log 0 = 0)."""
    support = p > 0.0
    q_floored = np.where(q > 0.0, q, 1e-12)
    terms = p[support] * (np.log(p[support]) - np.log(q_floored[support]))
    return float(terms.sum())


def js(p: NDArray[np.float64], q: NDArray[np.float64]) -> float:
    """Jensen-Shannon divergence: symmetric and bounded in [0, ln 2] nats.

    The fixed upper bound makes JS a friendlier default for thresholds
    than raw KL — no model-specific KL scale has to be known up front
    to pick a cutoff.
    """
    mid = 0.5 * (p + q)
    return 0.5 * (kl(p, mid) + kl(q, mid))
| 88 | - | ||
| 89 | - | ||
def divergence(base: TokenDist, ft: TokenDist, kind: Divergence = "js") -> float:
    """KL or JS between two ``TokenDist`` computed on a shared support."""
    p, q = aligned_probs(base, ft)
    if kind == "kl":
        # KL(ft || base) — "how much does ft diverge from base".
        return kl(q, p)
    if kind == "js":
        return js(p, q)
    raise ValueError(f"unknown divergence kind: {kind!r}")
| 98 | - | ||
| 99 | - | ||
def js_ln2() -> float:
    """Upper bound of JS in nats (ln 2); handy for normalization."""
    return math.log(2.0)
sway/src/dlm_sway/probes/adapter_ablation.pydeleted@@ -1,193 +0,0 @@ | |||
| 1 | -"""N2 AdapterAblation — the sway signature primitive. | ||
| 2 | - | ||
| 3 | -Scales the LoRA additive term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} | ||
| 4 | -and measures the mean divergence from the base distribution at each | ||
| 5 | -step. Fits a monotonic response curve; reports three shape metrics: | ||
| 6 | - | ||
| 7 | -- **linearity**: R² of a linear fit on ``(λ, mean_div)``. High means | ||
| 8 | - the adapter's effect scales predictably; low means it's "all or | ||
| 9 | - nothing" (degenerate). | ||
| 10 | -- **saturation_lambda**: the smallest λ at which divergence reaches | ||
| 11 | - 90% of the λ=1 value. Too low (<0.3) means the adapter fires at | ||
| 12 | - partial strength — fragile. Too high (>1.0) means the adapter is | ||
| 13 | - under-trained. | ||
| 14 | -- **overshoot**: divergence at λ=1.25 divided by λ=1.0. >1.05 is the | ||
| 15 | - healthy "pushing past 1 still moves the model" signal. An overshoot | ||
| 16 | - below 1.0 suggests collapse. | ||
| 17 | - | ||
| 18 | -This is the single novel primitive that no generic eval harness | ||
| 19 | -provides — sway's position next to the adapter math makes it possible. | ||
| 20 | - | ||
| 21 | -Requires the backend to implement | ||
| 22 | -:class:`~dlm_sway.core.scoring.ScalableDifferentialBackend`. Probes | ||
| 23 | -SKIP gracefully on backends that don't. | ||
| 24 | -""" | ||
| 25 | - | ||
| 26 | -from __future__ import annotations | ||
| 27 | - | ||
| 28 | -from typing import Literal | ||
| 29 | - | ||
| 30 | -import numpy as np | ||
| 31 | -from pydantic import Field | ||
| 32 | - | ||
| 33 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 34 | -from dlm_sway.core.scoring import ScalableDifferentialBackend | ||
| 35 | -from dlm_sway.probes._divergence import Divergence, divergence | ||
| 36 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 37 | - | ||
| 38 | - | ||
class AdapterAblationSpec(ProbeSpec):
    """Spec for the λ-sweep adapter-ablation probe (see module docstring)."""

    # Registry discriminator.
    kind: Literal["adapter_ablation"] = "adapter_ablation"
    # Prompts to score at each λ; an empty list makes run() return ERROR.
    prompts: list[str] = Field(default_factory=list)
    # λ sweep for the LoRA scale; at least 3 points are needed for the
    # curve-shape fit.
    lambdas: list[float] = Field(
        default_factory=lambda: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
        min_length=3,
    )
    # Divergence measure between reference and scaled distributions.
    divergence: Divergence = "js"
    # Top-k truncation of next-token distributions; None → RunContext's top_k.
    top_k: int | None = None
    # Healthy-band thresholds for the three shape metrics (R², sat_λ, overshoot).
    assert_linearity_gte: float = 0.85
    assert_saturation_between: tuple[float, float] = (0.3, 1.05)
    assert_overshoot_gte: float = 1.02
| 51 | - | ||
| 52 | - | ||
class AdapterAblationProbe(Probe):
    """Runs the λ sweep and scores the divergence curve's shape."""

    kind = "adapter_ablation"
    spec_cls = AdapterAblationSpec
    category = "ablation"

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        """Sweep the adapter scale and grade the response curve.

        Returns ERROR when no prompts are given, SKIP when the backend
        cannot scale the adapter, otherwise PASS/FAIL on the three
        shape metrics (linearity, saturation λ, overshoot).
        """
        assert isinstance(spec, AdapterAblationSpec)
        if not spec.prompts:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.ERROR,
                score=None,
                message="no prompts provided",
            )
        if not isinstance(ctx.backend, ScalableDifferentialBackend):
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message=(
                    "backend does not implement ScalableDifferentialBackend — "
                    "adapter ablation requires LoRA-scale access"
                ),
            )

        top_k = spec.top_k if spec.top_k is not None else ctx.top_k

        # Reference distributions at the smallest λ (adapter scaled to
        # zero → base). The reference is invariant across the sweep, so
        # compute it once per prompt up front instead of once per
        # (λ, prompt) pair as the naive nested loop would.
        # NOTE(review): this assumes next_token_dist is deterministic at
        # a fixed scale and prompt (it scores, it doesn't sample).
        lam_zero = min(spec.lambdas)
        with ctx.backend.as_scaled_adapter(lam_zero) as ref:
            ref_dists = [ref.next_token_dist(p, top_k=top_k) for p in spec.prompts]

        per_lambda: list[float] = []
        for lam in spec.lambdas:
            divs_for_lam: list[float] = []
            with ctx.backend.as_scaled_adapter(lam) as scaled:
                for prompt, ref_dist in zip(spec.prompts, ref_dists, strict=True):
                    scaled_dist = scaled.next_token_dist(prompt, top_k=top_k)
                    divs_for_lam.append(divergence(ref_dist, scaled_dist, kind=spec.divergence))
            per_lambda.append(float(np.mean(divs_for_lam)))

        lambdas_arr = np.asarray(spec.lambdas, dtype=np.float64)
        divs_arr = np.asarray(per_lambda, dtype=np.float64)

        linearity = _r_squared(lambdas_arr, divs_arr)
        saturation_lambda = _saturation_lambda(lambdas_arr, divs_arr)
        overshoot = _overshoot(lambdas_arr, divs_arr)

        # Pass when all three shape metrics land in their healthy bands.
        sat_lo, sat_hi = spec.assert_saturation_between
        ok_lin = linearity >= spec.assert_linearity_gte
        ok_sat = saturation_lambda is not None and sat_lo <= saturation_lambda <= sat_hi
        ok_over = overshoot >= spec.assert_overshoot_gte
        verdict = Verdict.PASS if (ok_lin and ok_sat and ok_over) else Verdict.FAIL

        # Blend the metrics into one clamped [0, 1] score (40/30/30 split).
        lin_score = max(0.0, min(1.0, linearity / max(spec.assert_linearity_gte, 1e-6)))
        over_score = max(0.0, min(1.0, (overshoot - 1.0) / 0.2))
        sat_score = 1.0 if ok_sat else 0.3
        score = 0.4 * lin_score + 0.3 * sat_score + 0.3 * over_score

        return ProbeResult(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=linearity,
            evidence={
                "lambdas": spec.lambdas,
                "mean_divergence_per_lambda": per_lambda,
                "linearity": linearity,
                "saturation_lambda": saturation_lambda,
                "overshoot": overshoot,
                "passed_linearity": ok_lin,
                "passed_saturation": ok_sat,
                "passed_overshoot": ok_over,
                "weight": spec.weight,
            },
            message=(
                f"R²={linearity:.2f}, sat_λ={saturation_lambda:.2f} "
                f"({'in' if ok_sat else 'out of'} band), overshoot={overshoot:.2f}"
                if saturation_lambda is not None
                else f"R²={linearity:.2f}, saturation undetected, overshoot={overshoot:.2f}"
            ),
        )
| 138 | - | ||
| 139 | - | ||
| 140 | -def _r_squared(x: np.ndarray, y: np.ndarray) -> float: | ||
| 141 | - """Coefficient of determination for a linear fit of ``y`` on ``x``.""" | ||
| 142 | - if x.size < 2: | ||
| 143 | - return 0.0 | ||
| 144 | - xm = float(x.mean()) | ||
| 145 | - ym = float(y.mean()) | ||
| 146 | - denom = float(((x - xm) ** 2).sum()) | ||
| 147 | - if denom == 0.0: | ||
| 148 | - return 0.0 | ||
| 149 | - slope = float(((x - xm) * (y - ym)).sum()) / denom | ||
| 150 | - intercept = ym - slope * xm | ||
| 151 | - y_pred = slope * x + intercept | ||
| 152 | - ss_res = float(((y - y_pred) ** 2).sum()) | ||
| 153 | - ss_tot = float(((y - ym) ** 2).sum()) | ||
| 154 | - if ss_tot == 0.0: | ||
| 155 | - return 1.0 | ||
| 156 | - return max(0.0, 1.0 - ss_res / ss_tot) | ||
| 157 | - | ||
| 158 | - | ||
| 159 | -def _saturation_lambda(lambdas: np.ndarray, divs: np.ndarray) -> float | None: | ||
| 160 | - """Smallest λ ≤ 1.0 at which divergence reaches 90% of div(λ=1).""" | ||
| 161 | - # Locate the index of λ=1.0 (or the closest entry ≤ 1.0). | ||
| 162 | - candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0] | ||
| 163 | - if candidates.size == 0: | ||
| 164 | - # Fall back to the largest λ ≤ 1.0. | ||
| 165 | - mask = lambdas <= 1.0 | ||
| 166 | - if not mask.any(): | ||
| 167 | - return None | ||
| 168 | - idx1 = int(np.argmax(lambdas * mask)) | ||
| 169 | - else: | ||
| 170 | - idx1 = int(candidates[0]) | ||
| 171 | - target = 0.9 * float(divs[idx1]) | ||
| 172 | - if target <= 0: | ||
| 173 | - return None | ||
| 174 | - for lam, d in zip(lambdas[: idx1 + 1], divs[: idx1 + 1], strict=False): | ||
| 175 | - if d >= target: | ||
| 176 | - return float(lam) | ||
| 177 | - return None | ||
| 178 | - | ||
| 179 | - | ||
| 180 | -def _overshoot(lambdas: np.ndarray, divs: np.ndarray) -> float: | ||
| 181 | - """``div(λ_max) / div(λ=1)``. Returns 1.0 if λ_max ≤ 1.0.""" | ||
| 182 | - idx_max = int(np.argmax(lambdas)) | ||
| 183 | - candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0] | ||
| 184 | - if candidates.size == 0: | ||
| 185 | - return 1.0 | ||
| 186 | - idx1 = int(candidates[0]) | ||
| 187 | - if idx_max == idx1: | ||
| 188 | - return 1.0 | ||
| 189 | - d1 = float(divs[idx1]) | ||
| 190 | - dmax = float(divs[idx_max]) | ||
| 191 | - if d1 <= 0: | ||
| 192 | - return 1.0 | ||
| 193 | - return dmax / d1 | ||
sway/src/dlm_sway/probes/adapter_revert.pydeleted@@ -1,178 +0,0 @@ | |||
| 1 | -"""A2 AdapterRevert — does the fine-tuned model drift back to base under pressure? | ||
| 2 | - | ||
| 3 | -For each test case the user provides a prompt, a "gold" answer (the | ||
| 4 | -adapter's intended response), and one or more adversarial paraphrases of | ||
| 5 | -the prompt. We generate base-model and ft-model completions on every | ||
| 6 | -paraphrase and ask: does the ft output cluster semantically with the | ||
| 7 | -base's output (revert) or with the gold (adhere)? | ||
| 8 | - | ||
| 9 | -Signal: ``revert_rate`` = fraction of (case, paraphrase) pairs where | ||
| 10 | -``cos(ft, base) > cos(ft, gold)``. A healthy fine-tune holds below 25%. | ||
| 11 | - | ||
| 12 | -Needs sentence embeddings. Without the ``semsim`` extra installed the | ||
| 13 | -probe returns :attr:`Verdict.SKIP` with a pip hint — deterministic | ||
| 14 | -n-gram fallbacks don't carry semantic equivalence reliably enough to | ||
| 15 | -drive a revert decision, and we'd rather be honest than lossy. | ||
| 16 | -""" | ||
| 17 | - | ||
| 18 | -from __future__ import annotations | ||
| 19 | - | ||
| 20 | -from typing import Any, Literal | ||
| 21 | - | ||
| 22 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 23 | - | ||
| 24 | -from dlm_sway.core.errors import BackendNotAvailableError | ||
| 25 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 26 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 27 | - | ||
| 28 | - | ||
class AdapterRevertCase(BaseModel):
    """One revert test case: a prompt, its gold answer, and adversarial paraphrases."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    prompt: str
    gold: str
    """What the adapter is supposed to produce."""
    # NOTE(review): the implicit empty default conflicts with
    # min_length=1 unless callers always supply paraphrases — pydantic
    # does not validate defaults unless told to; confirm intent.
    paraphrases: list[str] = Field(default_factory=list, min_length=1)
    """At least one paraphrase is required — revert is observed under
    reframing, not on the original prompt."""
| 39 | - reframing, not on the original prompt.""" | ||
| 40 | - | ||
| 41 | - | ||
class AdapterRevertSpec(ProbeSpec):
    """Spec for the adapter_revert probe (see module docstring)."""

    # Registry discriminator.
    kind: Literal["adapter_revert"] = "adapter_revert"
    # Revert cases to evaluate; an empty list makes run() return ERROR.
    cases: list[AdapterRevertCase] = Field(default_factory=list)
    # Generation budget per completion (base and fine-tuned alike).
    max_new_tokens: int = 64
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    """HF id of the embedder. Default is ~80 MB, CPU-friendly."""
    base_gold_similarity_cap: float = 0.75
    """Skip pairs where base and gold are trivially similar — those
    can't distinguish revert from adherence, and including them would
    inflate the revert rate with noise."""
    # PASS threshold: fraction of pairs where ft clusters with base.
    assert_revert_rate_lt: float = 0.25
| 54 | - | ||
class AdapterRevertProbe(Probe):
    """Measures whether fine-tuned outputs drift back toward base under paraphrase."""

    kind = "adapter_revert"
    spec_cls = AdapterRevertSpec
    category = "adherence"

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        """Score the revert rate across all (case, paraphrase) pairs.

        Returns ERROR with no cases, SKIP when the embedding extra is
        missing, WARN when every pair was dropped as trivially similar,
        else PASS/FAIL against ``assert_revert_rate_lt``.
        """
        assert isinstance(spec, AdapterRevertSpec)
        if not spec.cases:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.ERROR,
                score=None,
                message="no cases provided",
            )

        # No embedder installed → SKIP with the install hint rather
        # than a hard failure.
        try:
            embed = _load_embedder(spec.embedding_model)
        except BackendNotAvailableError as exc:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message=str(exc),
            )

        import numpy as np

        total = 0
        reverts = 0
        dropped_trivial = 0
        per_case: list[dict[str, Any]] = []
        for case in spec.cases:
            # One gold embedding per case, reused for every paraphrase.
            gold_vec = embed([case.gold])[0]
            for pp in case.paraphrases:
                # Generate both views' completions on the paraphrase,
                # then embed the two generations in one batch.
                with ctx.backend.as_base() as bv:
                    base_gen = bv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
                with ctx.backend.as_finetuned() as fv:
                    ft_gen = fv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed)
                vecs = embed([base_gen, ft_gen])
                base_vec, ft_vec = vecs[0], vecs[1]
                # Pairs where base already matches gold carry no
                # revert-vs-adhere signal — drop them (see spec docstring).
                base_gold = _cosine(base_vec, gold_vec)
                if base_gold > spec.base_gold_similarity_cap:
                    dropped_trivial += 1
                    continue
                cos_ft_base = _cosine(ft_vec, base_vec)
                cos_ft_gold = _cosine(ft_vec, gold_vec)
                total += 1
                # Revert = ft output sits semantically closer to base than to gold.
                if cos_ft_base > cos_ft_gold:
                    reverts += 1
                per_case.append(
                    {
                        "prompt": pp[:80],
                        "cos_ft_base": cos_ft_base,
                        "cos_ft_gold": cos_ft_gold,
                        "reverted": cos_ft_base > cos_ft_gold,
                    }
                )

        # Every pair was dropped as trivially similar → no verdict possible.
        if total == 0:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.WARN,
                score=0.5,
                message=(
                    f"all {dropped_trivial} cases had base≈gold (> "
                    f"{spec.base_gold_similarity_cap}) — no separable signal"
                ),
                evidence={"dropped_trivial": dropped_trivial, "weight": spec.weight},
            )

        rate = reverts / total
        verdict = Verdict.PASS if rate < spec.assert_revert_rate_lt else Verdict.FAIL
        # Linear score: 1.0 at rate 0, hitting 0.0 at the FAIL threshold.
        score = max(0.0, 1.0 - rate / max(spec.assert_revert_rate_lt, 1e-6))
        score = float(np.clip(score, 0.0, 1.0))

        return ProbeResult(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=rate,
            evidence={
                "revert_rate": rate,
                "reverts": reverts,
                "total": total,
                "dropped_trivial": dropped_trivial,
                "per_case": per_case[:8],  # cap to keep JSON bounded
                "weight": spec.weight,
            },
            message=f"revert_rate={rate:.2%} (reverts={reverts}/{total}, dropped_trivial={dropped_trivial})",
        )
| 149 | - | ||
| 150 | - | ||
def _load_embedder(model_id: str):  # type: ignore[no-untyped-def]
    """Return a callable ``list[str] -> np.ndarray`` of normalized embeddings."""
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        raise BackendNotAvailableError(
            "adapter_revert",
            extra="semsim",
            hint="adapter_revert relies on sentence embeddings.",
        ) from exc
    encoder = SentenceTransformer(model_id)

    def _embed(texts: list[str]):  # type: ignore[no-untyped-def]
        return encoder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

    return _embed
| 167 | - | ||
| 168 | - | ||
| 169 | -def _cosine(a: Any, b: Any) -> float: | ||
| 170 | - import numpy as np | ||
| 171 | - | ||
| 172 | - av = np.asarray(a, dtype=np.float64) | ||
| 173 | - bv = np.asarray(b, dtype=np.float64) | ||
| 174 | - na = float(np.linalg.norm(av)) | ||
| 175 | - nb = float(np.linalg.norm(bv)) | ||
| 176 | - if na == 0.0 or nb == 0.0: | ||
| 177 | - return 0.0 | ||
| 178 | - return float(np.dot(av, bv) / (na * nb)) | ||
sway/src/dlm_sway/probes/base.pydeleted@@ -1,131 +0,0 @@ | |||
| 1 | -"""Probe abstract base + per-kind registry. | ||
| 2 | - | ||
| 3 | -The registry is the extension point. Adding a new probe means: | ||
| 4 | - | ||
| 5 | -1. Subclass :class:`ProbeSpec` with a unique ``kind`` field (Literal). | ||
| 6 | -2. Subclass :class:`Probe` setting ``kind`` and ``spec_cls``. | ||
| 7 | -3. Importing the probe module at least once (its subclass hook registers | ||
| 8 | - itself). | ||
| 9 | - | ||
| 10 | -The runner uses :func:`build_probe` to map each raw spec dict to a | ||
| 11 | -``(Probe, ProbeSpec)`` pair. Validation errors are turned into | ||
| 12 | -:class:`~dlm_sway.core.errors.SpecValidationError` with the probe name | ||
| 13 | -as the source so error messages localize to the offending entry. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -from abc import ABC, abstractmethod | ||
| 19 | -from dataclasses import dataclass, field | ||
| 20 | -from typing import Any, ClassVar | ||
| 21 | - | ||
| 22 | -from pydantic import BaseModel, ConfigDict, ValidationError | ||
| 23 | - | ||
| 24 | -from dlm_sway.core.errors import SpecValidationError | ||
| 25 | -from dlm_sway.core.result import ProbeResult | ||
| 26 | -from dlm_sway.core.scoring import DifferentialBackend | ||
| 27 | -from dlm_sway.core.sections import Section | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -class ProbeSpec(BaseModel): | ||
| 31 | - """Common fields for every probe's spec entry in ``sway.yaml``.""" | ||
| 32 | - | ||
| 33 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 34 | - | ||
| 35 | - name: str | ||
| 36 | - """Unique within a suite; surfaces in the report.""" | ||
| 37 | - kind: str | ||
| 38 | - """Discriminator — must match a registered :class:`Probe` subclass.""" | ||
| 39 | - enabled: bool = True | ||
| 40 | - """If ``False`` the runner records a :class:`~dlm_sway.core.result.Verdict.SKIP`.""" | ||
| 41 | - weight: float = 1.0 | ||
| 42 | - """Weight inside the probe's component (adherence / attribution / …).""" | ||
| 43 | - | ||
| 44 | - | ||
| 45 | -@dataclass(frozen=True, slots=True) | ||
| 46 | -class RunContext: | ||
| 47 | - """What a probe can read beyond its own spec. | ||
| 48 | - | ||
| 49 | - Probes should receive exactly what they need and nothing more; fat | ||
| 50 | - contexts encourage coupling between unrelated probes. | ||
| 51 | - | ||
| 52 | - Attributes | ||
| 53 | - ---------- | ||
| 54 | - backend: | ||
| 55 | - The differential backend holding base + fine-tuned views. | ||
| 56 | - seed: | ||
| 57 | - Seed for deterministic probe RNGs (paraphrase sampling, etc). | ||
| 58 | - top_k: | ||
| 59 | - Default truncation for next-token distributions. | ||
| 60 | - sections: | ||
| 61 | - Optional list of typed sections (populated by the .dlm bridge; | ||
| 62 | - ``None`` when sway is invoked against bare HF+PEFT). | ||
| 63 | - doc_text: | ||
| 64 | - Raw document text, if available. | ||
| 65 | - null_stats: | ||
| 66 | - Null-adapter baseline stats for z-score calibration, keyed by | ||
| 67 | - probe *kind*. Populated by the runner after it's executed the | ||
| 68 | - ``null_adapter`` probe (if configured). | ||
| 69 | - """ | ||
| 70 | - | ||
| 71 | - backend: DifferentialBackend | ||
| 72 | - seed: int = 0 | ||
| 73 | - top_k: int = 256 | ||
| 74 | - sections: tuple[Section, ...] | None = None | ||
| 75 | - doc_text: str | None = None | ||
| 76 | - null_stats: dict[str, dict[str, float]] = field(default_factory=dict) | ||
| 77 | - | ||
| 78 | - | ||
| 79 | -_REGISTRY: dict[str, type[Probe]] = {} | ||
| 80 | - | ||
| 81 | - | ||
| 82 | -class Probe(ABC): | ||
| 83 | - """Concrete probe. One instance per probe spec in the suite.""" | ||
| 84 | - | ||
| 85 | - kind: ClassVar[str] | ||
| 86 | - """The string used in ``sway.yaml``'s ``kind`` field.""" | ||
| 87 | - spec_cls: ClassVar[type[ProbeSpec]] | ||
| 88 | - """The pydantic model class that validates this probe's spec.""" | ||
| 89 | - category: ClassVar[str] = "adherence" | ||
| 90 | - """One of: ``adherence``, ``attribution``, ``calibration``, | ||
| 91 | - ``ablation``, ``baseline``. Drives composite scoring.""" | ||
| 92 | - | ||
| 93 | - def __init_subclass__(cls, **kwargs: Any) -> None: | ||
| 94 | - super().__init_subclass__(**kwargs) | ||
| 95 | - # The abstract class itself has no `kind`; skip registration. | ||
| 96 | - if "kind" not in cls.__dict__: | ||
| 97 | - return | ||
| 98 | - kind = cls.kind | ||
| 99 | - if kind in _REGISTRY: | ||
| 100 | - raise ValueError(f"duplicate probe kind {kind!r}: {_REGISTRY[kind]!r} vs {cls!r}") | ||
| 101 | - _REGISTRY[kind] = cls | ||
| 102 | - | ||
| 103 | - @abstractmethod | ||
| 104 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: ... | ||
| 105 | - | ||
| 106 | - | ||
| 107 | -def registry() -> dict[str, type[Probe]]: | ||
| 108 | - """Read-only view of registered probes.""" | ||
| 109 | - return dict(_REGISTRY) | ||
| 110 | - | ||
| 111 | - | ||
| 112 | -def build_probe(raw: dict[str, Any]) -> tuple[Probe, ProbeSpec]: | ||
| 113 | - """Validate a raw YAML probe entry and return (Probe instance, spec).""" | ||
| 114 | - kind = raw.get("kind") | ||
| 115 | - if not isinstance(kind, str): | ||
| 116 | - raise SpecValidationError( | ||
| 117 | - "probe entry missing string 'kind' field", | ||
| 118 | - source=str(raw.get("name", "<unknown>")), | ||
| 119 | - ) | ||
| 120 | - if kind not in _REGISTRY: | ||
| 121 | - known = ", ".join(sorted(_REGISTRY)) | ||
| 122 | - raise SpecValidationError( | ||
| 123 | - f"unknown probe kind {kind!r} (registered: {known})", | ||
| 124 | - source=str(raw.get("name", "<unknown>")), | ||
| 125 | - ) | ||
| 126 | - probe_cls = _REGISTRY[kind] | ||
| 127 | - try: | ||
| 128 | - spec = probe_cls.spec_cls.model_validate(raw) | ||
| 129 | - except ValidationError as exc: | ||
| 130 | - raise SpecValidationError(str(exc), source=str(raw.get("name", "<unknown>"))) from exc | ||
| 131 | - return probe_cls(), spec | ||
sway/src/dlm_sway/probes/calibration_drift.pydeleted@@ -1,135 +0,0 @@ | |||
| 1 | -"""C2 CalibrationDrift — did we break general knowledge while fitting the doc? | ||
| 2 | - | ||
| 3 | -The classic small-doc fine-tune failure mode: the adapter learned the | ||
| 4 | -document so well that it forgot the world. C2 catches this by scoring | ||
| 5 | -base and ft on a packaged set of general-knowledge completions (the | ||
| 6 | -``BUILT_IN_PACK`` — a 30-item seed of public-domain grade-school facts) | ||
| 7 | -and flagging items whose per-token logprob regressed significantly. | ||
| 8 | - | ||
| 9 | -A healthy fine-tune: some items drift slightly (mild confidence shift, | ||
| 10 | -normal), but essentially none regress below a nat of slack. An over-fit | ||
| 11 | -fine-tune: 20%+ of items regress, the adapter has torched its ability | ||
| 12 | -to answer anything outside the document. | ||
| 13 | - | ||
| 14 | -Pass when ``fraction_regressed < assert_fraction_regressed_lt`` AND | ||
| 15 | -``mean_delta_nats >= assert_mean_delta_gte``. Both thresholds default | ||
| 16 | -to values that trigger on genuine damage but tolerate normal drift. | ||
| 17 | -""" | ||
| 18 | - | ||
| 19 | -from __future__ import annotations | ||
| 20 | - | ||
| 21 | -import statistics | ||
| 22 | -from typing import Literal | ||
| 23 | - | ||
| 24 | -from pydantic import Field | ||
| 25 | - | ||
| 26 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 27 | -from dlm_sway.probes._calibration_pack import BUILT_IN_PACK | ||
| 28 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -class CalibrationItemSpec(ProbeSpec): | ||
| 32 | - """Not used directly — documents the shape of an item override.""" | ||
| 33 | - | ||
| 34 | - kind: Literal["__calibration_item"] = "__calibration_item" | ||
| 35 | - prompt: str = "" | ||
| 36 | - gold: str = "" | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -class CalibrationDriftSpec(ProbeSpec): | ||
| 40 | - kind: Literal["calibration_drift"] = "calibration_drift" | ||
| 41 | - pack: Literal["builtin"] = "builtin" | ||
| 42 | - """Source of items. ``"builtin"`` uses :data:`BUILT_IN_PACK`. Custom | ||
| 43 | - packs will ship via a file reference in a later milestone.""" | ||
| 44 | - items_limit: int | None = None | ||
| 45 | - """If set, truncate the pack to this many items (for fast runs).""" | ||
| 46 | - assert_fraction_regressed_lt: float = 0.15 | ||
| 47 | - assert_mean_delta_gte: float = -0.5 | ||
| 48 | - """Mean per-token logprob delta (ft − base) across the pack. Slightly | ||
| 49 | - negative is tolerable; deeply negative is not.""" | ||
| 50 | - regression_nats: float = 1.0 | ||
| 51 | - """How many nats worse an item must get to count as regressed.""" | ||
| 52 | - items: list[tuple[str, str]] = Field(default_factory=list) | ||
| 53 | - """Optional inline override of the packaged items.""" | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -class CalibrationDriftProbe(Probe): | ||
| 57 | - kind = "calibration_drift" | ||
| 58 | - spec_cls = CalibrationDriftSpec | ||
| 59 | - category = "calibration" | ||
| 60 | - | ||
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 62 | - assert isinstance(spec, CalibrationDriftSpec) | ||
| 63 | - items = list(spec.items) if spec.items else list(BUILT_IN_PACK) | ||
| 64 | - if spec.items_limit is not None: | ||
| 65 | - items = items[: spec.items_limit] | ||
| 66 | - if not items: | ||
| 67 | - return ProbeResult( | ||
| 68 | - name=spec.name, | ||
| 69 | - kind=spec.kind, | ||
| 70 | - verdict=Verdict.ERROR, | ||
| 71 | - score=None, | ||
| 72 | - message="no calibration items", | ||
| 73 | - ) | ||
| 74 | - | ||
| 75 | - deltas: list[float] = [] | ||
| 76 | - regressed = 0 | ||
| 77 | - worst: list[dict[str, float | str]] = [] | ||
| 78 | - | ||
| 79 | - for prompt, gold in items: | ||
| 80 | - tokens = max(_token_estimate(gold), 1) | ||
| 81 | - with ctx.backend.as_base() as b: | ||
| 82 | - lp_base = b.logprob_of(prompt, gold) / tokens | ||
| 83 | - with ctx.backend.as_finetuned() as f: | ||
| 84 | - lp_ft = f.logprob_of(prompt, gold) / tokens | ||
| 85 | - delta = lp_ft - lp_base | ||
| 86 | - deltas.append(delta) | ||
| 87 | - if delta < -spec.regression_nats: | ||
| 88 | - regressed += 1 | ||
| 89 | - worst.append({"prompt": prompt, "gold": gold, "delta": delta}) | ||
| 90 | - | ||
| 91 | - # Surface the worst offenders — up to 5. | ||
| 92 | - worst.sort(key=lambda d: float(d["delta"])) | ||
| 93 | - worst = worst[:5] | ||
| 94 | - | ||
| 95 | - frac_regressed = regressed / len(items) | ||
| 96 | - mean_delta = statistics.fmean(deltas) | ||
| 97 | - | ||
| 98 | - passed = ( | ||
| 99 | - frac_regressed < spec.assert_fraction_regressed_lt | ||
| 100 | - and mean_delta >= spec.assert_mean_delta_gte | ||
| 101 | - ) | ||
| 102 | - verdict = Verdict.PASS if passed else Verdict.FAIL | ||
| 103 | - # Score: 1.0 at zero regression + zero drift, declining with either. | ||
| 104 | - regress_component = max( | ||
| 105 | - 0.0, 1.0 - frac_regressed / max(spec.assert_fraction_regressed_lt, 1e-6) | ||
| 106 | - ) | ||
| 107 | - drift_component = max(0.0, min(1.0, (mean_delta + 1.0) / 1.5)) | ||
| 108 | - score = 0.6 * regress_component + 0.4 * drift_component | ||
| 109 | - | ||
| 110 | - return ProbeResult( | ||
| 111 | - name=spec.name, | ||
| 112 | - kind=spec.kind, | ||
| 113 | - verdict=verdict, | ||
| 114 | - score=score, | ||
| 115 | - raw=frac_regressed, | ||
| 116 | - base_value=None, | ||
| 117 | - ft_value=mean_delta, | ||
| 118 | - evidence={ | ||
| 119 | - "fraction_regressed": frac_regressed, | ||
| 120 | - "mean_delta_nats": mean_delta, | ||
| 121 | - "regressed_count": regressed, | ||
| 122 | - "total_items": len(items), | ||
| 123 | - "worst_offenders": worst, | ||
| 124 | - "regression_nats_threshold": spec.regression_nats, | ||
| 125 | - "weight": spec.weight, | ||
| 126 | - }, | ||
| 127 | - message=( | ||
| 128 | - f"{regressed}/{len(items)} items regressed >{spec.regression_nats:.1f} nats " | ||
| 129 | - f"(frac={frac_regressed:.1%}), mean_delta={mean_delta:+.3f} nats/tok" | ||
| 130 | - ), | ||
| 131 | - ) | ||
| 132 | - | ||
| 133 | - | ||
| 134 | -def _token_estimate(s: str) -> int: | ||
| 135 | - return max(1, len(s) // 4) | ||
sway/src/dlm_sway/probes/delta_kl.pydeleted@@ -1,121 +0,0 @@ | |||
| 1 | -"""A1 DeltaKL — the simplest adherence probe. | ||
| 2 | - | ||
| 3 | -For each prompt, compute the JS (default) or KL divergence between the | ||
| 4 | -base and fine-tuned model's next-token distributions at the position | ||
| 5 | -after the prompt. Aggregate across prompts with a mean. | ||
| 6 | - | ||
| 7 | -*What it tells you:* whether the adapter is distinguishable from the base | ||
| 8 | -on things the document cares about. A zero-divergence result is a red | ||
| 9 | -flag — the adapter is ignored. | ||
| 10 | - | ||
| 11 | -*What it can't tell you:* whether the change is semantically *correct*. | ||
| 12 | -Direction and correctness are what :mod:`dir`, :mod:`adapter_revert`, | ||
| 13 | -and the attribution probes cover. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -import statistics | ||
| 19 | -from typing import Literal | ||
| 20 | - | ||
| 21 | -from pydantic import Field | ||
| 22 | - | ||
| 23 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 24 | -from dlm_sway.probes._divergence import Divergence, divergence, js_ln2 | ||
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 26 | -from dlm_sway.probes.null_adapter import get_null_stats | ||
| 27 | - | ||
| 28 | - | ||
| 29 | -class DeltaKLSpec(ProbeSpec): | ||
| 30 | - """Spec for ``kind: delta_kl``.""" | ||
| 31 | - | ||
| 32 | - kind: Literal["delta_kl"] = "delta_kl" | ||
| 33 | - prompts: list[str] = Field(default_factory=list, min_length=0) | ||
| 34 | - """Inline prompts. At least one of ``prompts`` / ``prompts_from`` must | ||
| 35 | - be non-empty at run time; the prompts-from path is wired via | ||
| 36 | - :mod:`dlm_sway.integrations.dlm.autogen`.""" | ||
| 37 | - divergence: Divergence = "js" | ||
| 38 | - top_k: int | None = None | ||
| 39 | - """Override the suite-wide ``top_k``. ``None`` → use ``ctx.top_k``.""" | ||
| 40 | - assert_mean_gte: float = 0.02 | ||
| 41 | - """Fixed-threshold pass criterion when no null stats are available.""" | ||
| 42 | - assert_z_gte: float = 3.0 | ||
| 43 | - """Z-score pass criterion against the null-adapter baseline, when it | ||
| 44 | - exists. The more principled metric — prefer this over the raw | ||
| 45 | - threshold.""" | ||
| 46 | - | ||
| 47 | - | ||
| 48 | -class DeltaKLProbe(Probe): | ||
| 49 | - """The canonical "is the adapter changing anything?" probe.""" | ||
| 50 | - | ||
| 51 | - kind = "delta_kl" | ||
| 52 | - spec_cls = DeltaKLSpec | ||
| 53 | - category = "adherence" | ||
| 54 | - | ||
| 55 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 56 | - assert isinstance(spec, DeltaKLSpec) | ||
| 57 | - if not spec.prompts: | ||
| 58 | - return ProbeResult( | ||
| 59 | - name=spec.name, | ||
| 60 | - kind=spec.kind, | ||
| 61 | - verdict=Verdict.ERROR, | ||
| 62 | - score=None, | ||
| 63 | - message="no prompts provided (inline 'prompts' was empty)", | ||
| 64 | - ) | ||
| 65 | - | ||
| 66 | - top_k = spec.top_k if spec.top_k is not None else ctx.top_k | ||
| 67 | - divergences: list[float] = [] | ||
| 68 | - for prompt in spec.prompts: | ||
| 69 | - with ctx.backend.as_base() as base_view: | ||
| 70 | - base_dist = base_view.next_token_dist(prompt, top_k=top_k) | ||
| 71 | - with ctx.backend.as_finetuned() as ft_view: | ||
| 72 | - ft_dist = ft_view.next_token_dist(prompt, top_k=top_k) | ||
| 73 | - divergences.append(divergence(base_dist, ft_dist, kind=spec.divergence)) | ||
| 74 | - | ||
| 75 | - raw_mean = statistics.fmean(divergences) | ||
| 76 | - raw_max = max(divergences) | ||
| 77 | - | ||
| 78 | - # Null-adapter calibration wins when available. | ||
| 79 | - null = get_null_stats(ctx, spec.kind) | ||
| 80 | - z = None | ||
| 81 | - if null is not None and null.get("std", 0.0) > 0.0: | ||
| 82 | - z = (raw_mean - null["mean"]) / null["std"] | ||
| 83 | - verdict = Verdict.PASS if z >= spec.assert_z_gte else Verdict.FAIL | ||
| 84 | - message = f"mean {spec.divergence}={raw_mean:.4f}, z={z:+.2f}σ vs null" | ||
| 85 | - else: | ||
| 86 | - verdict = Verdict.PASS if raw_mean >= spec.assert_mean_gte else Verdict.FAIL | ||
| 87 | - message = ( | ||
| 88 | - f"mean {spec.divergence}={raw_mean:.4f} " | ||
| 89 | - f"({'≥' if verdict == Verdict.PASS else '<'} {spec.assert_mean_gte})" | ||
| 90 | - ) | ||
| 91 | - | ||
| 92 | - # Normalized score for composite: JS is bounded by ln(2), so | ||
| 93 | - # sigmoid-ish on (z, or raw / bound) keeps the number in [0, 1]. | ||
| 94 | - if z is not None: | ||
| 95 | - score = _sigmoid(z / 3.0) | ||
| 96 | - else: | ||
| 97 | - bound = js_ln2() if spec.divergence == "js" else 1.0 | ||
| 98 | - score = min(1.0, raw_mean / bound) if bound > 0.0 else 0.0 | ||
| 99 | - | ||
| 100 | - return ProbeResult( | ||
| 101 | - name=spec.name, | ||
| 102 | - kind=spec.kind, | ||
| 103 | - verdict=verdict, | ||
| 104 | - score=score, | ||
| 105 | - raw=raw_mean, | ||
| 106 | - z_score=z, | ||
| 107 | - evidence={ | ||
| 108 | - "divergence_kind": spec.divergence, | ||
| 109 | - "per_prompt": divergences, | ||
| 110 | - "max": raw_max, | ||
| 111 | - "num_prompts": len(spec.prompts), | ||
| 112 | - "weight": spec.weight, | ||
| 113 | - }, | ||
| 114 | - message=message, | ||
| 115 | - ) | ||
| 116 | - | ||
| 117 | - | ||
| 118 | -def _sigmoid(x: float) -> float: | ||
| 119 | - import math | ||
| 120 | - | ||
| 121 | - return 1.0 / (1.0 + math.exp(-x)) | ||
sway/src/dlm_sway/probes/leakage.pydeleted@@ -1,194 +0,0 @@ | |||
| 1 | -"""C3 LeakageSusceptibility — can the fine-tuned model recite training text verbatim? | ||
| 2 | - | ||
| 3 | -For each PROSE section, take the first ``prefix_chars`` as a trigger and | ||
| 4 | -greedy-generate a continuation. Measure how much of the actual section | ||
| 5 | -continuation the model recovers (via LCS ratio). Also re-run under | ||
| 6 | -small prefix perturbations (typo, case flip, punctuation change) and | ||
| 7 | -report the **fragility** — a genuinely generalized model degrades | ||
| 8 | -smoothly under perturbation; a memorizer drops off a cliff. | ||
| 9 | - | ||
| 10 | -Default pass: ``greedy_recall < 0.5``. That default is tuned for the | ||
| 11 | -common "don't leak my document" use case. Sections tagged ``intent: | ||
| 12 | -memorize`` invert the interpretation — the .dlm bridge handles that | ||
| 13 | -flip at spec-generation time. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -import difflib | ||
| 19 | -import statistics | ||
| 20 | -from typing import Literal | ||
| 21 | - | ||
| 22 | -from pydantic import Field | ||
| 23 | - | ||
| 24 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 26 | - | ||
| 27 | -PerturbationKind = Literal["typo", "case_flip", "drop_punct"] | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -def _default_perturbations() -> list[PerturbationKind]: | ||
| 31 | - return ["typo", "case_flip", "drop_punct"] | ||
| 32 | - | ||
| 33 | - | ||
| 34 | -class LeakageSusceptibilitySpec(ProbeSpec): | ||
| 35 | - kind: Literal["leakage"] = "leakage" | ||
| 36 | - prefix_chars: int = 128 | ||
| 37 | - continuation_chars: int = 256 | ||
| 38 | - max_new_tokens: int = 96 | ||
| 39 | - perturbations: list[PerturbationKind] = Field(default_factory=_default_perturbations) | ||
| 40 | - assert_recall_lt: float = 0.5 | ||
| 41 | - """Default anti-leak gate: pass when verbatim recall is modest. Invert | ||
| 42 | - by bumping this to ``>1.0`` when intentional memorization is desired.""" | ||
| 43 | - min_fragility: float = 0.3 | ||
| 44 | - """Fragility = (clean - perturbed) / max(clean, eps). A low value | ||
| 45 | - with high recall indicates true memorization; a high value suggests | ||
| 46 | - the model generalized and recall was incidental.""" | ||
| 47 | - | ||
| 48 | - | ||
| 49 | -class LeakageSusceptibilityProbe(Probe): | ||
| 50 | - kind = "leakage" | ||
| 51 | - spec_cls = LeakageSusceptibilitySpec | ||
| 52 | - category = "calibration" | ||
| 53 | - | ||
| 54 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 55 | - assert isinstance(spec, LeakageSusceptibilitySpec) | ||
| 56 | - if ctx.sections is None: | ||
| 57 | - return ProbeResult( | ||
| 58 | - name=spec.name, | ||
| 59 | - kind=spec.kind, | ||
| 60 | - verdict=Verdict.SKIP, | ||
| 61 | - score=None, | ||
| 62 | - message="no sections in context — provide via the .dlm bridge", | ||
| 63 | - ) | ||
| 64 | - prose = [s for s in ctx.sections if s.kind == "prose" and s.content.strip()] | ||
| 65 | - if not prose: | ||
| 66 | - return ProbeResult( | ||
| 67 | - name=spec.name, | ||
| 68 | - kind=spec.kind, | ||
| 69 | - verdict=Verdict.SKIP, | ||
| 70 | - score=None, | ||
| 71 | - message="no PROSE sections to test for leakage", | ||
| 72 | - ) | ||
| 73 | - | ||
| 74 | - clean_recalls: list[float] = [] | ||
| 75 | - perturbed_recalls: list[float] = [] | ||
| 76 | - per_section: list[dict[str, float | str]] = [] | ||
| 77 | - | ||
| 78 | - with ctx.backend.as_finetuned() as ft: | ||
| 79 | - for s in prose: | ||
| 80 | - prefix = s.content[: spec.prefix_chars] | ||
| 81 | - target = s.content[spec.prefix_chars : spec.prefix_chars + spec.continuation_chars] | ||
| 82 | - if not target.strip(): | ||
| 83 | - continue | ||
| 84 | - clean_gen = ft.generate(prefix, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | ||
| 85 | - clean = _lcs_ratio(clean_gen, target) | ||
| 86 | - clean_recalls.append(clean) | ||
| 87 | - | ||
| 88 | - per_sec_perturbed: list[float] = [] | ||
| 89 | - for perturbation in spec.perturbations: | ||
| 90 | - perturbed_prefix = _perturb(prefix, perturbation) | ||
| 91 | - perturbed_gen = ft.generate( | ||
| 92 | - perturbed_prefix, | ||
| 93 | - max_new_tokens=spec.max_new_tokens, | ||
| 94 | - seed=ctx.seed, | ||
| 95 | - ) | ||
| 96 | - per_sec_perturbed.append(_lcs_ratio(perturbed_gen, target)) | ||
| 97 | - mean_pert = statistics.fmean(per_sec_perturbed) if per_sec_perturbed else clean | ||
| 98 | - perturbed_recalls.append(mean_pert) | ||
| 99 | - | ||
| 100 | - per_section.append( | ||
| 101 | - { | ||
| 102 | - "section_id": s.id, | ||
| 103 | - "clean_recall": clean, | ||
| 104 | - "perturbed_recall": mean_pert, | ||
| 105 | - "fragility": _fragility(clean, mean_pert), | ||
| 106 | - } | ||
| 107 | - ) | ||
| 108 | - | ||
| 109 | - if not clean_recalls: | ||
| 110 | - return ProbeResult( | ||
| 111 | - name=spec.name, | ||
| 112 | - kind=spec.kind, | ||
| 113 | - verdict=Verdict.SKIP, | ||
| 114 | - score=None, | ||
| 115 | - message="no PROSE sections had scorable continuations", | ||
| 116 | - ) | ||
| 117 | - | ||
| 118 | - mean_clean = statistics.fmean(clean_recalls) | ||
| 119 | - mean_pert = statistics.fmean(perturbed_recalls) | ||
| 120 | - mean_fragility = _fragility(mean_clean, mean_pert) | ||
| 121 | - | ||
| 122 | - verdict = ( | ||
| 123 | - Verdict.PASS | ||
| 124 | - if mean_clean < spec.assert_recall_lt or mean_fragility >= spec.min_fragility | ||
| 125 | - else Verdict.FAIL | ||
| 126 | - ) | ||
| 127 | - # Score: 1.0 at zero recall, declining as recall approaches threshold. | ||
| 128 | - recall_score = max(0.0, min(1.0, 1.0 - mean_clean / max(spec.assert_recall_lt, 1e-6))) | ||
| 129 | - # Bonus: high fragility is good (genuine generalization). | ||
| 130 | - fragility_bonus = min(1.0, max(0.0, mean_fragility / max(spec.min_fragility, 1e-6))) | ||
| 131 | - score = 0.7 * recall_score + 0.3 * fragility_bonus | ||
| 132 | - | ||
| 133 | - return ProbeResult( | ||
| 134 | - name=spec.name, | ||
| 135 | - kind=spec.kind, | ||
| 136 | - verdict=verdict, | ||
| 137 | - score=score, | ||
| 138 | - raw=mean_clean, | ||
| 139 | - base_value=None, | ||
| 140 | - ft_value=mean_fragility, | ||
| 141 | - evidence={ | ||
| 142 | - "mean_clean_recall": mean_clean, | ||
| 143 | - "mean_perturbed_recall": mean_pert, | ||
| 144 | - "mean_fragility": mean_fragility, | ||
| 145 | - "per_section": per_section[:10], | ||
| 146 | - "weight": spec.weight, | ||
| 147 | - }, | ||
| 148 | - message=( | ||
| 149 | - f"greedy_recall={mean_clean:.2f} " | ||
| 150 | - f"(perturbed={mean_pert:.2f}, fragility={mean_fragility:.2f})" | ||
| 151 | - ), | ||
| 152 | - ) | ||
| 153 | - | ||
| 154 | - | ||
| 155 | -# -- helpers ----------------------------------------------------------- | ||
| 156 | - | ||
| 157 | - | ||
| 158 | -def _lcs_ratio(generated: str, target: str) -> float: | ||
| 159 | - """Longest common subsequence ratio via difflib. | ||
| 160 | - | ||
| 161 | - Returns 0 for empty inputs, 1.0 for identical strings. difflib's | ||
| 162 | - ``ratio`` is a gestalt similarity; close enough to a true LCS for | ||
| 163 | - our purposes and has no external deps. | ||
| 164 | - """ | ||
| 165 | - if not generated or not target: | ||
| 166 | - return 0.0 | ||
| 167 | - return difflib.SequenceMatcher(None, generated, target).ratio() | ||
| 168 | - | ||
| 169 | - | ||
| 170 | -def _perturb(text: str, kind: str) -> str: | ||
| 171 | - """Apply a deterministic textual perturbation.""" | ||
| 172 | - if not text: | ||
| 173 | - return text | ||
| 174 | - if kind == "typo": | ||
| 175 | - # Swap the first two characters; trivial typo the model must reconstruct. | ||
| 176 | - if len(text) < 2: | ||
| 177 | - return text | ||
| 178 | - return text[1] + text[0] + text[2:] | ||
| 179 | - if kind == "case_flip": | ||
| 180 | - # Flip case of the first alpha char. | ||
| 181 | - for i, ch in enumerate(text): | ||
| 182 | - if ch.isalpha(): | ||
| 183 | - flipped = ch.lower() if ch.isupper() else ch.upper() | ||
| 184 | - return text[:i] + flipped + text[i + 1 :] | ||
| 185 | - return text | ||
| 186 | - if kind == "drop_punct": | ||
| 187 | - return "".join(ch for ch in text if ch not in ".,;:!?-—") | ||
| 188 | - raise ValueError(f"unknown perturbation: {kind!r}") | ||
| 189 | - | ||
| 190 | - | ||
| 191 | -def _fragility(clean: float, perturbed: float) -> float: | ||
| 192 | - if clean <= 0.0: | ||
| 193 | - return 0.0 | ||
| 194 | - return max(0.0, (clean - perturbed) / clean) | ||
sway/src/dlm_sway/probes/null_adapter.pydeleted@@ -1,144 +0,0 @@ | |||
| 1 | -"""Null-adapter baseline probe. | ||
| 2 | - | ||
| 3 | -Every numeric primitive reports its raw metric *and* a z-score against a | ||
| 4 | -null-adapter distribution. This probe is the runtime engine that | ||
| 5 | -establishes that distribution — it builds random-init "null" adapters | ||
| 6 | -(structurally identical to the real adapter but with weights drawn from | ||
| 7 | -a Gaussian) and measures how much signal they produce. | ||
| 8 | - | ||
| 9 | -The resulting ``(mean, std, n)`` per kind is attached to this probe's | ||
| 10 | -``evidence["null_stats"]``. The runner picks it up and threads it into | ||
| 11 | -:attr:`RunContext.null_stats`, where every downstream probe can read it | ||
| 12 | -and turn a raw metric into a z-score. | ||
| 13 | - | ||
| 14 | -Backends that don't implement :class:`~dlm_sway.core.scoring.NullCalibratedBackend` | ||
| 15 | -cause this probe to :attr:`Verdict.SKIP` — downstream probes fall back | ||
| 16 | -to their fixed thresholds in that case. | ||
| 17 | -""" | ||
| 18 | - | ||
| 19 | -from __future__ import annotations | ||
| 20 | - | ||
| 21 | -import statistics | ||
| 22 | -from typing import Literal | ||
| 23 | - | ||
| 24 | -from pydantic import Field | ||
| 25 | - | ||
| 26 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 27 | -from dlm_sway.core.scoring import NullCalibratedBackend | ||
| 28 | -from dlm_sway.probes._divergence import divergence | ||
| 29 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 30 | - | ||
| 31 | - | ||
| 32 | -class NullAdapterSpec(ProbeSpec): | ||
| 33 | - """Spec for ``kind: null_adapter``. | ||
| 34 | - | ||
| 35 | - Authors place this probe **first** in the suite so its output | ||
| 36 | - populates :attr:`RunContext.null_stats` before subsequent probes | ||
| 37 | - consult it. | ||
| 38 | - """ | ||
| 39 | - | ||
| 40 | - kind: Literal["null_adapter"] = "null_adapter" | ||
| 41 | - runs: int = Field(default=3, ge=1, le=10) | ||
| 42 | - """Number of independent null adapters to evaluate. Three is the | ||
| 43 | - smallest that yields a usable std; more is better but quickly | ||
| 44 | - dominates suite runtime.""" | ||
| 45 | - prompts: list[str] = Field(default_factory=list) | ||
| 46 | - """Prompt set for null calibration. Keep small — calibration runs | ||
| 47 | - ``runs × len(prompts)`` forward passes. 4–8 prompts is typical. | ||
| 48 | - If empty, a minimal built-in prompt set is used so the probe | ||
| 49 | - always produces stats.""" | ||
| 50 | - init_scale: float = 0.02 | ||
| 51 | - """Stddev of the zero-mean Gaussian used to fill lora_A/lora_B.""" | ||
| 52 | - seed_base: int = 1000 | ||
| 53 | - """First seed; successive runs use ``seed_base + run_idx``.""" | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -_DEFAULT_PROMPTS: tuple[str, ...] = ( | ||
| 57 | - "The quick brown fox", | ||
| 58 | - "Once upon a time", | ||
| 59 | - "In this document we explain", | ||
| 60 | - "The key takeaway is", | ||
| 61 | - "An important point to remember", | ||
| 62 | -) | ||
| 63 | - | ||
| 64 | - | ||
| 65 | -class NullAdapterProbe(Probe): | ||
| 66 | - """Populate ``ctx.null_stats``; report a :attr:`Verdict.PASS` verdict itself. | ||
| 67 | - | ||
| 68 | - The probe never fails on its own terms — its *job* is calibration. | ||
| 69 | - Downstream probes pick up :attr:`RunContext.null_stats` keyed by | ||
| 70 | - probe kind (``delta_kl``, ``adapter_ablation`` …) and use the | ||
| 71 | - populated mean/std to z-score their own raw metrics. | ||
| 72 | - """ | ||
| 73 | - | ||
| 74 | - kind = "null_adapter" | ||
| 75 | - spec_cls = NullAdapterSpec | ||
| 76 | - category = "baseline" | ||
| 77 | - | ||
| 78 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 79 | - assert isinstance(spec, NullAdapterSpec) | ||
| 80 | - if not isinstance(ctx.backend, NullCalibratedBackend): | ||
| 81 | - return ProbeResult( | ||
| 82 | - name=spec.name, | ||
| 83 | - kind=spec.kind, | ||
| 84 | - verdict=Verdict.SKIP, | ||
| 85 | - score=None, | ||
| 86 | - message=( | ||
| 87 | - "backend does not implement NullCalibratedBackend — " | ||
| 88 | - "numeric probes will fall back to fixed thresholds" | ||
| 89 | - ), | ||
| 90 | - ) | ||
| 91 | - prompts = list(spec.prompts) or list(_DEFAULT_PROMPTS) | ||
| 92 | - | ||
| 93 | - per_seed_means: list[float] = [] | ||
| 94 | - for run_idx in range(spec.runs): | ||
| 95 | - seed = spec.seed_base + run_idx | ||
| 96 | - per_prompt: list[float] = [] | ||
| 97 | - for prompt in prompts: | ||
| 98 | - with ctx.backend.as_base() as base_view: | ||
| 99 | - base_dist = base_view.next_token_dist(prompt, top_k=ctx.top_k) | ||
| 100 | - with ctx.backend.as_null_adapter(seed, init_scale=spec.init_scale) as null_view: | ||
| 101 | - null_dist = null_view.next_token_dist(prompt, top_k=ctx.top_k) | ||
| 102 | - per_prompt.append(divergence(base_dist, null_dist, kind="js")) | ||
| 103 | - per_seed_means.append(statistics.fmean(per_prompt) if per_prompt else 0.0) | ||
| 104 | - | ||
| 105 | - mean = statistics.fmean(per_seed_means) | ||
| 106 | - std = statistics.pstdev(per_seed_means) if len(per_seed_means) > 1 else 0.0 | ||
| 107 | - | ||
| 108 | - # Publish per-kind stats. delta_kl is the primary kind; other | ||
| 109 | - # divergence-based probes (adapter_ablation) share this scale. | ||
| 110 | - null_stats = { | ||
| 111 | - "delta_kl": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)}, | ||
| 112 | - "adapter_ablation": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)}, | ||
| 113 | - } | ||
| 114 | - | ||
| 115 | - return ProbeResult( | ||
| 116 | - name=spec.name, | ||
| 117 | - kind=spec.kind, | ||
| 118 | - verdict=Verdict.PASS, | ||
| 119 | - score=1.0, | ||
| 120 | - raw=mean, | ||
| 121 | - evidence={ | ||
| 122 | - "null_stats": null_stats, | ||
| 123 | - "per_seed_mean_js": per_seed_means, | ||
| 124 | - "init_scale": spec.init_scale, | ||
| 125 | - "runs": spec.runs, | ||
| 126 | - "num_prompts": len(prompts), | ||
| 127 | - "weight": spec.weight, | ||
| 128 | - }, | ||
| 129 | - message=( | ||
| 130 | - f"null JS divergence μ={mean:.4f} ± {std:.4f} " | ||
| 131 | - f"(over {spec.runs} seeds × {len(prompts)} prompts) — " | ||
| 132 | - f"downstream probes will z-score against this baseline" | ||
| 133 | - ), | ||
| 134 | - ) | ||
| 135 | - | ||
| 136 | - | ||
| 137 | -def get_null_stats(ctx: RunContext, probe_kind: str) -> dict[str, float] | None: | ||
| 138 | - """Look up null-adapter stats for ``probe_kind``. | ||
| 139 | - | ||
| 140 | - Returns ``{"mean": …, "std": …, "n": …}`` when calibration ran for | ||
| 141 | - this kind, else ``None``. Probes treat ``None`` as "fall back to the | ||
| 142 | - fixed threshold from your spec." | ||
| 143 | - """ | ||
| 144 | - return ctx.null_stats.get(probe_kind) | ||
sway/src/dlm_sway/probes/paraphrase_invariance.pydeleted@@ -1,148 +0,0 @@ | |||
| 1 | -"""B2 ParaphraseInvariance — memorization vs generalization, per case. | ||
| 2 | - | ||
| 3 | -For each ``(prompt, gold, paraphrases)`` test case: | ||
| 4 | - | ||
| 5 | -- ``verbatim_lift``: Δ-per-token = logprob_ft(prompt, gold) - logprob_base(prompt, gold) | ||
| 6 | -- ``paraphrase_lift``: mean Δ-per-token over the paraphrased prompts | ||
| 7 | - | ||
| 8 | -A model that memorized the exact prompt has high ``verbatim_lift`` but | ||
| 9 | -near-zero ``paraphrase_lift``. A model that learned the underlying | ||
| 10 | -*pattern* has both values positive and close to each other. | ||
| 11 | - | ||
| 12 | -We report: | ||
| 13 | - | ||
| 14 | -- ``generalization_ratio = paraphrase_lift / max(verbatim_lift, eps)`` | ||
| 15 | -- ``verbatim_score``: whether the adapter significantly moved the | ||
| 16 | - verbatim-prompt logprob (sanity check) | ||
| 17 | - | ||
| 18 | -The pass criterion depends on the stated intent: by default we require | ||
| 19 | -both high verbatim lift and high generalization ratio. If the spec's | ||
| 20 | -``intent`` is ``"memorize"``, the ratio requirement inverts — we *want* | ||
| 21 | -verbatim >> paraphrase. | ||
| 22 | -""" | ||
| 23 | - | ||
| 24 | -from __future__ import annotations | ||
| 25 | - | ||
| 26 | -import statistics | ||
| 27 | -from typing import Literal | ||
| 28 | - | ||
| 29 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 30 | - | ||
| 31 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 32 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 33 | - | ||
| 34 | -Intent = Literal["generalize", "memorize", "both"] | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class ParaphraseCase(BaseModel): | ||
| 38 | - """One paraphrase-invariance case.""" | ||
| 39 | - | ||
| 40 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 41 | - | ||
| 42 | - prompt: str | ||
| 43 | - gold: str | ||
| 44 | - paraphrases: list[str] = Field(default_factory=list, min_length=1) | ||
| 45 | - | ||
| 46 | - | ||
| 47 | -class ParaphraseInvarianceSpec(ProbeSpec): | ||
| 48 | - kind: Literal["paraphrase_invariance"] = "paraphrase_invariance" | ||
| 49 | - cases: list[ParaphraseCase] = Field(default_factory=list) | ||
| 50 | - intent: Intent = "generalize" | ||
| 51 | - min_verbatim_lift: float = 0.2 | ||
| 52 | - min_generalization_ratio: float = 0.5 | ||
| 53 | - max_generalization_ratio_if_memorize: float = 0.5 | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -class ParaphraseInvarianceProbe(Probe): | ||
| 57 | - kind = "paraphrase_invariance" | ||
| 58 | - spec_cls = ParaphraseInvarianceSpec | ||
| 59 | - category = "attribution" | ||
| 60 | - | ||
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 62 | - assert isinstance(spec, ParaphraseInvarianceSpec) | ||
| 63 | - if not spec.cases: | ||
| 64 | - return ProbeResult( | ||
| 65 | - name=spec.name, | ||
| 66 | - kind=spec.kind, | ||
| 67 | - verdict=Verdict.ERROR, | ||
| 68 | - score=None, | ||
| 69 | - message="no cases provided", | ||
| 70 | - ) | ||
| 71 | - | ||
| 72 | - verbatim_lifts: list[float] = [] | ||
| 73 | - paraphrase_lifts: list[float] = [] | ||
| 74 | - per_case: list[dict[str, float | str]] = [] | ||
| 75 | - | ||
| 76 | - for case in spec.cases: | ||
| 77 | - tokens = max(_token_estimate(case.gold), 1) | ||
| 78 | - with ctx.backend.as_base() as b: | ||
| 79 | - lp_base_verb = b.logprob_of(case.prompt, case.gold) / tokens | ||
| 80 | - lp_base_par = [b.logprob_of(p, case.gold) / tokens for p in case.paraphrases] | ||
| 81 | - with ctx.backend.as_finetuned() as f: | ||
| 82 | - lp_ft_verb = f.logprob_of(case.prompt, case.gold) / tokens | ||
| 83 | - lp_ft_par = [f.logprob_of(p, case.gold) / tokens for p in case.paraphrases] | ||
| 84 | - | ||
| 85 | - verb_lift = lp_ft_verb - lp_base_verb | ||
| 86 | - par_lift = statistics.fmean( | ||
| 87 | - (ft - base) for base, ft in zip(lp_base_par, lp_ft_par, strict=True) | ||
| 88 | - ) | ||
| 89 | - verbatim_lifts.append(verb_lift) | ||
| 90 | - paraphrase_lifts.append(par_lift) | ||
| 91 | - per_case.append( | ||
| 92 | - { | ||
| 93 | - "prompt": case.prompt[:80], | ||
| 94 | - "verbatim_lift": verb_lift, | ||
| 95 | - "paraphrase_lift": par_lift, | ||
| 96 | - } | ||
| 97 | - ) | ||
| 98 | - | ||
| 99 | - mean_verb = statistics.fmean(verbatim_lifts) | ||
| 100 | - mean_par = statistics.fmean(paraphrase_lifts) | ||
| 101 | - ratio = mean_par / mean_verb if abs(mean_verb) > 1e-9 else 0.0 | ||
| 102 | - | ||
| 103 | - verdict, score, msg = _decide(spec, mean_verb, mean_par, ratio) | ||
| 104 | - | ||
| 105 | - return ProbeResult( | ||
| 106 | - name=spec.name, | ||
| 107 | - kind=spec.kind, | ||
| 108 | - verdict=verdict, | ||
| 109 | - score=score, | ||
| 110 | - raw=ratio, | ||
| 111 | - base_value=mean_verb, | ||
| 112 | - ft_value=mean_par, | ||
| 113 | - evidence={ | ||
| 114 | - "verbatim_lift_mean": mean_verb, | ||
| 115 | - "paraphrase_lift_mean": mean_par, | ||
| 116 | - "generalization_ratio": ratio, | ||
| 117 | - "intent": spec.intent, | ||
| 118 | - "per_case": per_case[:8], | ||
| 119 | - "weight": spec.weight, | ||
| 120 | - }, | ||
| 121 | - message=msg, | ||
| 122 | - ) | ||
| 123 | - | ||
| 124 | - | ||
| 125 | -def _decide( | ||
| 126 | - spec: ParaphraseInvarianceSpec, verb: float, par: float, ratio: float | ||
| 127 | -) -> tuple[Verdict, float, str]: | ||
| 128 | - """Apply the intent-aware pass rule and return (verdict, score, message).""" | ||
| 129 | - base_msg = f"verb={verb:+.3f}, para={par:+.3f}, ratio={ratio:.2f}" | ||
| 130 | - if spec.intent == "memorize": | ||
| 131 | - verd = ( | ||
| 132 | - Verdict.PASS | ||
| 133 | - if verb >= spec.min_verbatim_lift and ratio <= spec.max_generalization_ratio_if_memorize | ||
| 134 | - else Verdict.FAIL | ||
| 135 | - ) | ||
| 136 | - score = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6))) | ||
| 137 | - return verd, score, f"{base_msg} — intent=memorize" | ||
| 138 | - # Default: generalize (or "both") | ||
| 139 | - passed = verb >= spec.min_verbatim_lift and ratio >= spec.min_generalization_ratio | ||
| 140 | - verd = Verdict.PASS if passed else Verdict.FAIL | ||
| 141 | - gen_component = min(1.0, max(0.0, ratio / max(spec.min_generalization_ratio, 1e-6))) | ||
| 142 | - verb_component = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6))) | ||
| 143 | - score = 0.5 * gen_component + 0.5 * verb_component | ||
| 144 | - return verd, score, f"{base_msg} — intent={spec.intent}" | ||
| 145 | - | ||
| 146 | - | ||
| 147 | -def _token_estimate(s: str) -> int: | ||
| 148 | - return max(1, len(s) // 4) | ||
sway/src/dlm_sway/probes/preference_flip.pydeleted@@ -1,140 +0,0 @@ | |||
| 1 | -"""B3 PreferenceFlip — did DPO/ORPO actually flip the chosen/rejected ranking? | ||
| 2 | - | ||
| 3 | -For each ``(prompt, chosen, rejected)`` triple, compute the margin | ||
| 4 | - | ||
| 5 | -.. math:: | ||
| 6 | - m = \\log p(\\text{chosen} \\mid \\text{prompt}) - \\log p(\\text{rejected} \\mid \\text{prompt}) | ||
| 7 | - | ||
| 8 | -under both base and fine-tuned views. Interesting triples are the ones | ||
| 9 | -where base got the sign *wrong* (``m_base < 0``); we fail if the | ||
| 10 | -fine-tune doesn't flip a large enough fraction of them. | ||
| 11 | - | ||
| 12 | -Triples come from either an inline ``triples:`` block in the spec or | ||
| 13 | -from PREFERENCE sections in :attr:`RunContext.sections`. The probe | ||
| 14 | -returns :attr:`Verdict.SKIP` when no triples are present — this is the | ||
| 15 | -"no PREFERENCE sections in your document" case, graceful by design. | ||
| 16 | -""" | ||
| 17 | - | ||
| 18 | -from __future__ import annotations | ||
| 19 | - | ||
| 20 | -import statistics | ||
| 21 | -from typing import Literal | ||
| 22 | - | ||
| 23 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 24 | - | ||
| 25 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 26 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 27 | - | ||
| 28 | - | ||
| 29 | -class PreferenceTriple(BaseModel): | ||
| 30 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 31 | - | ||
| 32 | - prompt: str | ||
| 33 | - chosen: str | ||
| 34 | - rejected: str | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class PreferenceFlipSpec(ProbeSpec): | ||
| 38 | - kind: Literal["preference_flip"] = "preference_flip" | ||
| 39 | - triples: list[PreferenceTriple] = Field(default_factory=list) | ||
| 40 | - """Inline triples. If empty, the probe pulls from PREFERENCE | ||
| 41 | - sections in ctx.sections; if neither is available the probe SKIPs.""" | ||
| 42 | - assert_flip_rate_gte: float = 0.7 | ||
| 43 | - """Fraction of *base-wrong* triples that must flip under ft.""" | ||
| 44 | - min_triples_for_decision: int = 3 | ||
| 45 | - | ||
| 46 | - | ||
| 47 | -class PreferenceFlipProbe(Probe): | ||
| 48 | - kind = "preference_flip" | ||
| 49 | - spec_cls = PreferenceFlipSpec | ||
| 50 | - category = "attribution" | ||
| 51 | - | ||
| 52 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 53 | - assert isinstance(spec, PreferenceFlipSpec) | ||
| 54 | - triples = list(spec.triples) or _triples_from_sections(ctx) | ||
| 55 | - if not triples: | ||
| 56 | - return ProbeResult( | ||
| 57 | - name=spec.name, | ||
| 58 | - kind=spec.kind, | ||
| 59 | - verdict=Verdict.SKIP, | ||
| 60 | - score=None, | ||
| 61 | - message="no preference triples (inline or from sections)", | ||
| 62 | - ) | ||
| 63 | - | ||
| 64 | - base_margins: list[float] = [] | ||
| 65 | - ft_margins: list[float] = [] | ||
| 66 | - for t in triples: | ||
| 67 | - with ctx.backend.as_base() as b: | ||
| 68 | - base_margins.append( | ||
| 69 | - b.logprob_of(t.prompt, t.chosen) - b.logprob_of(t.prompt, t.rejected) | ||
| 70 | - ) | ||
| 71 | - with ctx.backend.as_finetuned() as f: | ||
| 72 | - ft_margins.append( | ||
| 73 | - f.logprob_of(t.prompt, t.chosen) - f.logprob_of(t.prompt, t.rejected) | ||
| 74 | - ) | ||
| 75 | - | ||
| 76 | - # Interesting denominator: base got it wrong. | ||
| 77 | - base_wrong_idx = [i for i, m in enumerate(base_margins) if m < 0] | ||
| 78 | - flipped_idx = [i for i in base_wrong_idx if ft_margins[i] > 0] | ||
| 79 | - | ||
| 80 | - if len(base_wrong_idx) < spec.min_triples_for_decision: | ||
| 81 | - # Not enough base-wrong triples to decide. Fall back to mean margin delta. | ||
| 82 | - mean_delta = statistics.fmean( | ||
| 83 | - (ft - base) for base, ft in zip(base_margins, ft_margins, strict=True) | ||
| 84 | - ) | ||
| 85 | - verdict = Verdict.WARN | ||
| 86 | - return ProbeResult( | ||
| 87 | - name=spec.name, | ||
| 88 | - kind=spec.kind, | ||
| 89 | - verdict=verdict, | ||
| 90 | - score=max(0.0, min(1.0, 0.5 + mean_delta / 4.0)), | ||
| 91 | - raw=mean_delta, | ||
| 92 | - base_value=statistics.fmean(base_margins), | ||
| 93 | - ft_value=statistics.fmean(ft_margins), | ||
| 94 | - evidence={ | ||
| 95 | - "base_wrong": len(base_wrong_idx), | ||
| 96 | - "total": len(triples), | ||
| 97 | - "mean_margin_delta": mean_delta, | ||
| 98 | - "weight": spec.weight, | ||
| 99 | - }, | ||
| 100 | - message=( | ||
| 101 | - f"only {len(base_wrong_idx)} base-wrong triples < " | ||
| 102 | - f"{spec.min_triples_for_decision} required; reporting mean-margin-delta={mean_delta:+.3f}" | ||
| 103 | - ), | ||
| 104 | - ) | ||
| 105 | - | ||
| 106 | - flip_rate = len(flipped_idx) / len(base_wrong_idx) | ||
| 107 | - verdict = Verdict.PASS if flip_rate >= spec.assert_flip_rate_gte else Verdict.FAIL | ||
| 108 | - score = min(1.0, flip_rate / max(spec.assert_flip_rate_gte, 1e-6)) | ||
| 109 | - return ProbeResult( | ||
| 110 | - name=spec.name, | ||
| 111 | - kind=spec.kind, | ||
| 112 | - verdict=verdict, | ||
| 113 | - score=score, | ||
| 114 | - raw=flip_rate, | ||
| 115 | - base_value=statistics.fmean(base_margins), | ||
| 116 | - ft_value=statistics.fmean(ft_margins), | ||
| 117 | - evidence={ | ||
| 118 | - "flip_rate": flip_rate, | ||
| 119 | - "flipped": len(flipped_idx), | ||
| 120 | - "base_wrong": len(base_wrong_idx), | ||
| 121 | - "total": len(triples), | ||
| 122 | - "weight": spec.weight, | ||
| 123 | - }, | ||
| 124 | - message=( | ||
| 125 | - f"flip_rate={flip_rate:.2%} ({len(flipped_idx)}/{len(base_wrong_idx)} " | ||
| 126 | - f"base-wrong triples flipped by ft)" | ||
| 127 | - ), | ||
| 128 | - ) | ||
| 129 | - | ||
| 130 | - | ||
| 131 | -def _triples_from_sections(ctx: RunContext) -> list[PreferenceTriple]: | ||
| 132 | - if ctx.sections is None: | ||
| 133 | - return [] | ||
| 134 | - out: list[PreferenceTriple] = [] | ||
| 135 | - for s in ctx.sections: | ||
| 136 | - if s.kind != "preference": | ||
| 137 | - continue | ||
| 138 | - for p in s.preferences: | ||
| 139 | - out.append(PreferenceTriple(prompt=p.prompt, chosen=p.chosen, rejected=p.rejected)) | ||
| 140 | - return out | ||
sway/src/dlm_sway/probes/prompt_collapse.pydeleted@@ -1,159 +0,0 @@ | |||
| 1 | -"""A3 PromptCollapse — does adapter influence decay with context length? | ||
| 2 | - | ||
| 3 | -For each test prompt we prepend irrelevant "stuffing" of varying length | ||
| 4 | -and measure ``divergence(base, ft)`` at the final position. A healthy | ||
| 5 | -adapter shows a modest, slow decay; a degenerate one collapses quickly | ||
| 6 | -— its signal evaporates once the base has a lot of context to lean on. | ||
| 7 | - | ||
| 8 | -We fit an exponential decay ``KL(L) = KL0 * exp(-L / half_life)`` in log | ||
| 9 | -space and report the half-life in tokens. Pass if the half-life is at | ||
| 10 | -least :attr:`PromptCollapseSpec.assert_half_life_tokens` — which | ||
| 11 | -defaults to half the default sequence length. | ||
| 12 | - | ||
| 13 | -All math is numpy-only to avoid a scipy dependency on the install path. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | -from __future__ import annotations | ||
| 17 | - | ||
| 18 | -from typing import Literal | ||
| 19 | - | ||
| 20 | -import numpy as np | ||
| 21 | -from pydantic import Field | ||
| 22 | - | ||
| 23 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 24 | -from dlm_sway.probes._divergence import Divergence, divergence | ||
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 26 | - | ||
| 27 | -# A neutral, token-dense piece of text we prepend to stress the base | ||
| 28 | -# model's long-context handling. Deliberately low-information so the | ||
| 29 | -# "answer" at the end is the only thing driving next-token predictions. | ||
| 30 | -_STUFFING = ( | ||
| 31 | - "The following log lines are archived for historical record and have no " | ||
| 32 | - "bearing on the question that follows. They are retained for audit purposes " | ||
| 33 | - "only and should be ignored when forming an answer. " | ||
| 34 | -) | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class PromptCollapseSpec(ProbeSpec): | ||
| 38 | - kind: Literal["prompt_collapse"] = "prompt_collapse" | ||
| 39 | - prompts: list[str] = Field(default_factory=list, min_length=0) | ||
| 40 | - context_lengths: list[int] = Field( | ||
| 41 | - default_factory=lambda: [0, 256, 512, 1024], | ||
| 42 | - min_length=2, | ||
| 43 | - ) | ||
| 44 | - """Approximate token counts of stuffing to prepend. ≥2 required | ||
| 45 | - because the exponential fit is undefined for a single point.""" | ||
| 46 | - divergence: Divergence = "js" | ||
| 47 | - top_k: int | None = None | ||
| 48 | - assert_half_life_tokens: int = 512 | ||
| 49 | - """Minimum half-life to pass. Default is deliberately permissive — | ||
| 50 | - tune upward for high-stakes deployments.""" | ||
| 51 | - | ||
| 52 | - | ||
| 53 | -class PromptCollapseProbe(Probe): | ||
| 54 | - kind = "prompt_collapse" | ||
| 55 | - spec_cls = PromptCollapseSpec | ||
| 56 | - category = "adherence" | ||
| 57 | - | ||
| 58 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 59 | - assert isinstance(spec, PromptCollapseSpec) | ||
| 60 | - if not spec.prompts: | ||
| 61 | - return ProbeResult( | ||
| 62 | - name=spec.name, | ||
| 63 | - kind=spec.kind, | ||
| 64 | - verdict=Verdict.ERROR, | ||
| 65 | - score=None, | ||
| 66 | - message="no prompts provided", | ||
| 67 | - ) | ||
| 68 | - | ||
| 69 | - top_k = spec.top_k if spec.top_k is not None else ctx.top_k | ||
| 70 | - # Mean divergence at each context length. | ||
| 71 | - mean_divs: list[float] = [] | ||
| 72 | - for ctx_len in spec.context_lengths: | ||
| 73 | - prefix = _stuffing(ctx_len) | ||
| 74 | - divs: list[float] = [] | ||
| 75 | - for prompt in spec.prompts: | ||
| 76 | - full_prompt = prefix + prompt | ||
| 77 | - with ctx.backend.as_base() as bv: | ||
| 78 | - base_dist = bv.next_token_dist(full_prompt, top_k=top_k) | ||
| 79 | - with ctx.backend.as_finetuned() as fv: | ||
| 80 | - ft_dist = fv.next_token_dist(full_prompt, top_k=top_k) | ||
| 81 | - divs.append(divergence(base_dist, ft_dist, kind=spec.divergence)) | ||
| 82 | - mean_divs.append(float(np.mean(divs))) | ||
| 83 | - | ||
| 84 | - half_life = _fit_half_life( | ||
| 85 | - np.asarray(spec.context_lengths, dtype=np.float64), | ||
| 86 | - np.asarray(mean_divs, dtype=np.float64), | ||
| 87 | - ) | ||
| 88 | - | ||
| 89 | - verdict = ( | ||
| 90 | - Verdict.PASS | ||
| 91 | - if half_life is not None and half_life >= spec.assert_half_life_tokens | ||
| 92 | - else Verdict.FAIL | ||
| 93 | - ) | ||
| 94 | - score = _score(half_life, spec.assert_half_life_tokens) | ||
| 95 | - | ||
| 96 | - msg = ( | ||
| 97 | - f"half-life={half_life:.0f} tokens" | ||
| 98 | - if half_life is not None | ||
| 99 | - else "could not fit exponential decay (too flat or non-monotonic)" | ||
| 100 | - ) | ||
| 101 | - return ProbeResult( | ||
| 102 | - name=spec.name, | ||
| 103 | - kind=spec.kind, | ||
| 104 | - verdict=verdict, | ||
| 105 | - score=score, | ||
| 106 | - raw=half_life, | ||
| 107 | - evidence={ | ||
| 108 | - "context_lengths": spec.context_lengths, | ||
| 109 | - "mean_divergence_per_length": mean_divs, | ||
| 110 | - "divergence_kind": spec.divergence, | ||
| 111 | - "weight": spec.weight, | ||
| 112 | - }, | ||
| 113 | - message=msg, | ||
| 114 | - ) | ||
| 115 | - | ||
| 116 | - | ||
| 117 | -def _stuffing(target_tokens: int) -> str: | ||
| 118 | - """Approximate target-length stuffing. 4 chars ≈ 1 token is fine | ||
| 119 | - for SentencePiece-style tokenizers at the order-of-magnitude level.""" | ||
| 120 | - if target_tokens <= 0: | ||
| 121 | - return "" | ||
| 122 | - # Repeat enough copies to hit the target length in characters. | ||
| 123 | - target_chars = target_tokens * 4 | ||
| 124 | - reps = (target_chars // len(_STUFFING)) + 1 | ||
| 125 | - return (_STUFFING * reps)[:target_chars] + "\n\n" | ||
| 126 | - | ||
| 127 | - | ||
| 128 | -def _fit_half_life(lengths: np.ndarray, divergences: np.ndarray) -> float | None: | ||
| 129 | - """Fit ``y = a * exp(-x / h)`` via log-space linear regression. | ||
| 130 | - | ||
| 131 | - Returns ``None`` if the divergences aren't strictly positive or the | ||
| 132 | - fit is non-decreasing (i.e. the fine-tune got *more* distinct with | ||
| 133 | - context, which invalidates the half-life concept). | ||
| 134 | - """ | ||
| 135 | - if (divergences <= 0.0).any(): | ||
| 136 | - # Can't take a log; treat near-zero as too-flat-to-fit. | ||
| 137 | - return None | ||
| 138 | - log_y = np.log(divergences) | ||
| 139 | - # Standard linear regression slope. | ||
| 140 | - x_mean = float(lengths.mean()) | ||
| 141 | - y_mean = float(log_y.mean()) | ||
| 142 | - denom = float(((lengths - x_mean) ** 2).sum()) | ||
| 143 | - if denom == 0.0: | ||
| 144 | - return None | ||
| 145 | - slope = float(((lengths - x_mean) * (log_y - y_mean)).sum()) / denom | ||
| 146 | - if slope >= 0.0: | ||
| 147 | - # Signal grew with context — can't express as half-life. | ||
| 148 | - return None | ||
| 149 | - # Slope = -1/h → h = -1/slope → half_life = ln(2) * h. | ||
| 150 | - import math | ||
| 151 | - | ||
| 152 | - return float(math.log(2.0) * (-1.0 / slope)) | ||
| 153 | - | ||
| 154 | - | ||
| 155 | -def _score(half_life: float | None, target: int) -> float: | ||
| 156 | - if half_life is None: | ||
| 157 | - return 0.0 | ||
| 158 | - # Asymptotic: score saturates at 1.0 when hits target, declines toward 0. | ||
| 159 | - return float(min(1.0, half_life / max(target, 1))) | ||
sway/src/dlm_sway/probes/section_internalization.pydeleted@@ -1,189 +0,0 @@ | |||
| 1 | -"""B1 SectionInternalizationScore — the flagship attribution primitive. | ||
| 2 | - | ||
| 3 | -For each typed section of the training document, measure *how much the | ||
| 4 | -fine-tune moved the needle on that section's own content* — and subtract | ||
| 5 | -the same metric measured on *other* sections' content. The difference is | ||
| 6 | -the "effective SIS": signal attributable to *this* section, not to a | ||
| 7 | -broader lift across the whole document. | ||
| 8 | - | ||
| 9 | -Output is a per-section bar chart. In practice users see that sections | ||
| 10 | -2 and 7 actually moved the model, sections 3 and 5 did nothing, and | ||
| 11 | -section 11 moved it but also leaked into unrelated content — actionable | ||
| 12 | -signal for document authoring that no other eval tool provides. | ||
| 13 | - | ||
| 14 | -Math per section ``s`` with measurement function ``m(probe_set)``: | ||
| 15 | - | ||
| 16 | -.. math:: | ||
| 17 | - sis_s^{own} &= (m_{base}(s) - m_{ft}(s)) / m_{base}(s) | ||
| 18 | - sis_s^{leak} &= (m_{base}(\\bar s) - m_{ft}(\\bar s)) / m_{base}(\\bar s) | ||
| 19 | - effective &= sis_s^{own} - sis_s^{leak} | ||
| 20 | - | ||
| 21 | -For PROSE sections, ``m`` is the average NLL per token over the | ||
| 22 | -section's content. For INSTRUCTION and PREFERENCE sections, ``m`` is the | ||
| 23 | -average NLL per token over the answer/chosen spans given their prompts. | ||
| 24 | -""" | ||
| 25 | - | ||
| 26 | -from __future__ import annotations | ||
| 27 | - | ||
| 28 | -import statistics | ||
| 29 | -from typing import Literal | ||
| 30 | - | ||
| 31 | -from pydantic import Field | ||
| 32 | - | ||
| 33 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 34 | -from dlm_sway.core.scoring import ScoringBackend | ||
| 35 | -from dlm_sway.core.sections import Section, SectionKind | ||
| 36 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def _default_include_kinds() -> list[SectionKind]: | ||
| 40 | - return ["prose", "instruction", "preference"] | ||
| 41 | - | ||
| 42 | - | ||
| 43 | -class SectionInternalizationSpec(ProbeSpec): | ||
| 44 | - kind: Literal["section_internalization"] = "section_internalization" | ||
| 45 | - include_kinds: list[SectionKind] = Field(default_factory=_default_include_kinds) | ||
| 46 | - per_section_threshold: float = 0.05 | ||
| 47 | - """Minimum ``effective_sis`` for a section to be marked PASS.""" | ||
| 48 | - assert_passing_section_frac: float = 0.5 | ||
| 49 | - """Probe-level pass criterion: fraction of sections that must clear | ||
| 50 | - the per-section threshold.""" | ||
| 51 | - max_prose_chars: int = 2000 | ||
| 52 | - """Cap the length of PROSE content we score to keep runtime bounded. | ||
| 53 | - Long sections are chunked; this is the per-chunk cap.""" | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -class SectionInternalizationProbe(Probe): | ||
| 57 | - kind = "section_internalization" | ||
| 58 | - spec_cls = SectionInternalizationSpec | ||
| 59 | - category = "attribution" | ||
| 60 | - | ||
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 62 | - assert isinstance(spec, SectionInternalizationSpec) | ||
| 63 | - if ctx.sections is None or len(ctx.sections) == 0: | ||
| 64 | - return ProbeResult( | ||
| 65 | - name=spec.name, | ||
| 66 | - kind=spec.kind, | ||
| 67 | - verdict=Verdict.SKIP, | ||
| 68 | - score=None, | ||
| 69 | - message="no sections in context — provide via the .dlm bridge", | ||
| 70 | - ) | ||
| 71 | - | ||
| 72 | - kinds_allowed = set(spec.include_kinds) | ||
| 73 | - eligible = [s for s in ctx.sections if s.kind in kinds_allowed] | ||
| 74 | - if len(eligible) < 2: | ||
| 75 | - return ProbeResult( | ||
| 76 | - name=spec.name, | ||
| 77 | - kind=spec.kind, | ||
| 78 | - verdict=Verdict.SKIP, | ||
| 79 | - score=None, | ||
| 80 | - message=( | ||
| 81 | - f"need ≥2 eligible sections for leak-check; got {len(eligible)} " | ||
| 82 | - f"(kinds={spec.include_kinds})" | ||
| 83 | - ), | ||
| 84 | - ) | ||
| 85 | - | ||
| 86 | - # Pre-compute per-section base and ft NLL-per-token to avoid | ||
| 87 | - # re-running the forward pass for leak-checks. | ||
| 88 | - base_nll: dict[str, float] = {} | ||
| 89 | - ft_nll: dict[str, float] = {} | ||
| 90 | - with ctx.backend.as_base() as base_view: | ||
| 91 | - for s in eligible: | ||
| 92 | - base_nll[s.id] = _section_nll(s, base_view, spec.max_prose_chars) | ||
| 93 | - with ctx.backend.as_finetuned() as ft_view: | ||
| 94 | - for s in eligible: | ||
| 95 | - ft_nll[s.id] = _section_nll(s, ft_view, spec.max_prose_chars) | ||
| 96 | - | ||
| 97 | - per_section: list[dict[str, float | str | bool]] = [] | ||
| 98 | - passing = 0 | ||
| 99 | - effective_scores: list[float] = [] | ||
| 100 | - for s in eligible: | ||
| 101 | - others = [o for o in eligible if o.id != s.id] | ||
| 102 | - own_lift = _relative_lift(base_nll[s.id], ft_nll[s.id]) | ||
| 103 | - leak_lift = statistics.fmean( | ||
| 104 | - _relative_lift(base_nll[o.id], ft_nll[o.id]) for o in others | ||
| 105 | - ) | ||
| 106 | - effective = own_lift - leak_lift | ||
| 107 | - effective_scores.append(effective) | ||
| 108 | - did_pass = effective >= spec.per_section_threshold | ||
| 109 | - passing += int(did_pass) | ||
| 110 | - per_section.append( | ||
| 111 | - { | ||
| 112 | - "section_id": s.id, | ||
| 113 | - "kind": s.kind, | ||
| 114 | - "tag": s.tag or "", | ||
| 115 | - "base_nll": base_nll[s.id], | ||
| 116 | - "ft_nll": ft_nll[s.id], | ||
| 117 | - "own_lift": own_lift, | ||
| 118 | - "leak_lift": leak_lift, | ||
| 119 | - "effective_sis": effective, | ||
| 120 | - "passed": did_pass, | ||
| 121 | - } | ||
| 122 | - ) | ||
| 123 | - | ||
| 124 | - passing_frac = passing / len(eligible) | ||
| 125 | - verdict = Verdict.PASS if passing_frac >= spec.assert_passing_section_frac else Verdict.FAIL | ||
| 126 | - score = passing_frac | ||
| 127 | - return ProbeResult( | ||
| 128 | - name=spec.name, | ||
| 129 | - kind=spec.kind, | ||
| 130 | - verdict=verdict, | ||
| 131 | - score=score, | ||
| 132 | - raw=statistics.fmean(effective_scores), | ||
| 133 | - evidence={ | ||
| 134 | - "per_section": per_section, | ||
| 135 | - "num_sections": len(eligible), | ||
| 136 | - "passing_frac": passing_frac, | ||
| 137 | - "per_section_threshold": spec.per_section_threshold, | ||
| 138 | - "weight": spec.weight, | ||
| 139 | - }, | ||
| 140 | - message=( | ||
| 141 | - f"{passing}/{len(eligible)} sections cleared " | ||
| 142 | - f"effective_sis≥{spec.per_section_threshold:.2f} (mean={statistics.fmean(effective_scores):+.3f})" | ||
| 143 | - ), | ||
| 144 | - ) | ||
| 145 | - | ||
| 146 | - | ||
| 147 | -def _section_nll(s: Section, view: ScoringBackend, max_prose_chars: int) -> float: | ||
| 148 | - """Average NLL per token for the section's content under ``view``.""" | ||
| 149 | - if s.kind == "prose": | ||
| 150 | - return _prose_nll(s.content[:max_prose_chars], view) | ||
| 151 | - if s.kind == "instruction": | ||
| 152 | - if not s.probes: | ||
| 153 | - return _prose_nll(s.content[:max_prose_chars], view) | ||
| 154 | - return statistics.fmean( | ||
| 155 | - -view.logprob_of(p.prompt, p.gold) / max(_token_estimate(p.gold), 1) for p in s.probes | ||
| 156 | - ) | ||
| 157 | - if s.kind == "preference": | ||
| 158 | - if not s.preferences: | ||
| 159 | - return _prose_nll(s.content[:max_prose_chars], view) | ||
| 160 | - return statistics.fmean( | ||
| 161 | - -view.logprob_of(p.prompt, p.chosen) / max(_token_estimate(p.chosen), 1) | ||
| 162 | - for p in s.preferences | ||
| 163 | - ) | ||
| 164 | - raise ValueError(f"unknown section kind: {s.kind!r}") | ||
| 165 | - | ||
| 166 | - | ||
| 167 | -def _prose_nll(text: str, view: ScoringBackend) -> float: | ||
| 168 | - """Negative-mean-logprob over ``text``. Returns 0 for empty input.""" | ||
| 169 | - if not text.strip(): | ||
| 170 | - return 0.0 | ||
| 171 | - r = view.rolling_logprob(text) | ||
| 172 | - return -r.mean_logprob | ||
| 173 | - | ||
| 174 | - | ||
| 175 | -def _relative_lift(base_nll: float, ft_nll: float) -> float: | ||
| 176 | - """``(base - ft) / base``. Positive → ft is lower-PPL than base. | ||
| 177 | - | ||
| 178 | - Falls back to an absolute delta when ``base`` is pathological | ||
| 179 | - (zero or negative), so the probe doesn't crash on degenerate | ||
| 180 | - inputs. | ||
| 181 | - """ | ||
| 182 | - if base_nll <= 0.0: | ||
| 183 | - return float(base_nll - ft_nll) | ||
| 184 | - return float((base_nll - ft_nll) / base_nll) | ||
| 185 | - | ||
| 186 | - | ||
| 187 | -def _token_estimate(s: str) -> int: | ||
| 188 | - """Approximate tokens for normalization. Good enough for SentencePiece-ish vocabs.""" | ||
| 189 | - return max(1, len(s) // 4) | ||
sway/src/dlm_sway/probes/style_fingerprint.pydeleted@@ -1,179 +0,0 @@ | |||
| 1 | -"""C1 StyleFingerprint — does ft prose *read* like the doc? | ||
| 2 | - | ||
| 3 | -Generates base and ft completions from a set of stylistic prompts, | ||
| 4 | -extracts a 6-dimensional fingerprint from each, and measures how the ft | ||
| 5 | -fingerprint has shifted **toward** the training document's own | ||
| 6 | -fingerprint vs the base. | ||
| 7 | - | ||
| 8 | -We compute the fingerprint with numpy-only features so the probe works | ||
| 9 | -out of the box without spaCy/textstat. The optional ``style`` extra | ||
| 10 | -upgrades the fingerprint with passive-voice rate and POS-entropy in a | ||
| 11 | -later milestone; the numeric contract — a non-negative vector per text | ||
| 12 | -— is stable across that upgrade. | ||
| 13 | - | ||
| 14 | -Signal: ``style_shift = cos(ft_fp - base_fp, doc_fp - base_fp)`` in | ||
| 15 | -fingerprint space. Positive values mean ft has moved *toward* the | ||
| 16 | -doc's style; negative values mean it moved *away* (a bad sign); | ||
| 17 | -near-zero means no stylistic shift detectable. | ||
| 18 | -""" | ||
| 19 | - | ||
| 20 | -from __future__ import annotations | ||
| 21 | - | ||
| 22 | -import re | ||
| 23 | -import statistics | ||
| 24 | -from typing import Literal | ||
| 25 | - | ||
| 26 | -import numpy as np | ||
| 27 | -from numpy.typing import NDArray | ||
| 28 | -from pydantic import Field | ||
| 29 | - | ||
| 30 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 31 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 32 | - | ||
| 33 | -_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+") | ||
| 34 | -_PARAGRAPH_SPLIT = re.compile(r"\n\s*\n") | ||
| 35 | -_WORD_RE = re.compile(r"\b[A-Za-z][A-Za-z'-]*\b") | ||
| 36 | -_PUNCTS = set(".,:;!?-—()[]\"'/") | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def fingerprint(text: str) -> NDArray[np.float64]: | ||
| 40 | - """Return a 6-dim stylistic fingerprint for ``text``. | ||
| 41 | - | ||
| 42 | - Dimensions (all numeric, scaled to order-1): | ||
| 43 | - 0. mean sentence length (words) / 30.0 | ||
| 44 | - 1. std sentence length (words) / 30.0 | ||
| 45 | - 2. type-token ratio (already in [0,1]) | ||
| 46 | - 3. avg word length (chars) / 10.0 | ||
| 47 | - 4. punctuation density per char * 10.0 | ||
| 48 | - 5. paragraph density (1 / avg paragraph length in words) * 30.0 | ||
| 49 | - """ | ||
| 50 | - if not text.strip(): | ||
| 51 | - return np.zeros(6, dtype=np.float64) | ||
| 52 | - | ||
| 53 | - sentences = [s for s in _SENTENCE_SPLIT.split(text) if s.strip()] | ||
| 54 | - paragraphs = [p for p in _PARAGRAPH_SPLIT.split(text) if p.strip()] | ||
| 55 | - words = _WORD_RE.findall(text) | ||
| 56 | - if not words: | ||
| 57 | - return np.zeros(6, dtype=np.float64) | ||
| 58 | - | ||
| 59 | - sentence_word_counts = [len(_WORD_RE.findall(s)) for s in sentences] | ||
| 60 | - sentence_word_counts = [c for c in sentence_word_counts if c > 0] | ||
| 61 | - if not sentence_word_counts: | ||
| 62 | - sentence_word_counts = [len(words)] | ||
| 63 | - | ||
| 64 | - mean_sent = statistics.fmean(sentence_word_counts) | ||
| 65 | - std_sent = statistics.pstdev(sentence_word_counts) if len(sentence_word_counts) > 1 else 0.0 | ||
| 66 | - ttr = len({w.lower() for w in words}) / len(words) | ||
| 67 | - avg_word_len = statistics.fmean(len(w) for w in words) | ||
| 68 | - punct_count = sum(ch in _PUNCTS for ch in text) | ||
| 69 | - punct_density = punct_count / max(len(text), 1) | ||
| 70 | - avg_paragraph_len = ( | ||
| 71 | - statistics.fmean(len(_WORD_RE.findall(p)) for p in paragraphs) if paragraphs else len(words) | ||
| 72 | - ) | ||
| 73 | - paragraph_density = 1.0 / max(avg_paragraph_len, 1.0) | ||
| 74 | - | ||
| 75 | - return np.asarray( | ||
| 76 | - [ | ||
| 77 | - mean_sent / 30.0, | ||
| 78 | - std_sent / 30.0, | ||
| 79 | - ttr, | ||
| 80 | - avg_word_len / 10.0, | ||
| 81 | - punct_density * 10.0, | ||
| 82 | - paragraph_density * 30.0, | ||
| 83 | - ], | ||
| 84 | - dtype=np.float64, | ||
| 85 | - ) | ||
| 86 | - | ||
| 87 | - | ||
| 88 | -class StyleFingerprintSpec(ProbeSpec): | ||
| 89 | - kind: Literal["style_fingerprint"] = "style_fingerprint" | ||
| 90 | - prompts: list[str] = Field(default_factory=list) | ||
| 91 | - """Prompts used to elicit a stylistic sample from each model.""" | ||
| 92 | - doc_reference: str = "" | ||
| 93 | - """Concatenated reference text representing the adapter's intended | ||
| 94 | - style. Typically the document itself; the .dlm bridge supplies this | ||
| 95 | - from ``ctx.doc_text`` when left empty.""" | ||
| 96 | - max_new_tokens: int = 128 | ||
| 97 | - assert_shift_gte: float = 0.25 | ||
| 98 | - """Minimum cosine shift for PASS. ``0.25`` is a deliberately | ||
| 99 | - permissive default — stylistic shift is a weaker signal than | ||
| 100 | - perplexity lift.""" | ||
| 101 | - | ||
| 102 | - | ||
| 103 | -class StyleFingerprintProbe(Probe): | ||
| 104 | - kind = "style_fingerprint" | ||
| 105 | - spec_cls = StyleFingerprintSpec | ||
| 106 | - category = "calibration" | ||
| 107 | - | ||
| 108 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 109 | - assert isinstance(spec, StyleFingerprintSpec) | ||
| 110 | - if not spec.prompts: | ||
| 111 | - return ProbeResult( | ||
| 112 | - name=spec.name, | ||
| 113 | - kind=spec.kind, | ||
| 114 | - verdict=Verdict.ERROR, | ||
| 115 | - score=None, | ||
| 116 | - message="no prompts provided", | ||
| 117 | - ) | ||
| 118 | - doc_text = spec.doc_reference or (ctx.doc_text or "") | ||
| 119 | - if not doc_text.strip(): | ||
| 120 | - return ProbeResult( | ||
| 121 | - name=spec.name, | ||
| 122 | - kind=spec.kind, | ||
| 123 | - verdict=Verdict.SKIP, | ||
| 124 | - score=None, | ||
| 125 | - message="no doc_reference (inline or from ctx.doc_text)", | ||
| 126 | - ) | ||
| 127 | - | ||
| 128 | - base_samples: list[str] = [] | ||
| 129 | - ft_samples: list[str] = [] | ||
| 130 | - for prompt in spec.prompts: | ||
| 131 | - with ctx.backend.as_base() as b: | ||
| 132 | - base_samples.append( | ||
| 133 | - b.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | ||
| 134 | - ) | ||
| 135 | - with ctx.backend.as_finetuned() as f: | ||
| 136 | - ft_samples.append( | ||
| 137 | - f.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | ||
| 138 | - ) | ||
| 139 | - | ||
| 140 | - base_fp = fingerprint("\n".join(base_samples)) | ||
| 141 | - ft_fp = fingerprint("\n".join(ft_samples)) | ||
| 142 | - doc_fp = fingerprint(doc_text) | ||
| 143 | - | ||
| 144 | - shift = _cosine_shift(base_fp, ft_fp, doc_fp) | ||
| 145 | - verdict = Verdict.PASS if shift >= spec.assert_shift_gte else Verdict.FAIL | ||
| 146 | - score = float(np.clip((shift + 1.0) / 2.0, 0.0, 1.0)) | ||
| 147 | - | ||
| 148 | - return ProbeResult( | ||
| 149 | - name=spec.name, | ||
| 150 | - kind=spec.kind, | ||
| 151 | - verdict=verdict, | ||
| 152 | - score=score, | ||
| 153 | - raw=shift, | ||
| 154 | - evidence={ | ||
| 155 | - "base_fp": base_fp.tolist(), | ||
| 156 | - "ft_fp": ft_fp.tolist(), | ||
| 157 | - "doc_fp": doc_fp.tolist(), | ||
| 158 | - "style_shift": shift, | ||
| 159 | - "weight": spec.weight, | ||
| 160 | - }, | ||
| 161 | - message=( | ||
| 162 | - f"style_shift={shift:+.2f} " | ||
| 163 | - f"({'toward' if shift > 0 else 'away from'} doc, " | ||
| 164 | - f"threshold={spec.assert_shift_gte})" | ||
| 165 | - ), | ||
| 166 | - ) | ||
| 167 | - | ||
| 168 | - | ||
| 169 | -def _cosine_shift( | ||
| 170 | - base: NDArray[np.float64], ft: NDArray[np.float64], doc: NDArray[np.float64] | ||
| 171 | -) -> float: | ||
| 172 | - """Cosine between (ft - base) and (doc - base) in fingerprint space.""" | ||
| 173 | - a = ft - base | ||
| 174 | - b = doc - base | ||
| 175 | - na = float(np.linalg.norm(a)) | ||
| 176 | - nb = float(np.linalg.norm(b)) | ||
| 177 | - if na == 0.0 or nb == 0.0: | ||
| 178 | - return 0.0 | ||
| 179 | - return float(np.dot(a, b) / (na * nb)) | ||
sway/src/dlm_sway/py.typeddeletedsway/src/dlm_sway/suite/__init__.pydeleted@@ -1,1 +0,0 @@ | |||
| 1 | -"""Suite plumbing: spec models, loader, runner, report, composite score.""" | ||
sway/src/dlm_sway/suite/loader.pydeleted@@ -1,48 +0,0 @@ | |||
| 1 | -"""Load + validate a ``sway.yaml`` into a :class:`SwaySpec`. | ||
| 2 | - | ||
| 3 | -Separated from :mod:`spec` so the data models stay trivially | ||
| 4 | -importable (no YAML dependency at import time for callers that | ||
| 5 | -construct specs programmatically). | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -from pathlib import Path | ||
| 11 | -from typing import Any | ||
| 12 | - | ||
| 13 | -import yaml | ||
| 14 | -from pydantic import ValidationError | ||
| 15 | - | ||
| 16 | -from dlm_sway.core.errors import SpecValidationError | ||
| 17 | -from dlm_sway.suite.spec import SwaySpec | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -def load_spec(path: Path | str) -> SwaySpec: | ||
| 21 | - """Parse ``path`` and return a validated :class:`SwaySpec`.""" | ||
| 22 | - resolved = Path(path).expanduser().resolve() | ||
| 23 | - try: | ||
| 24 | - raw_text = resolved.read_text(encoding="utf-8") | ||
| 25 | - except FileNotFoundError as exc: | ||
| 26 | - raise SpecValidationError(f"spec file not found: {resolved}", source=str(path)) from exc | ||
| 27 | - | ||
| 28 | - try: | ||
| 29 | - data = yaml.safe_load(raw_text) | ||
| 30 | - except yaml.YAMLError as exc: | ||
| 31 | - raise SpecValidationError(f"invalid YAML: {exc}", source=str(path)) from exc | ||
| 32 | - | ||
| 33 | - if not isinstance(data, dict): | ||
| 34 | - raise SpecValidationError("top-level document must be a mapping", source=str(path)) | ||
| 35 | - return from_dict(data, source=str(path)) | ||
| 36 | - | ||
| 37 | - | ||
| 38 | -def from_dict(data: dict[str, Any], *, source: str | None = None) -> SwaySpec: | ||
| 39 | - """Validate a dict (already parsed from YAML or JSON) as a SwaySpec.""" | ||
| 40 | - try: | ||
| 41 | - spec = SwaySpec.model_validate(data) | ||
| 42 | - except ValidationError as exc: | ||
| 43 | - raise SpecValidationError(str(exc), source=source) from exc | ||
| 44 | - try: | ||
| 45 | - spec.check_version() | ||
| 46 | - except ValueError as exc: | ||
| 47 | - raise SpecValidationError(str(exc), source=source) from exc | ||
| 48 | - return spec | ||
sway/src/dlm_sway/suite/report.pydeleted@@ -1,249 +0,0 @@ | |||
| 1 | -"""Report emitters: terminal (rich), JSON, JUnit XML, markdown. | ||
| 2 | - | ||
| 3 | -The terminal renderer is the one a user sees; it's the product surface. | ||
| 4 | -It must communicate the verdict *and* the supporting evidence without | ||
| 5 | -forcing the user to open the JSON. | ||
| 6 | - | ||
| 7 | -JSON is the machine-readable source of truth — same fields as the | ||
| 8 | -:class:`SuiteResult` dataclass but flattened for easy downstream parsing | ||
| 9 | -(dashboards, diff tools, history tracking). | ||
| 10 | - | ||
| 11 | -JUnit XML exists to drop into CI pipelines so ``dlm-sway gate`` | ||
| 12 | -integrates with existing test dashboards with no extra glue. | ||
| 13 | -""" | ||
| 14 | - | ||
| 15 | -from __future__ import annotations | ||
| 16 | - | ||
| 17 | -import json | ||
| 18 | -import xml.etree.ElementTree as ET | ||
| 19 | -from io import StringIO | ||
| 20 | -from typing import Any | ||
| 21 | - | ||
| 22 | -from rich.console import Console | ||
| 23 | -from rich.panel import Panel | ||
| 24 | -from rich.table import Table | ||
| 25 | -from rich.text import Text | ||
| 26 | - | ||
| 27 | -from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict | ||
| 28 | - | ||
| 29 | -_VERDICT_STYLE = { | ||
| 30 | - Verdict.PASS: "bold green", | ||
| 31 | - Verdict.FAIL: "bold red", | ||
| 32 | - Verdict.WARN: "bold yellow", | ||
| 33 | - Verdict.SKIP: "dim", | ||
| 34 | - Verdict.ERROR: "bold magenta", | ||
| 35 | -} | ||
| 36 | - | ||
| 37 | - | ||
| 38 | -def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None: | ||
| 39 | - """Render the report to a rich Console (stdout by default).""" | ||
| 40 | - c = console or Console() | ||
| 41 | - | ||
| 42 | - header = Text.assemble( | ||
| 43 | - ("dlm-sway report — ", "bold"), | ||
| 44 | - (suite.base_model_id, "cyan"), | ||
| 45 | - (" vs ", "dim"), | ||
| 46 | - (_adapter_label(suite.adapter_id), "cyan"), | ||
| 47 | - ) | ||
| 48 | - c.print(Panel(header, expand=False, border_style="blue")) | ||
| 49 | - | ||
| 50 | - c.print() | ||
| 51 | - c.print( | ||
| 52 | - Text.assemble( | ||
| 53 | - ("overall: ", "bold"), | ||
| 54 | - (f"{score.overall:.2f}", _score_style(score.overall)), | ||
| 55 | - (" ", ""), | ||
| 56 | - (f"[ {score.band} ]", _band_style(score.band)), | ||
| 57 | - ) | ||
| 58 | - ) | ||
| 59 | - | ||
| 60 | - # Component breakdown | ||
| 61 | - comp_table = Table.grid(padding=(0, 2)) | ||
| 62 | - comp_table.add_column(justify="left") | ||
| 63 | - comp_table.add_column(justify="right") | ||
| 64 | - comp_table.add_column() | ||
| 65 | - for cat in ("adherence", "attribution", "calibration", "ablation", "baseline"): | ||
| 66 | - if cat not in score.components: | ||
| 67 | - continue | ||
| 68 | - v = score.components[cat] | ||
| 69 | - comp_table.add_row(cat, f"{v:.2f}", _bar(v)) | ||
| 70 | - c.print(comp_table) | ||
| 71 | - | ||
| 72 | - c.print() | ||
| 73 | - # Per-probe detail | ||
| 74 | - detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1)) | ||
| 75 | - detail.add_column("name", style="cyan") | ||
| 76 | - detail.add_column("kind", style="dim") | ||
| 77 | - detail.add_column("verdict") | ||
| 78 | - detail.add_column("score", justify="right") | ||
| 79 | - detail.add_column("raw", justify="right") | ||
| 80 | - detail.add_column("z", justify="right") | ||
| 81 | - detail.add_column("note", style="dim") | ||
| 82 | - for r in suite.probes: | ||
| 83 | - detail.add_row( | ||
| 84 | - r.name, | ||
| 85 | - r.kind, | ||
| 86 | - Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]), | ||
| 87 | - f"{r.score:.2f}" if r.score is not None else "—", | ||
| 88 | - f"{r.raw:.3f}" if r.raw is not None else "—", | ||
| 89 | - f"{r.z_score:+.2f}σ" if r.z_score is not None else "—", | ||
| 90 | - (r.message[:80] + "…") if len(r.message) > 80 else r.message, | ||
| 91 | - ) | ||
| 92 | - c.print(detail) | ||
| 93 | - | ||
| 94 | - if score.findings: | ||
| 95 | - c.print() | ||
| 96 | - c.print(Text("top findings:", style="bold")) | ||
| 97 | - for i, f in enumerate(score.findings, start=1): | ||
| 98 | - c.print(f" {i}. {f}") | ||
| 99 | - | ||
| 100 | - c.print() | ||
| 101 | - c.print(Text(f"wall: {suite.wall_seconds:.2f}s | sway {suite.sway_version}", style="dim")) | ||
| 102 | - | ||
| 103 | - | ||
| 104 | -def to_json(suite: SuiteResult, score: SwayScore) -> str: | ||
| 105 | - """Serialize the suite + composite score as JSON. | ||
| 106 | - | ||
| 107 | - Stable schema; downstream tools rely on it. Breaking changes bump a | ||
| 108 | - ``schema_version`` field (not yet present — this is v0.1). | ||
| 109 | - """ | ||
| 110 | - return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True) | ||
| 111 | - | ||
| 112 | - | ||
| 113 | -def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]: | ||
| 114 | - return { | ||
| 115 | - "schema_version": 1, | ||
| 116 | - "sway_version": suite.sway_version, | ||
| 117 | - "spec_path": suite.spec_path, | ||
| 118 | - "base_model_id": suite.base_model_id, | ||
| 119 | - "adapter_id": suite.adapter_id, | ||
| 120 | - "started_at": suite.started_at.isoformat(), | ||
| 121 | - "finished_at": suite.finished_at.isoformat(), | ||
| 122 | - "wall_seconds": suite.wall_seconds, | ||
| 123 | - "score": { | ||
| 124 | - "overall": score.overall, | ||
| 125 | - "band": score.band, | ||
| 126 | - "components": score.components, | ||
| 127 | - "weights": score.weights, | ||
| 128 | - "findings": list(score.findings), | ||
| 129 | - }, | ||
| 130 | - "null_stats": suite.null_stats, | ||
| 131 | - "probes": [_probe_to_jsonable(p) for p in suite.probes], | ||
| 132 | - } | ||
| 133 | - | ||
| 134 | - | ||
| 135 | -def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]: | ||
| 136 | - return { | ||
| 137 | - "name": r.name, | ||
| 138 | - "kind": r.kind, | ||
| 139 | - "verdict": r.verdict.value, | ||
| 140 | - "score": r.score, | ||
| 141 | - "raw": r.raw, | ||
| 142 | - "z_score": r.z_score, | ||
| 143 | - "base_value": r.base_value, | ||
| 144 | - "ft_value": r.ft_value, | ||
| 145 | - "evidence": r.evidence, | ||
| 146 | - "message": r.message, | ||
| 147 | - "duration_s": r.duration_s, | ||
| 148 | - } | ||
| 149 | - | ||
| 150 | - | ||
| 151 | -def to_junit(suite: SuiteResult, score: SwayScore) -> str: | ||
| 152 | - """Serialize as JUnit XML. One ``<testcase>`` per probe.""" | ||
| 153 | - testsuite = ET.Element( | ||
| 154 | - "testsuite", | ||
| 155 | - { | ||
| 156 | - "name": "dlm-sway", | ||
| 157 | - "tests": str(len(suite.probes)), | ||
| 158 | - "failures": str(sum(1 for p in suite.probes if p.verdict == Verdict.FAIL)), | ||
| 159 | - "errors": str(sum(1 for p in suite.probes if p.verdict == Verdict.ERROR)), | ||
| 160 | - "skipped": str(sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)), | ||
| 161 | - "time": f"{suite.wall_seconds:.3f}", | ||
| 162 | - }, | ||
| 163 | - ) | ||
| 164 | - # Properties — the composite score and category breakdown. | ||
| 165 | - props = ET.SubElement(testsuite, "properties") | ||
| 166 | - ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"}) | ||
| 167 | - ET.SubElement(props, "property", {"name": "band", "value": score.band}) | ||
| 168 | - for cat, v in score.components.items(): | ||
| 169 | - ET.SubElement(props, "property", {"name": f"component.{cat}", "value": f"{v:.4f}"}) | ||
| 170 | - | ||
| 171 | - for r in suite.probes: | ||
| 172 | - tc = ET.SubElement( | ||
| 173 | - testsuite, | ||
| 174 | - "testcase", | ||
| 175 | - {"classname": r.kind, "name": r.name, "time": f"{r.duration_s:.3f}"}, | ||
| 176 | - ) | ||
| 177 | - if r.verdict == Verdict.FAIL: | ||
| 178 | - ET.SubElement(tc, "failure", {"message": r.message or "failed"}) | ||
| 179 | - elif r.verdict == Verdict.ERROR: | ||
| 180 | - ET.SubElement(tc, "error", {"message": r.message or "errored"}) | ||
| 181 | - elif r.verdict == Verdict.SKIP: | ||
| 182 | - ET.SubElement(tc, "skipped", {"message": r.message or "skipped"}) | ||
| 183 | - | ||
| 184 | - return ET.tostring(testsuite, encoding="unicode") | ||
| 185 | - | ||
| 186 | - | ||
| 187 | -def to_markdown(suite: SuiteResult, score: SwayScore) -> str: | ||
| 188 | - """A portable, CI-friendly markdown report.""" | ||
| 189 | - buf = StringIO() | ||
| 190 | - buf.write("# dlm-sway report\n\n") | ||
| 191 | - buf.write(f"**Overall:** {score.overall:.2f} (`{score.band}`) \n") | ||
| 192 | - buf.write(f"**Base:** `{suite.base_model_id}` \n") | ||
| 193 | - buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}` \n") | ||
| 194 | - buf.write(f"**Wall:** {suite.wall_seconds:.2f}s \n\n") | ||
| 195 | - | ||
| 196 | - buf.write("## Components\n\n") | ||
| 197 | - buf.write("| category | score |\n|---|---:|\n") | ||
| 198 | - for cat, v in score.components.items(): | ||
| 199 | - buf.write(f"| {cat} | {v:.2f} |\n") | ||
| 200 | - buf.write("\n## Probes\n\n") | ||
| 201 | - buf.write("| name | kind | verdict | score | note |\n|---|---|---|---:|---|\n") | ||
| 202 | - for r in suite.probes: | ||
| 203 | - buf.write( | ||
| 204 | - f"| {r.name} | `{r.kind}` | {r.verdict.value} | " | ||
| 205 | - f"{f'{r.score:.2f}' if r.score is not None else '—'} | " | ||
| 206 | - f"{r.message[:60]} |\n" | ||
| 207 | - ) | ||
| 208 | - if score.findings: | ||
| 209 | - buf.write("\n## Top findings\n\n") | ||
| 210 | - for f in score.findings: | ||
| 211 | - buf.write(f"- {f}\n") | ||
| 212 | - return buf.getvalue() | ||
| 213 | - | ||
| 214 | - | ||
| 215 | -# -- helpers ----------------------------------------------------------- | ||
| 216 | - | ||
| 217 | - | ||
| 218 | -def _adapter_label(adapter_id: str) -> str: | ||
| 219 | - if not adapter_id: | ||
| 220 | - return "(base only)" | ||
| 221 | - # Only the trailing path chunk is useful in the header. | ||
| 222 | - parts = adapter_id.rstrip("/").split("/") | ||
| 223 | - return "/".join(parts[-3:]) if len(parts) > 3 else adapter_id | ||
| 224 | - | ||
| 225 | - | ||
| 226 | -def _score_style(v: float) -> str: | ||
| 227 | - if v >= 0.6: | ||
| 228 | - return "bold green" | ||
| 229 | - if v >= 0.3: | ||
| 230 | - return "bold yellow" | ||
| 231 | - return "bold red" | ||
| 232 | - | ||
| 233 | - | ||
| 234 | -def _band_style(band: str) -> str: | ||
| 235 | - return { | ||
| 236 | - "noise": "red", | ||
| 237 | - "partial": "yellow", | ||
| 238 | - "healthy": "green", | ||
| 239 | - "suspicious": "magenta", | ||
| 240 | - }.get(band, "white") | ||
| 241 | - | ||
| 242 | - | ||
| 243 | -def _bar(v: float, *, width: int = 10) -> str: | ||
| 244 | - clamped = max(0.0, min(1.0, v)) | ||
| 245 | - filled = int(round(clamped * width)) | ||
| 246 | - return "█" * filled + "░" * (width - filled) | ||
| 247 | - | ||
| 248 | - | ||
| 249 | -__all__ = ["to_terminal", "to_json", "to_junit", "to_markdown"] | ||
sway/src/dlm_sway/suite/runner.pydeleted@@ -1,136 +0,0 @@ | |||
| 1 | -"""Suite runner. | ||
| 2 | - | ||
| 3 | -Iterates the probe list, materializes each into a ``(Probe, Spec)`` via | ||
| 4 | -the registry, executes it with a :class:`~dlm_sway.probes.base.RunContext`, | ||
| 5 | -and assembles a :class:`~dlm_sway.core.result.SuiteResult`. | ||
| 6 | - | ||
| 7 | -Runtime contract: | ||
| 8 | - | ||
| 9 | -- Probes are executed in declaration order (not sorted, not parallelized). | ||
| 10 | - The null-adapter baseline has to run before any probe that needs z-scores, | ||
| 11 | - so authoring order is load-bearing. | ||
| 12 | -- A probe that raises is recorded as | ||
| 13 | - :attr:`~dlm_sway.core.result.Verdict.ERROR` and the suite continues — | ||
| 14 | - one broken probe doesn't torch the whole report. | ||
| 15 | -- The backend is the caller's responsibility: the runner does not build | ||
| 16 | - or close it, so callers can reuse a backend across multiple suites. | ||
| 17 | -""" | ||
| 18 | - | ||
| 19 | -from __future__ import annotations | ||
| 20 | - | ||
| 21 | -import time | ||
| 22 | - | ||
| 23 | -from dlm_sway import __version__ | ||
| 24 | -from dlm_sway.core.errors import ProbeError | ||
| 25 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | ||
| 26 | -from dlm_sway.core.scoring import DifferentialBackend | ||
| 27 | -from dlm_sway.core.sections import Section | ||
| 28 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 29 | -from dlm_sway.probes.null_adapter import NullAdapterSpec, get_null_stats | ||
| 30 | -from dlm_sway.suite.spec import SwaySpec | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -def run( | ||
| 34 | - spec: SwaySpec, | ||
| 35 | - backend: DifferentialBackend, | ||
| 36 | - *, | ||
| 37 | - spec_path: str = "<memory>", | ||
| 38 | - doc_text: str | None = None, | ||
| 39 | - sections: tuple[Section, ...] | None = None, | ||
| 40 | -) -> SuiteResult: | ||
| 41 | - """Execute every probe in ``spec`` against ``backend``.""" | ||
| 42 | - started = utcnow() | ||
| 43 | - ctx = RunContext( | ||
| 44 | - backend=backend, | ||
| 45 | - seed=spec.defaults.seed, | ||
| 46 | - top_k=spec.defaults.top_k, | ||
| 47 | - sections=sections, | ||
| 48 | - doc_text=doc_text, | ||
| 49 | - ) | ||
| 50 | - | ||
| 51 | - results: list[ProbeResult] = [] | ||
| 52 | - null_stats: dict[str, dict[str, float]] = {} | ||
| 53 | - | ||
| 54 | - for raw in spec.suite: | ||
| 55 | - probe, probe_spec = build_probe(raw) | ||
| 56 | - if not probe_spec.enabled: | ||
| 57 | - results.append( | ||
| 58 | - ProbeResult( | ||
| 59 | - name=probe_spec.name, | ||
| 60 | - kind=probe_spec.kind, | ||
| 61 | - verdict=Verdict.SKIP, | ||
| 62 | - score=None, | ||
| 63 | - message="disabled in spec", | ||
| 64 | - ) | ||
| 65 | - ) | ||
| 66 | - continue | ||
| 67 | - | ||
| 68 | - t0 = time.perf_counter() | ||
| 69 | - try: | ||
| 70 | - result = probe.run(probe_spec, ctx) | ||
| 71 | - except ProbeError as exc: | ||
| 72 | - result = ProbeResult( | ||
| 73 | - name=probe_spec.name, | ||
| 74 | - kind=probe_spec.kind, | ||
| 75 | - verdict=Verdict.ERROR, | ||
| 76 | - score=None, | ||
| 77 | - message=str(exc), | ||
| 78 | - ) | ||
| 79 | - except Exception as exc: # noqa: BLE001 — probe impls may raise anything | ||
| 80 | - result = ProbeResult( | ||
| 81 | - name=probe_spec.name, | ||
| 82 | - kind=probe_spec.kind, | ||
| 83 | - verdict=Verdict.ERROR, | ||
| 84 | - score=None, | ||
| 85 | - message=f"{type(exc).__name__}: {exc}", | ||
| 86 | - ) | ||
| 87 | - duration = time.perf_counter() - t0 | ||
| 88 | - # Re-stamp duration (probes don't know their own wall time). | ||
| 89 | - result = _with_duration(result, duration) | ||
| 90 | - results.append(result) | ||
| 91 | - | ||
| 92 | - # Null-adapter result seeds ctx.null_stats for subsequent probes. | ||
| 93 | - if isinstance(probe_spec, NullAdapterSpec) and result.evidence.get("null_stats"): | ||
| 94 | - null_stats.update(result.evidence["null_stats"]) | ||
| 95 | - # RunContext is frozen; swap in a fresh one so later probes | ||
| 96 | - # see the populated stats. | ||
| 97 | - ctx = RunContext( | ||
| 98 | - backend=ctx.backend, | ||
| 99 | - seed=ctx.seed, | ||
| 100 | - top_k=ctx.top_k, | ||
| 101 | - sections=ctx.sections, | ||
| 102 | - doc_text=ctx.doc_text, | ||
| 103 | - null_stats=null_stats, | ||
| 104 | - ) | ||
| 105 | - | ||
| 106 | - finished = utcnow() | ||
| 107 | - return SuiteResult( | ||
| 108 | - spec_path=spec_path, | ||
| 109 | - started_at=started, | ||
| 110 | - finished_at=finished, | ||
| 111 | - base_model_id=spec.models.base.base, | ||
| 112 | - adapter_id=str(spec.models.ft.adapter) if spec.models.ft.adapter else "", | ||
| 113 | - sway_version=__version__, | ||
| 114 | - probes=tuple(results), | ||
| 115 | - null_stats=null_stats, | ||
| 116 | - ) | ||
| 117 | - | ||
| 118 | - | ||
| 119 | -def _with_duration(result: ProbeResult, duration: float) -> ProbeResult: | ||
| 120 | - """Return a copy of ``result`` with :attr:`ProbeResult.duration_s` set.""" | ||
| 121 | - return ProbeResult( | ||
| 122 | - name=result.name, | ||
| 123 | - kind=result.kind, | ||
| 124 | - verdict=result.verdict, | ||
| 125 | - score=result.score, | ||
| 126 | - raw=result.raw, | ||
| 127 | - z_score=result.z_score, | ||
| 128 | - base_value=result.base_value, | ||
| 129 | - ft_value=result.ft_value, | ||
| 130 | - evidence=result.evidence, | ||
| 131 | - message=result.message, | ||
| 132 | - duration_s=duration, | ||
| 133 | - ) | ||
| 134 | - | ||
| 135 | - | ||
| 136 | -__all__ = ["get_null_stats", "run"] | ||
sway/src/dlm_sway/suite/score.pydeleted@@ -1,106 +0,0 @@ | |||
| 1 | -"""Composite :class:`~dlm_sway.core.result.SwayScore` from a suite result. | ||
| 2 | - | ||
| 3 | -The score is a weighted mean over four categories | ||
| 4 | -(adherence / attribution / calibration / ablation). Each category's | ||
| 5 | -value is the weighted mean of its pass/score values (with SKIP/ERROR | ||
| 6 | -excluded so a broken probe doesn't silently depress the composite). | ||
| 7 | - | ||
| 8 | -All weighting is explicit, user-overridable, and surfaced in the report | ||
| 9 | -alongside the number — no black-box scoring. | ||
| 10 | -""" | ||
| 11 | - | ||
| 12 | -from __future__ import annotations | ||
| 13 | - | ||
| 14 | -from dlm_sway.core.result import ( | ||
| 15 | - DEFAULT_COMPONENT_WEIGHTS, | ||
| 16 | - ProbeResult, | ||
| 17 | - SuiteResult, | ||
| 18 | - SwayScore, | ||
| 19 | - Verdict, | ||
| 20 | -) | ||
| 21 | -from dlm_sway.probes.base import registry | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def compute( | ||
| 25 | - suite: SuiteResult, | ||
| 26 | - *, | ||
| 27 | - weights: dict[str, float] | None = None, | ||
| 28 | -) -> SwayScore: | ||
| 29 | - """Fold a :class:`SuiteResult` into a :class:`SwayScore`.""" | ||
| 30 | - w = weights if weights is not None else dict(DEFAULT_COMPONENT_WEIGHTS) | ||
| 31 | - registered = registry() | ||
| 32 | - | ||
| 33 | - # Bucket probes by their declared category. | ||
| 34 | - buckets: dict[str, list[ProbeResult]] = {k: [] for k in w} | ||
| 35 | - for r in suite.probes: | ||
| 36 | - if r.verdict in {Verdict.SKIP, Verdict.ERROR}: | ||
| 37 | - continue | ||
| 38 | - if r.score is None: | ||
| 39 | - continue | ||
| 40 | - probe_cls = registered.get(r.kind) | ||
| 41 | - category = probe_cls.category if probe_cls is not None else "adherence" | ||
| 42 | - buckets.setdefault(category, []).append(r) | ||
| 43 | - | ||
| 44 | - component_scores: dict[str, float] = {} | ||
| 45 | - for cat, probes in buckets.items(): | ||
| 46 | - if not probes: | ||
| 47 | - component_scores[cat] = 0.0 | ||
| 48 | - continue | ||
| 49 | - total_w = sum(max(_spec_weight(p), 0.0) for p in probes) or 1.0 | ||
| 50 | - weighted = sum(max(_spec_weight(p), 0.0) * (p.score or 0.0) for p in probes) | ||
| 51 | - component_scores[cat] = weighted / total_w | ||
| 52 | - | ||
| 53 | - # Fold to composite, weighted by the user's category weights, but | ||
| 54 | - # ignoring components that had no contributing probes (so a | ||
| 55 | - # PREFERENCE-free document doesn't get penalized for missing B3). | ||
| 56 | - active_weights = {k: v for k, v in w.items() if buckets.get(k)} | ||
| 57 | - total_w = sum(active_weights.values()) or 1.0 | ||
| 58 | - overall = sum(active_weights[k] * component_scores[k] for k in active_weights) / total_w | ||
| 59 | - | ||
| 60 | - findings = _findings(suite, component_scores) | ||
| 61 | - | ||
| 62 | - return SwayScore( | ||
| 63 | - overall=overall, | ||
| 64 | - components=component_scores, | ||
| 65 | - weights=w, | ||
| 66 | - band=SwayScore.band_for(overall), | ||
| 67 | - findings=findings, | ||
| 68 | - ) | ||
| 69 | - | ||
| 70 | - | ||
| 71 | -def _spec_weight(result: ProbeResult) -> float: | ||
| 72 | - """Recover a probe's declared weight from its ``evidence`` payload. | ||
| 73 | - | ||
| 74 | - The runner stores ``spec.weight`` on evidence so the scorer can read | ||
| 75 | - it without re-validating specs. Falls back to 1.0 when absent (older | ||
| 76 | - runs, custom probes, etc). | ||
| 77 | - """ | ||
| 78 | - w = result.evidence.get("weight") | ||
| 79 | - if isinstance(w, int | float): | ||
| 80 | - return float(w) | ||
| 81 | - return 1.0 | ||
| 82 | - | ||
| 83 | - | ||
| 84 | -def _findings(suite: SuiteResult, components: dict[str, float]) -> tuple[str, ...]: | ||
| 85 | - """Surface the 2–3 most diagnostic notes for the terminal report.""" | ||
| 86 | - notes: list[str] = [] | ||
| 87 | - | ||
| 88 | - failed = [r for r in suite.probes if r.verdict == Verdict.FAIL] | ||
| 89 | - if failed: | ||
| 90 | - top = failed[0] | ||
| 91 | - notes.append( | ||
| 92 | - f"{top.name} ({top.kind}) failed" + (f": {top.message}" if top.message else "") | ||
| 93 | - ) | ||
| 94 | - | ||
| 95 | - for cat, score in components.items(): | ||
| 96 | - if score < 0.3 and components.get(cat, 1.0) != 0.0: | ||
| 97 | - notes.append(f"{cat} score is {score:.2f} — below the noise threshold") | ||
| 98 | - | ||
| 99 | - errors = [r for r in suite.probes if r.verdict == Verdict.ERROR] | ||
| 100 | - if errors: | ||
| 101 | - notes.append(f"{len(errors)} probe(s) errored — see full report for details") | ||
| 102 | - | ||
| 103 | - return tuple(notes[:5]) | ||
| 104 | - | ||
| 105 | - | ||
| 106 | -__all__ = ["compute"] | ||
sway/src/dlm_sway/suite/spec.pydeleted@@ -1,72 +0,0 @@ | |||
| 1 | -"""Top-level ``sway.yaml`` spec models. | ||
| 2 | - | ||
| 3 | -Per-probe specs live next to their implementations in | ||
| 4 | -:mod:`dlm_sway.probes`. This module owns the *outer* envelope — | ||
| 5 | -``version``, ``models``, ``defaults``, ``suite`` — plus the runtime | ||
| 6 | -bind between raw probe dicts and registered probe classes. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -from typing import Annotated, Any | ||
| 12 | - | ||
| 13 | -from pydantic import BaseModel, ConfigDict, Field | ||
| 14 | - | ||
| 15 | -from dlm_sway.core.model import ModelSpec | ||
| 16 | - | ||
| 17 | -SUPPORTED_VERSION = 1 | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -class SuiteModels(BaseModel): | ||
| 21 | - """Named model handles the suite references — ``base`` + ``ft``.""" | ||
| 22 | - | ||
| 23 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 24 | - | ||
| 25 | - base: ModelSpec | ||
| 26 | - ft: ModelSpec | ||
| 27 | - | ||
| 28 | - | ||
| 29 | -class SuiteDefaults(BaseModel): | ||
| 30 | - """Shared defaults for the whole suite. Probes may override per-entry.""" | ||
| 31 | - | ||
| 32 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 33 | - | ||
| 34 | - seed: int = 0 | ||
| 35 | - top_k: int = 256 | ||
| 36 | - differential: bool = True | ||
| 37 | - """If ``False``, the runner loads base + ft as two separate models | ||
| 38 | - instead of toggling on one. More memory-heavy; only useful when a | ||
| 39 | - backend can't do in-place toggling.""" | ||
| 40 | - coverage_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6 | ||
| 41 | - """Minimum composite score for ``dlm-sway gate`` to pass.""" | ||
| 42 | - | ||
| 43 | - | ||
| 44 | -class SwaySpec(BaseModel): | ||
| 45 | - """Root of ``sway.yaml``.""" | ||
| 46 | - | ||
| 47 | - model_config = ConfigDict(extra="forbid", frozen=True) | ||
| 48 | - | ||
| 49 | - version: int = 1 | ||
| 50 | - models: SuiteModels | ||
| 51 | - defaults: SuiteDefaults = SuiteDefaults() | ||
| 52 | - suite: list[dict[str, Any]] = Field(default_factory=list) | ||
| 53 | - """Raw probe entries. Validated one-at-a-time by the probe registry | ||
| 54 | - via :func:`dlm_sway.probes.base.build_probe` so that the set of | ||
| 55 | - allowed probe kinds is an open registry rather than a closed | ||
| 56 | - discriminated union.""" | ||
| 57 | - dlm_source: str | None = None | ||
| 58 | - """Optional path to a ``.dlm`` file. When present, the runner asks | ||
| 59 | - :mod:`dlm_sway.integrations.dlm.resolver` for typed sections and | ||
| 60 | - hands them to probes via :attr:`RunContext.sections`. Auto-populated | ||
| 61 | - by ``dlm-sway autogen``.""" | ||
| 62 | - | ||
| 63 | - def check_version(self) -> None: | ||
| 64 | - """Raise ``ValueError`` if the spec version is unsupported. | ||
| 65 | - | ||
| 66 | - Called explicitly by the loader after validation so the error | ||
| 67 | - surfaces with a loader-source tag rather than a pydantic stack. | ||
| 68 | - """ | ||
| 69 | - if self.version != SUPPORTED_VERSION: | ||
| 70 | - raise ValueError( | ||
| 71 | - f"unsupported sway spec version: {self.version} (this build supports {SUPPORTED_VERSION})" | ||
| 72 | - ) | ||
sway/src/dlm_sway/visualize.pydeleted@@ -1,137 +0,0 @@ | |||
| 1 | -"""Optional matplotlib-based visualizations. | ||
| 2 | - | ||
| 3 | -Behind the ``viz`` extra. Three functions cover the three plots that | ||
| 4 | -make the sway report come alive in a notebook or saved PNG: | ||
| 5 | - | ||
| 6 | -- :func:`plot_section_sis`: per-section bar chart of effective SIS | ||
| 7 | - (the flagship attribution view). | ||
| 8 | -- :func:`plot_adapter_ablation`: the λ-scaled divergence curve — the | ||
| 9 | - sway signature plot. | ||
| 10 | -- :func:`plot_kl_histogram`: distribution of per-prompt KL divergences | ||
| 11 | - (the raw data behind A1 DeltaKL). | ||
| 12 | - | ||
| 13 | -Each function raises :class:`~dlm_sway.core.errors.BackendNotAvailableError` | ||
| 14 | -with a pip hint when matplotlib isn't installed. No function writes to | ||
| 15 | -disk on your behalf — the caller decides (``fig.savefig(...)``). | ||
| 16 | -""" | ||
| 17 | - | ||
| 18 | -from __future__ import annotations | ||
| 19 | - | ||
| 20 | -from typing import Any | ||
| 21 | - | ||
| 22 | -from dlm_sway.core.errors import BackendNotAvailableError | ||
| 23 | -from dlm_sway.core.result import SuiteResult | ||
| 24 | - | ||
| 25 | - | ||
| 26 | -def _require_mpl() -> Any: | ||
| 27 | - try: | ||
| 28 | - import matplotlib.pyplot as plt | ||
| 29 | - | ||
| 30 | - return plt | ||
| 31 | - except ImportError as exc: | ||
| 32 | - raise BackendNotAvailableError( | ||
| 33 | - "visualize", | ||
| 34 | - extra="viz", | ||
| 35 | - hint="sway's visualization module needs matplotlib.", | ||
| 36 | - ) from exc | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def plot_section_sis(suite: SuiteResult) -> Any: | ||
| 40 | - """Render a per-section ``effective_sis`` bar chart. | ||
| 41 | - | ||
| 42 | - Returns the matplotlib ``Figure``; the caller handles display / save. | ||
| 43 | - """ | ||
| 44 | - plt = _require_mpl() | ||
| 45 | - | ||
| 46 | - probe = _find_probe(suite, "section_internalization") | ||
| 47 | - if probe is None or not probe.evidence.get("per_section"): | ||
| 48 | - raise ValueError("suite has no section_internalization evidence to plot") | ||
| 49 | - | ||
| 50 | - rows: list[dict[str, Any]] = list(probe.evidence["per_section"]) | ||
| 51 | - labels = [f"{row['tag'] or row['section_id'][:8]}\n({row['kind']})" for row in rows] | ||
| 52 | - values = [float(row["effective_sis"]) for row in rows] | ||
| 53 | - colors = ["#2ca02c" if row["passed"] else "#d62728" for row in rows] | ||
| 54 | - | ||
| 55 | - fig, ax = plt.subplots(figsize=(max(6.0, 0.7 * len(rows)), 4.0)) | ||
| 56 | - ax.bar(range(len(rows)), values, color=colors) | ||
| 57 | - ax.axhline( | ||
| 58 | - float(probe.evidence.get("per_section_threshold", 0.0)), | ||
| 59 | - color="gray", | ||
| 60 | - linestyle="--", | ||
| 61 | - linewidth=1, | ||
| 62 | - label="threshold", | ||
| 63 | - ) | ||
| 64 | - ax.set_xticks(range(len(rows))) | ||
| 65 | - ax.set_xticklabels(labels, rotation=30, ha="right") | ||
| 66 | - ax.set_ylabel("effective SIS") | ||
| 67 | - ax.set_title("Section Internalization Score") | ||
| 68 | - ax.legend(loc="best") | ||
| 69 | - fig.tight_layout() | ||
| 70 | - return fig | ||
| 71 | - | ||
| 72 | - | ||
| 73 | -def plot_adapter_ablation(suite: SuiteResult) -> Any: | ||
| 74 | - """Render the signature λ-scaled divergence curve.""" | ||
| 75 | - plt = _require_mpl() | ||
| 76 | - | ||
| 77 | - probe = _find_probe(suite, "adapter_ablation") | ||
| 78 | - if probe is None or not probe.evidence.get("lambdas"): | ||
| 79 | - raise ValueError("suite has no adapter_ablation evidence to plot") | ||
| 80 | - | ||
| 81 | - lambdas = list(probe.evidence["lambdas"]) | ||
| 82 | - divs = list(probe.evidence["mean_divergence_per_lambda"]) | ||
| 83 | - | ||
| 84 | - fig, ax = plt.subplots(figsize=(7.0, 4.0)) | ||
| 85 | - ax.plot(lambdas, divs, marker="o", linewidth=2, color="#1f77b4") | ||
| 86 | - ax.axvline(1.0, color="gray", linestyle=":", linewidth=1, label="λ=1 (trained)") | ||
| 87 | - sat = probe.evidence.get("saturation_lambda") | ||
| 88 | - if sat is not None: | ||
| 89 | - ax.axvline( | ||
| 90 | - float(sat), | ||
| 91 | - color="#2ca02c", | ||
| 92 | - linestyle="--", | ||
| 93 | - linewidth=1, | ||
| 94 | - label=f"sat λ={float(sat):.2f}", | ||
| 95 | - ) | ||
| 96 | - ax.set_xlabel("λ (adapter scale)") | ||
| 97 | - ax.set_ylabel("mean JS divergence vs λ=0") | ||
| 98 | - ax.set_title( | ||
| 99 | - f"Adapter Ablation (R²={float(probe.evidence.get('linearity', 0.0)):.2f}, " | ||
| 100 | - f"overshoot={float(probe.evidence.get('overshoot', 0.0)):.2f})" | ||
| 101 | - ) | ||
| 102 | - ax.legend(loc="best") | ||
| 103 | - fig.tight_layout() | ||
| 104 | - return fig | ||
| 105 | - | ||
| 106 | - | ||
| 107 | -def plot_kl_histogram(suite: SuiteResult) -> Any: | ||
| 108 | - """Render the per-prompt KL distribution from a DeltaKL probe.""" | ||
| 109 | - plt = _require_mpl() | ||
| 110 | - | ||
| 111 | - probe = _find_probe(suite, "delta_kl") | ||
| 112 | - if probe is None or not probe.evidence.get("per_prompt"): | ||
| 113 | - raise ValueError("suite has no delta_kl evidence to plot") | ||
| 114 | - | ||
| 115 | - values = list(probe.evidence["per_prompt"]) | ||
| 116 | - fig, ax = plt.subplots(figsize=(7.0, 4.0)) | ||
| 117 | - ax.hist(values, bins=max(5, min(20, len(values) // 2)), color="#ff7f0e", edgecolor="white") | ||
| 118 | - ax.axvline( | ||
| 119 | - float(probe.raw or 0.0), | ||
| 120 | - color="black", | ||
| 121 | - linestyle="--", | ||
| 122 | - linewidth=1, | ||
| 123 | - label=f"mean={float(probe.raw or 0.0):.3f}", | ||
| 124 | - ) | ||
| 125 | - ax.set_xlabel(probe.evidence.get("divergence_kind", "divergence")) | ||
| 126 | - ax.set_ylabel("count") | ||
| 127 | - ax.set_title("DeltaKL — per-prompt distribution") | ||
| 128 | - ax.legend(loc="best") | ||
| 129 | - fig.tight_layout() | ||
| 130 | - return fig | ||
| 131 | - | ||
| 132 | - | ||
| 133 | -def _find_probe(suite: SuiteResult, kind: str) -> Any: | ||
| 134 | - for p in suite.probes: | ||
| 135 | - if p.kind == kind: | ||
| 136 | - return p | ||
| 137 | - return None | ||
sway/tests/__init__.pydeletedsway/tests/conftest.pydeleted@@ -1,29 +0,0 @@ | |||
| 1 | -"""Shared test fixtures. | ||
| 2 | - | ||
| 3 | -Keep the default fast-test environment offline and deterministic so unit | ||
| 4 | -tests stay below ~1 s per file. Integration tests override these via | ||
| 5 | -their own ``conftest`` when they need network access. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -import pytest | ||
| 11 | - | ||
| 12 | -# Import the probes package once so every shipped probe registers itself | ||
| 13 | -# with the central registry. Tests that exercise build_probe("delta_kl", | ||
| 14 | -# …) rely on this. | ||
| 15 | -import dlm_sway.probes # noqa: F401 | ||
| 16 | - | ||
| 17 | - | ||
| 18 | -@pytest.fixture(autouse=True) | ||
| 19 | -def _offline_and_no_telemetry(monkeypatch: pytest.MonkeyPatch) -> None: | ||
| 20 | - """Unit tests never touch the network. | ||
| 21 | - | ||
| 22 | - Any backend test that needs HF should be marked ``@pytest.mark.online`` | ||
| 23 | - and clear these vars explicitly. | ||
| 24 | - """ | ||
| 25 | - monkeypatch.setenv("HF_HUB_OFFLINE", "1") | ||
| 26 | - monkeypatch.setenv("TRANSFORMERS_OFFLINE", "1") | ||
| 27 | - monkeypatch.setenv("HF_DATASETS_OFFLINE", "1") | ||
| 28 | - monkeypatch.setenv("HF_HUB_DISABLE_TELEMETRY", "1") | ||
| 29 | - monkeypatch.setenv("DO_NOT_TRACK", "1") | ||
sway/tests/fixtures/__init__.pydeletedsway/tests/fixtures/tiny_model.pydeleted@@ -1,53 +0,0 @@ | |||
| 1 | -"""Tiny-model fixture for integration tests. | ||
| 2 | - | ||
| 3 | -Mirrors ``dlm.tests.fixtures.tiny_model``: session-scoped snapshot of | ||
| 4 | -SmolLM2-135M-Instruct, reused across the whole test run. The model is | ||
| 5 | -small enough (~280 MB on disk, ~600 MB in fp32 VRAM) to make integration | ||
| 6 | -tests feasible in CI. | ||
| 7 | - | ||
| 8 | -Tests using this fixture must carry ``@pytest.mark.slow`` and | ||
| 9 | -``@pytest.mark.online`` — the default test selection excludes both. | ||
| 10 | -""" | ||
| 11 | - | ||
| 12 | -from __future__ import annotations | ||
| 13 | - | ||
| 14 | -import os | ||
| 15 | -from collections.abc import Iterator | ||
| 16 | -from pathlib import Path | ||
| 17 | - | ||
| 18 | -import pytest | ||
| 19 | - | ||
| 20 | -TINY_MODEL_HF_ID = "HuggingFaceTB/SmolLM2-135M-Instruct" | ||
| 21 | -TINY_MODEL_REVISION = os.environ.get("DLM_SWAY_TINY_MODEL_REVISION", "main") | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def _offline_mode() -> bool: | ||
| 25 | - return os.environ.get("SWAY_OFFLINE", "0") == "1" | ||
| 26 | - | ||
| 27 | - | ||
| 28 | -@pytest.fixture(scope="session") | ||
| 29 | -def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]: | ||
| 30 | - """Download (or reuse) the tiny model; yield the cached directory. | ||
| 31 | - | ||
| 32 | - Test opts in via ``@pytest.mark.online`` — the session-wide offline | ||
| 33 | - env vars are cleared inside this fixture so ``snapshot_download`` | ||
| 34 | - actually fetches. | ||
| 35 | - """ | ||
| 36 | - from huggingface_hub import snapshot_download | ||
| 37 | - | ||
| 38 | - # Clear offline env guards (set by the unit-test autouse fixture). | ||
| 39 | - prior = { | ||
| 40 | - k: os.environ.pop(k, None) | ||
| 41 | - for k in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE") | ||
| 42 | - } | ||
| 43 | - try: | ||
| 44 | - path = snapshot_download( | ||
| 45 | - repo_id=TINY_MODEL_HF_ID, | ||
| 46 | - revision=TINY_MODEL_REVISION, | ||
| 47 | - local_files_only=_offline_mode(), | ||
| 48 | - ) | ||
| 49 | - yield Path(path) | ||
| 50 | - finally: | ||
| 51 | - for k, v in prior.items(): | ||
| 52 | - if v is not None: | ||
| 53 | - os.environ[k] = v | ||
sway/tests/integration/__init__.pydeletedsway/tests/integration/conftest.pydeleted@@ -1,10 +0,0 @@ | |||
| 1 | -"""Integration-test configuration. | ||
| 2 | - | ||
| 3 | -Integration tests need network + heavy deps. Re-export the tiny_model | ||
| 4 | -fixture here so test modules can pick it up without a long import | ||
| 5 | -path. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -from tests.fixtures.tiny_model import tiny_model_dir # noqa: F401 — re-export | ||
sway/tests/integration/test_hf_adapter_toggle.pydeleted@@ -1,113 +0,0 @@ | |||
| 1 | -"""Integration test: PEFT ``disable_adapter`` actually changes logits. | ||
| 2 | - | ||
| 3 | -This is the load-bearing sanity check for the whole differential design. | ||
| 4 | -If a future ``peft`` release subtly breaks the disable-context semantics, | ||
| 5 | -sway's KL / SIS / ablation probes would all silently report zero signal. | ||
| 6 | -We catch that here, before the rest of the test battery runs. | ||
| 7 | - | ||
| 8 | -The test builds a random-init LoRA adapter on a tiny model so no network | ||
| 9 | -dependency beyond the base model snapshot itself. | ||
| 10 | -""" | ||
| 11 | - | ||
| 12 | -from __future__ import annotations | ||
| 13 | - | ||
| 14 | -from pathlib import Path | ||
| 15 | - | ||
| 16 | -import pytest | ||
| 17 | - | ||
| 18 | -from dlm_sway.backends.hf import HuggingFaceDifferentialBackend | ||
| 19 | -from dlm_sway.core.model import ModelSpec | ||
| 20 | - | ||
| 21 | -pytestmark = [pytest.mark.slow, pytest.mark.online] | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None: | ||
| 25 | - """Construct a LoRA adapter with random-init weights on ``base_dir``. | ||
| 26 | - | ||
| 27 | - The weights are kept small so the toggle-delta is clear but the | ||
| 28 | - adapter is structurally valid (correct ``adapter_config.json``, | ||
| 29 | - tokenizer files, safetensors layout). | ||
| 30 | - """ | ||
| 31 | - import torch | ||
| 32 | - from peft import LoraConfig, get_peft_model | ||
| 33 | - from transformers import AutoModelForCausalLM, AutoTokenizer | ||
| 34 | - | ||
| 35 | - torch.manual_seed(0) | ||
| 36 | - | ||
| 37 | - tokenizer = AutoTokenizer.from_pretrained(str(base_dir)) | ||
| 38 | - if tokenizer.pad_token_id is None: | ||
| 39 | - tokenizer.pad_token = tokenizer.eos_token | ||
| 40 | - base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32) | ||
| 41 | - | ||
| 42 | - cfg = LoraConfig( | ||
| 43 | - r=8, | ||
| 44 | - lora_alpha=16, | ||
| 45 | - target_modules=["q_proj", "v_proj"], | ||
| 46 | - lora_dropout=0.0, | ||
| 47 | - bias="none", | ||
| 48 | - task_type="CAUSAL_LM", | ||
| 49 | - ) | ||
| 50 | - peft_model = get_peft_model(base, cfg) | ||
| 51 | - | ||
| 52 | - # Explicitly scale lora_B out of its PEFT-default zero-init so the | ||
| 53 | - # adapter actually changes outputs. Real training does this via | ||
| 54 | - # gradients; we do it with a scaled normal. | ||
| 55 | - with torch.no_grad(): | ||
| 56 | - for name, param in peft_model.named_parameters(): | ||
| 57 | - if "lora_B" in name: | ||
| 58 | - param.copy_(torch.randn_like(param) * 0.05) | ||
| 59 | - | ||
| 60 | - peft_model.save_pretrained(str(out_dir)) | ||
| 61 | - tokenizer.save_pretrained(str(out_dir)) | ||
| 62 | - | ||
| 63 | - | ||
| 64 | -@pytest.fixture(scope="module") | ||
| 65 | -def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path: | ||
| 66 | - adapter_dir = tmp_path_factory.mktemp("random-adapter") | ||
| 67 | - _build_random_lora_adapter(tiny_model_dir, adapter_dir) | ||
| 68 | - return adapter_dir | ||
| 69 | - | ||
| 70 | - | ||
| 71 | -def test_disable_adapter_changes_logits(tiny_model_dir: Path, random_adapter: Path) -> None: | ||
| 72 | - """The keystone invariant: base view ≠ ft view on the same prompt.""" | ||
| 73 | - import numpy as np | ||
| 74 | - | ||
| 75 | - backend = HuggingFaceDifferentialBackend( | ||
| 76 | - base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"), | ||
| 77 | - adapter_path=random_adapter, | ||
| 78 | - ) | ||
| 79 | - try: | ||
| 80 | - prompt = "The quick brown fox" | ||
| 81 | - with backend.as_base() as b: | ||
| 82 | - base_dist = b.next_token_dist(prompt, top_k=32) | ||
| 83 | - with backend.as_finetuned() as f: | ||
| 84 | - ft_dist = f.next_token_dist(prompt, top_k=32) | ||
| 85 | - | ||
| 86 | - # Top-k indices may shift under the adapter; take a safe shared | ||
| 87 | - # subset instead of asserting identical ordering. | ||
| 88 | - assert not np.array_equal(base_dist.token_ids, ft_dist.token_ids) or not np.allclose( | ||
| 89 | - base_dist.logprobs, ft_dist.logprobs, atol=1e-5 | ||
| 90 | - ), "adapter toggle did not change next-token distribution" | ||
| 91 | - finally: | ||
| 92 | - backend.close() | ||
| 93 | - | ||
| 94 | - | ||
| 95 | -def test_roundtrip_toggle_restores_base(tiny_model_dir: Path, random_adapter: Path) -> None: | ||
| 96 | - """as_base → as_finetuned → as_base yields a stable base view.""" | ||
| 97 | - import numpy as np | ||
| 98 | - | ||
| 99 | - backend = HuggingFaceDifferentialBackend( | ||
| 100 | - base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"), | ||
| 101 | - adapter_path=random_adapter, | ||
| 102 | - ) | ||
| 103 | - try: | ||
| 104 | - prompt = "hello" | ||
| 105 | - with backend.as_base() as b: | ||
| 106 | - first = b.next_token_dist(prompt, top_k=16).logprobs | ||
| 107 | - with backend.as_finetuned() as f: | ||
| 108 | - f.next_token_dist(prompt, top_k=16) # toggle | ||
| 109 | - with backend.as_base() as b: | ||
| 110 | - second = b.next_token_dist(prompt, top_k=16).logprobs | ||
| 111 | - np.testing.assert_allclose(first, second, rtol=1e-5, atol=1e-6) | ||
| 112 | - finally: | ||
| 113 | - backend.close() | ||
sway/tests/unit/__init__.pydeletedsway/tests/unit/test_backend_dummy.pydeleted@@ -1,102 +0,0 @@ | |||
| 1 | -"""Tests for :class:`dlm_sway.backends.dummy.DummyDifferentialBackend`. | ||
| 2 | - | ||
| 3 | -The dummy backend is used by every downstream probe unit test, so it | ||
| 4 | -gets a thorough own-right test here. Also verifies the view-exclusion | ||
| 5 | -invariant that catches stale-view bugs in probes. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -import numpy as np | ||
| 11 | -import pytest | ||
| 12 | - | ||
| 13 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 14 | -from dlm_sway.core.model import Model | ||
| 15 | -from dlm_sway.core.scoring import DifferentialBackend, ScoringBackend | ||
| 16 | - | ||
| 17 | - | ||
| 18 | -@pytest.fixture | ||
| 19 | -def backend() -> DummyDifferentialBackend: | ||
| 20 | - base = DummyResponses( | ||
| 21 | - generations={"hi": "hello"}, | ||
| 22 | - logprobs={("q", "a"): -3.0}, | ||
| 23 | - ) | ||
| 24 | - ft = DummyResponses( | ||
| 25 | - generations={"hi": "greetings, traveler"}, | ||
| 26 | - logprobs={("q", "a"): -1.2}, | ||
| 27 | - ) | ||
| 28 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -class TestViews: | ||
| 32 | - def test_as_base_and_as_ft_yield_distinct_generations( | ||
| 33 | - self, backend: DummyDifferentialBackend | ||
| 34 | - ) -> None: | ||
| 35 | - with backend.as_base() as b: | ||
| 36 | - assert b.generate("hi", max_new_tokens=5) == "hello" | ||
| 37 | - with backend.as_finetuned() as f: | ||
| 38 | - assert f.generate("hi", max_new_tokens=5) == "greetings, traveler" | ||
| 39 | - | ||
| 40 | - def test_logprob_differs_between_modes(self, backend: DummyDifferentialBackend) -> None: | ||
| 41 | - with backend.as_base() as b: | ||
| 42 | - base_score = b.logprob_of("q", "a") | ||
| 43 | - with backend.as_finetuned() as f: | ||
| 44 | - ft_score = f.logprob_of("q", "a") | ||
| 45 | - assert base_score == -3.0 | ||
| 46 | - assert ft_score == -1.2 | ||
| 47 | - | ||
| 48 | - def test_missing_generation_raises_keyerror(self, backend: DummyDifferentialBackend) -> None: | ||
| 49 | - with backend.as_base() as b, pytest.raises(KeyError, match="no canned generation"): | ||
| 50 | - b.generate("unconfigured", max_new_tokens=1) | ||
| 51 | - | ||
| 52 | - def test_missing_logprob_default(self, backend: DummyDifferentialBackend) -> None: | ||
| 53 | - with backend.as_base() as b: | ||
| 54 | - assert b.logprob_of("nonexistent", "target") == -10.0 | ||
| 55 | - | ||
| 56 | - | ||
| 57 | -class TestRollingLogprob: | ||
| 58 | - def test_synthesized_when_not_preseeded(self, backend: DummyDifferentialBackend) -> None: | ||
| 59 | - with backend.as_base() as b: | ||
| 60 | - r = b.rolling_logprob("a quick brown fox jumps") | ||
| 61 | - assert r.num_tokens == 5 | ||
| 62 | - assert r.logprobs.size == 4 | ||
| 63 | - assert np.all(r.logprobs == -2.0) | ||
| 64 | - | ||
| 65 | - def test_ft_perplexity_lower_than_base(self, backend: DummyDifferentialBackend) -> None: | ||
| 66 | - text = "a quick brown fox" | ||
| 67 | - with backend.as_base() as b: | ||
| 68 | - pb = b.rolling_logprob(text).perplexity | ||
| 69 | - with backend.as_finetuned() as f: | ||
| 70 | - pf = f.rolling_logprob(text).perplexity | ||
| 71 | - assert pf < pb # synthesized ft is less perplexed → lower PPL | ||
| 72 | - | ||
| 73 | - | ||
| 74 | -class TestTokenDist: | ||
| 75 | - def test_dists_differ_between_modes(self, backend: DummyDifferentialBackend) -> None: | ||
| 76 | - with backend.as_base() as b: | ||
| 77 | - base_dist = b.next_token_dist("any prompt") | ||
| 78 | - with backend.as_finetuned() as f: | ||
| 79 | - ft_dist = f.next_token_dist("any prompt") | ||
| 80 | - assert not np.array_equal(base_dist.logprobs, ft_dist.logprobs) | ||
| 81 | - | ||
| 82 | - | ||
| 83 | -class TestInvariants: | ||
| 84 | - def test_protocol_satisfaction(self, backend: DummyDifferentialBackend) -> None: | ||
| 85 | - assert isinstance(backend, DifferentialBackend) | ||
| 86 | - with backend.as_base() as view: | ||
| 87 | - assert isinstance(view, Model) | ||
| 88 | - assert isinstance(view, ScoringBackend) | ||
| 89 | - | ||
| 90 | - def test_nested_views_rejected(self, backend: DummyDifferentialBackend) -> None: | ||
| 91 | - with backend.as_base(), pytest.raises(RuntimeError, match="view already active"): | ||
| 92 | - with backend.as_finetuned(): | ||
| 93 | - pass | ||
| 94 | - | ||
| 95 | - def test_sequential_views_fine(self, backend: DummyDifferentialBackend) -> None: | ||
| 96 | - # Must be able to re-enter after exiting — common pattern in probes. | ||
| 97 | - with backend.as_base() as b: | ||
| 98 | - b.logprob_of("q", "a") | ||
| 99 | - with backend.as_finetuned() as f: | ||
| 100 | - f.logprob_of("q", "a") | ||
| 101 | - with backend.as_base() as b: | ||
| 102 | - b.logprob_of("q", "a") | ||
sway/tests/unit/test_backend_registry.pydeleted@@ -1,133 +0,0 @@ | |||
| 1 | -"""Tests for the backend registry in ``dlm_sway.backends``. | ||
| 2 | - | ||
| 3 | -The registry is the single place that maps a ModelSpec to a concrete | ||
| 4 | -backend. These tests check the error paths — actually materializing an | ||
| 5 | -HF backend requires model weights and is covered by the integration | ||
| 6 | -suite. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -from pathlib import Path | ||
| 12 | - | ||
| 13 | -import pytest | ||
| 14 | - | ||
| 15 | -from dlm_sway.backends import build | ||
| 16 | -from dlm_sway.core.errors import BackendNotAvailableError, SpecValidationError | ||
| 17 | -from dlm_sway.core.model import ModelSpec | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -class TestRegistry: | ||
| 21 | - def test_dummy_rejected_via_build(self) -> None: | ||
| 22 | - with pytest.raises(SpecValidationError, match="kind='dummy'"): | ||
| 23 | - build(ModelSpec(base="x", kind="dummy")) | ||
| 24 | - | ||
| 25 | - def test_hf_requires_adapter(self) -> None: | ||
| 26 | - with pytest.raises(SpecValidationError, match="adapter"): | ||
| 27 | - build(ModelSpec(base="x", kind="hf")) | ||
| 28 | - | ||
| 29 | - def test_mlx_requires_adapter(self) -> None: | ||
| 30 | - with pytest.raises(SpecValidationError, match="adapter"): | ||
| 31 | - build(ModelSpec(base="x", kind="mlx")) | ||
| 32 | - | ||
| 33 | - def test_mlx_dispatch_raises_when_mlx_missing(self) -> None: | ||
| 34 | - # On non-Apple-Silicon (or Apple without mlx installed), constructing | ||
| 35 | - # the MLX backend raises BackendNotAvailableError with a pip hint. | ||
| 36 | - # We skip this assertion if mlx happens to be installed. | ||
| 37 | - import importlib.util | ||
| 38 | - | ||
| 39 | - if importlib.util.find_spec("mlx") is not None: | ||
| 40 | - pytest.skip("mlx is installed; error path not exercised") | ||
| 41 | - with pytest.raises(BackendNotAvailableError) as exc_info: | ||
| 42 | - build(ModelSpec(base="x", kind="mlx", adapter=Path("/tmp/a"))) | ||
| 43 | - assert exc_info.value.backend == "mlx" | ||
| 44 | - | ||
| 45 | - def test_custom_requires_entry_point(self) -> None: | ||
| 46 | - with pytest.raises(SpecValidationError, match="entry_point"): | ||
| 47 | - build(ModelSpec(base="x", kind="custom", adapter=Path("/tmp/a"))) | ||
| 48 | - | ||
| 49 | - def test_custom_validates_entry_point_shape(self) -> None: | ||
| 50 | - with pytest.raises(SpecValidationError, match="pkg.module:ClassName"): | ||
| 51 | - build( | ||
| 52 | - ModelSpec( | ||
| 53 | - base="x", | ||
| 54 | - kind="custom", | ||
| 55 | - entry_point="not_a_valid_entry_point", | ||
| 56 | - adapter=Path("/tmp/a"), | ||
| 57 | - ) | ||
| 58 | - ) | ||
| 59 | - | ||
| 60 | - def test_custom_rejects_unimportable_module(self) -> None: | ||
| 61 | - with pytest.raises(SpecValidationError, match="cannot import"): | ||
| 62 | - build( | ||
| 63 | - ModelSpec( | ||
| 64 | - base="x", | ||
| 65 | - kind="custom", | ||
| 66 | - entry_point="nonexistent_pkg_xyz:Backend", | ||
| 67 | - adapter=Path("/tmp/a"), | ||
| 68 | - ) | ||
| 69 | - ) | ||
| 70 | - | ||
| 71 | - def test_custom_rejects_missing_class(self) -> None: | ||
| 72 | - with pytest.raises(SpecValidationError, match="has no attribute"): | ||
| 73 | - build( | ||
| 74 | - ModelSpec( | ||
| 75 | - base="x", | ||
| 76 | - kind="custom", | ||
| 77 | - entry_point="dlm_sway:NoSuchClass", | ||
| 78 | - adapter=Path("/tmp/a"), | ||
| 79 | - ) | ||
| 80 | - ) | ||
| 81 | - | ||
| 82 | - def test_custom_rejects_non_differential_class(self) -> None: | ||
| 83 | - # A class that accepts the canonical constructor args but doesn't | ||
| 84 | - # implement the protocol. | ||
| 85 | - import sys | ||
| 86 | - import types | ||
| 87 | - | ||
| 88 | - class _Bad: | ||
| 89 | - def __init__(self, base_spec, adapter_path): # type: ignore[no-untyped-def] | ||
| 90 | - del base_spec, adapter_path | ||
| 91 | - | ||
| 92 | - mod = types.ModuleType("_sway_bad_mod") | ||
| 93 | - mod.Bad = _Bad # type: ignore[attr-defined] | ||
| 94 | - sys.modules["_sway_bad_mod"] = mod | ||
| 95 | - | ||
| 96 | - with pytest.raises(SpecValidationError, match="DifferentialBackend"): | ||
| 97 | - build( | ||
| 98 | - ModelSpec( | ||
| 99 | - base="x", | ||
| 100 | - kind="custom", | ||
| 101 | - entry_point="_sway_bad_mod:Bad", | ||
| 102 | - adapter=Path("/tmp/a"), | ||
| 103 | - ) | ||
| 104 | - ) | ||
| 105 | - | ||
| 106 | - def test_custom_dispatches_to_valid_backend(self) -> None: | ||
| 107 | - # Use the dummy backend via a custom entry point. The dummy class's | ||
| 108 | - # __init__ takes different args, so we write a thin adapter class. | ||
| 109 | - from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 110 | - | ||
| 111 | - class _AdapterBackend(DummyDifferentialBackend): | ||
| 112 | - def __init__(self, base_spec, adapter_path): # type: ignore[no-untyped-def] | ||
| 113 | - super().__init__(base=DummyResponses(), ft=DummyResponses()) | ||
| 114 | - | ||
| 115 | - # Register on a throwaway module we can find by name. | ||
| 116 | - import sys | ||
| 117 | - import types | ||
| 118 | - | ||
| 119 | - mod = types.ModuleType("_sway_custom_test_mod") | ||
| 120 | - mod.AdapterBackend = _AdapterBackend # type: ignore[attr-defined] | ||
| 121 | - sys.modules["_sway_custom_test_mod"] = mod | ||
| 122 | - | ||
| 123 | - backend = build( | ||
| 124 | - ModelSpec( | ||
| 125 | - base="x", | ||
| 126 | - kind="custom", | ||
| 127 | - entry_point="_sway_custom_test_mod:AdapterBackend", | ||
| 128 | - adapter=Path("/tmp/a"), | ||
| 129 | - ) | ||
| 130 | - ) | ||
| 131 | - from dlm_sway.core.scoring import DifferentialBackend | ||
| 132 | - | ||
| 133 | - assert isinstance(backend, DifferentialBackend) | ||
sway/tests/unit/test_cli.pydeleted@@ -1,92 +0,0 @@ | |||
| 1 | -"""Smoke tests for the dlm-sway CLI. | ||
| 2 | - | ||
| 3 | -We avoid exercising backends (they need real models) and instead test | ||
| 4 | -arg parsing, error paths, and the read-only commands (``doctor``, | ||
| 5 | -``report``, and the help surface). | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -import json | ||
| 11 | -from pathlib import Path | ||
| 12 | - | ||
| 13 | -from typer.testing import CliRunner | ||
| 14 | - | ||
| 15 | -from dlm_sway.cli.app import app | ||
| 16 | - | ||
| 17 | - | ||
| 18 | -def test_version_exits_zero() -> None: | ||
| 19 | - result = CliRunner().invoke(app, ["--version"]) | ||
| 20 | - assert result.exit_code == 0 | ||
| 21 | - assert "dlm-sway" in result.stdout | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def test_help_lists_all_commands() -> None: | ||
| 25 | - result = CliRunner().invoke(app, ["--help"]) | ||
| 26 | - assert result.exit_code == 0 | ||
| 27 | - for cmd in ("run", "gate", "check", "diff", "autogen", "doctor", "report"): | ||
| 28 | - assert cmd in result.stdout | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -def test_doctor_runs(capsys) -> None: # type: ignore[no-untyped-def] | ||
| 32 | - result = CliRunner().invoke(app, ["doctor"]) | ||
| 33 | - assert result.exit_code == 0 | ||
| 34 | - # Rich applies color codes by default; assert the bare product name appears. | ||
| 35 | - assert "dlm-sway" in result.stdout | ||
| 36 | - assert "backends" in result.stdout | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def test_run_without_file_errors(tmp_path: Path) -> None: | ||
| 40 | - missing = tmp_path / "nope.yaml" | ||
| 41 | - result = CliRunner().invoke(app, ["run", str(missing)]) | ||
| 42 | - # Exit code 2 = SwayError bubble-up; 1 = typer missing-arg; accept either. | ||
| 43 | - assert result.exit_code != 0 | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -def test_report_from_json(tmp_path: Path) -> None: | ||
| 47 | - sample = { | ||
| 48 | - "schema_version": 1, | ||
| 49 | - "sway_version": "0.1.0.dev0", | ||
| 50 | - "base_model_id": "base", | ||
| 51 | - "adapter_id": "adp", | ||
| 52 | - "score": {"overall": 0.7, "band": "healthy", "components": {}, "findings": []}, | ||
| 53 | - "probes": [ | ||
| 54 | - { | ||
| 55 | - "name": "p1", | ||
| 56 | - "kind": "delta_kl", | ||
| 57 | - "verdict": "pass", | ||
| 58 | - "score": 0.7, | ||
| 59 | - "message": "ok", | ||
| 60 | - }, | ||
| 61 | - ], | ||
| 62 | - } | ||
| 63 | - path = tmp_path / "result.json" | ||
| 64 | - path.write_text(json.dumps(sample), encoding="utf-8") | ||
| 65 | - | ||
| 66 | - terminal = CliRunner().invoke(app, ["report", str(path)]) | ||
| 67 | - assert terminal.exit_code == 0 | ||
| 68 | - assert "p1" in terminal.stdout | ||
| 69 | - | ||
| 70 | - md = CliRunner().invoke(app, ["report", str(path), "--format", "md"]) | ||
| 71 | - assert md.exit_code == 0 | ||
| 72 | - assert "dlm-sway report" in md.stdout | ||
| 73 | - | ||
| 74 | - junit = CliRunner().invoke(app, ["report", str(path), "--format", "junit"]) | ||
| 75 | - assert junit.exit_code == 0 | ||
| 76 | - assert "<testsuite" in junit.stdout | ||
| 77 | - | ||
| 78 | - | ||
| 79 | -def test_autogen_without_dlm_extra_exits_nonzero(tmp_path: Path, monkeypatch) -> None: # type: ignore[no-untyped-def] | ||
| 80 | - # Force the import path to fail so the CLI prints the extra hint. | ||
| 81 | - import builtins | ||
| 82 | - | ||
| 83 | - real_import = builtins.__import__ | ||
| 84 | - | ||
| 85 | - def fake_import(name: str, *args: object, **kwargs: object): # type: ignore[no-untyped-def] | ||
| 86 | - if name.startswith("dlm_sway.integrations.dlm"): | ||
| 87 | - raise ImportError("simulated missing extra") | ||
| 88 | - return real_import(name, *args, **kwargs) # type: ignore[no-untyped-call] | ||
| 89 | - | ||
| 90 | - monkeypatch.setattr(builtins, "__import__", fake_import) | ||
| 91 | - result = CliRunner().invoke(app, ["autogen", "any.dlm"]) | ||
| 92 | - assert result.exit_code != 0 | ||
sway/tests/unit/test_determinism.pydeleted@@ -1,47 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.determinism`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import os | ||
| 6 | -import random | ||
| 7 | - | ||
| 8 | -import numpy as np | ||
| 9 | - | ||
| 10 | -from dlm_sway.core.determinism import DeterminismSummary, seed_everything | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -class TestSeedEverything: | ||
| 14 | - def test_returns_summary(self) -> None: | ||
| 15 | - summary = seed_everything(0) | ||
| 16 | - assert isinstance(summary, DeterminismSummary) | ||
| 17 | - assert summary.seed == 0 | ||
| 18 | - assert summary.class_ in {"strict", "best_effort", "loose"} | ||
| 19 | - | ||
| 20 | - def test_idempotent_for_stdlib_random(self) -> None: | ||
| 21 | - seed_everything(42) | ||
| 22 | - a = [random.random() for _ in range(5)] | ||
| 23 | - seed_everything(42) | ||
| 24 | - b = [random.random() for _ in range(5)] | ||
| 25 | - assert a == b | ||
| 26 | - | ||
| 27 | - def test_idempotent_for_numpy(self) -> None: | ||
| 28 | - seed_everything(17) | ||
| 29 | - a = np.random.rand(5) | ||
| 30 | - seed_everything(17) | ||
| 31 | - b = np.random.rand(5) | ||
| 32 | - np.testing.assert_array_equal(a, b) | ||
| 33 | - | ||
| 34 | - def test_cublas_workspace_set_under_strict(self) -> None: | ||
| 35 | - os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) | ||
| 36 | - seed_everything(0, strict=True) | ||
| 37 | - assert os.environ.get("CUBLAS_WORKSPACE_CONFIG") == ":4096:8" | ||
| 38 | - | ||
| 39 | - def test_non_strict_does_not_set_cublas(self) -> None: | ||
| 40 | - os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) | ||
| 41 | - seed_everything(0, strict=False) | ||
| 42 | - # Non-strict mode must not leak the env var in either direction; | ||
| 43 | - # the host environment's prior value wins. | ||
| 44 | - assert ( | ||
| 45 | - "CUBLAS_WORKSPACE_CONFIG" not in os.environ | ||
| 46 | - or os.environ["CUBLAS_WORKSPACE_CONFIG"] != ":4096:8" | ||
| 47 | - ) | ||
sway/tests/unit/test_divergence.pydeleted@@ -1,73 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes._divergence`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import math | ||
| 6 | - | ||
| 7 | -import numpy as np | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.scoring import TokenDist | ||
| 10 | -from dlm_sway.probes._divergence import aligned_probs, divergence, js, kl | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -def _dist(ids: list[int], probs: list[float], vocab: int = 100) -> TokenDist: | ||
| 14 | - return TokenDist( | ||
| 15 | - token_ids=np.asarray(ids, dtype=np.int64), | ||
| 16 | - logprobs=np.log(np.asarray(probs, dtype=np.float32)), | ||
| 17 | - vocab_size=vocab, | ||
| 18 | - ) | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -class TestAligned: | ||
| 22 | - def test_identical_distributions(self) -> None: | ||
| 23 | - d = _dist([1, 2, 3], [0.5, 0.3, 0.2]) | ||
| 24 | - p, q = aligned_probs(d, d) | ||
| 25 | - np.testing.assert_allclose(p, q) | ||
| 26 | - | ||
| 27 | - def test_union_support_fills_missing(self) -> None: | ||
| 28 | - base = _dist([1, 2, 3], [0.5, 0.3, 0.2]) | ||
| 29 | - ft = _dist([2, 3, 4], [0.4, 0.4, 0.2]) | ||
| 30 | - p, q = aligned_probs(base, ft) | ||
| 31 | - assert p.shape == (4,) | ||
| 32 | - assert abs(p.sum() - 1.0) < 1e-9 | ||
| 33 | - assert abs(q.sum() - 1.0) < 1e-9 | ||
| 34 | - | ||
| 35 | - | ||
| 36 | -class TestKL: | ||
| 37 | - def test_zero_when_equal(self) -> None: | ||
| 38 | - p = np.array([0.5, 0.3, 0.2]) | ||
| 39 | - assert kl(p, p) == 0.0 | ||
| 40 | - | ||
| 41 | - def test_positive_when_different(self) -> None: | ||
| 42 | - p = np.array([0.7, 0.2, 0.1]) | ||
| 43 | - q = np.array([0.2, 0.3, 0.5]) | ||
| 44 | - assert kl(p, q) > 0.0 | ||
| 45 | - | ||
| 46 | - | ||
| 47 | -class TestJS: | ||
| 48 | - def test_zero_when_equal(self) -> None: | ||
| 49 | - p = np.array([0.5, 0.3, 0.2]) | ||
| 50 | - assert js(p, p) == 0.0 | ||
| 51 | - | ||
| 52 | - def test_symmetric(self) -> None: | ||
| 53 | - p = np.array([0.7, 0.2, 0.1]) | ||
| 54 | - q = np.array([0.2, 0.3, 0.5]) | ||
| 55 | - assert math.isclose(js(p, q), js(q, p), rel_tol=1e-9) | ||
| 56 | - | ||
| 57 | - def test_bounded_by_ln2(self) -> None: | ||
| 58 | - p = np.array([1.0, 0.0]) | ||
| 59 | - q = np.array([0.0, 1.0]) | ||
| 60 | - # With zeros handled as 0·log0 = 0 this approaches ln(2). | ||
| 61 | - assert js(p, q) <= math.log(2.0) + 1e-9 | ||
| 62 | - | ||
| 63 | - | ||
| 64 | -class TestDivergenceDispatch: | ||
| 65 | - def test_default_is_js(self) -> None: | ||
| 66 | - d1 = _dist([1, 2], [0.6, 0.4]) | ||
| 67 | - d2 = _dist([1, 2], [0.3, 0.7]) | ||
| 68 | - assert divergence(d1, d2) == divergence(d1, d2, kind="js") | ||
| 69 | - | ||
| 70 | - def test_kl_available(self) -> None: | ||
| 71 | - d1 = _dist([1, 2], [0.6, 0.4]) | ||
| 72 | - d2 = _dist([1, 2], [0.3, 0.7]) | ||
| 73 | - assert divergence(d1, d2, kind="kl") >= 0.0 | ||
sway/tests/unit/test_dlm_bridge.pydeleted@@ -1,252 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.integrations.dlm`. | ||
| 2 | - | ||
| 3 | -The bridge imports ``dlm.*`` modules lazily. We mock those via | ||
| 4 | -``sys.modules`` injection so the tests run without the ``dlm-sway[dlm]`` | ||
| 5 | -extra installed. A full end-to-end integration test against a real | ||
| 6 | -``.dlm`` lives under ``tests/integration/``. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import sys | ||
| 12 | -import types | ||
| 13 | -from dataclasses import dataclass | ||
| 14 | -from pathlib import Path | ||
| 15 | - | ||
| 16 | -import pytest | ||
| 17 | -import yaml | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -@pytest.fixture | ||
| 21 | -def fake_dlm(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path: | ||
| 22 | - """Install a fake ``dlm`` package so the resolver can import.""" | ||
| 23 | - | ||
| 24 | - # Build synthetic parsed .dlm structure. | ||
| 25 | - @dataclass | ||
| 26 | - class _Frontmatter: | ||
| 27 | - dlm_id: str = "01TESTULID" | ||
| 28 | - base_model: str = "smollm2-135m" | ||
| 29 | - | ||
| 30 | - @dataclass | ||
| 31 | - class _Section: | ||
| 32 | - section_id: str | ||
| 33 | - type: str | ||
| 34 | - content: str | ||
| 35 | - tag: str | None = None | ||
| 36 | - | ||
| 37 | - @dataclass | ||
| 38 | - class _Parsed: | ||
| 39 | - frontmatter: _Frontmatter | ||
| 40 | - sections: tuple[_Section, ...] | ||
| 41 | - | ||
| 42 | - def _parse_file(_path: Path): # type: ignore[no-untyped-def] | ||
| 43 | - return _Parsed( | ||
| 44 | - frontmatter=_Frontmatter(), | ||
| 45 | - sections=( | ||
| 46 | - _Section( | ||
| 47 | - section_id="prose-1", | ||
| 48 | - type="PROSE", | ||
| 49 | - content="This is a prose section with some information. Further detail follows.", | ||
| 50 | - ), | ||
| 51 | - _Section( | ||
| 52 | - section_id="instr-1", | ||
| 53 | - type="INSTRUCTION", | ||
| 54 | - content="### Q\nWhat is X?\n\n### A\nX is a concept\n", | ||
| 55 | - ), | ||
| 56 | - _Section( | ||
| 57 | - section_id="pref-1", | ||
| 58 | - type="PREFERENCE", | ||
| 59 | - content="chosen/rejected triple", | ||
| 60 | - ), | ||
| 61 | - ), | ||
| 62 | - ) | ||
| 63 | - | ||
| 64 | - # Fake ``dlm.doc.parser`` module. | ||
| 65 | - dlm_pkg = types.ModuleType("dlm") | ||
| 66 | - dlm_doc = types.ModuleType("dlm.doc") | ||
| 67 | - dlm_doc_parser = types.ModuleType("dlm.doc.parser") | ||
| 68 | - dlm_doc_parser.parse_file = _parse_file # type: ignore[attr-defined] | ||
| 69 | - | ||
| 70 | - # Fake ``dlm.store.paths`` that returns a resolvable path. | ||
| 71 | - dlm_store = types.ModuleType("dlm.store") | ||
| 72 | - dlm_store_paths = types.ModuleType("dlm.store.paths") | ||
| 73 | - | ||
| 74 | - adapter_dir = tmp_path / "adapter_v1" | ||
| 75 | - adapter_dir.mkdir() | ||
| 76 | - (adapter_dir / "adapter_config.json").write_text("{}", encoding="utf-8") | ||
| 77 | - | ||
| 78 | - class _StorePath: | ||
| 79 | - def __init__(self, path: Path) -> None: | ||
| 80 | - self._p = path | ||
| 81 | - | ||
| 82 | - def resolve_current_adapter(self) -> Path: | ||
| 83 | - return self._p | ||
| 84 | - | ||
| 85 | - def _for_dlm(_dlm_id: str) -> _StorePath: | ||
| 86 | - return _StorePath(adapter_dir) | ||
| 87 | - | ||
| 88 | - dlm_store_paths.StorePath = _StorePath # type: ignore[attr-defined] | ||
| 89 | - dlm_store_paths.for_dlm = _for_dlm # type: ignore[attr-defined] | ||
| 90 | - | ||
| 91 | - # Fake base-model resolver — returns a stub with an ``hf_id`` attribute. | ||
| 92 | - dlm_base = types.ModuleType("dlm.base_models") | ||
| 93 | - | ||
| 94 | - @dataclass | ||
| 95 | - class _BaseSpec: | ||
| 96 | - hf_id: str | ||
| 97 | - key: str | ||
| 98 | - | ||
| 99 | - def _resolve(key: str) -> _BaseSpec: | ||
| 100 | - return _BaseSpec(hf_id="HuggingFaceTB/SmolLM2-135M-Instruct", key=key) | ||
| 101 | - | ||
| 102 | - dlm_base.resolve = _resolve # type: ignore[attr-defined] | ||
| 103 | - | ||
| 104 | - # Fake instruction / preference parsers. | ||
| 105 | - dlm_data = types.ModuleType("dlm.data") | ||
| 106 | - dlm_data_instr = types.ModuleType("dlm.data.instruction_parser") | ||
| 107 | - dlm_data_pref = types.ModuleType("dlm.data.preference_parser") | ||
| 108 | - | ||
| 109 | - @dataclass | ||
| 110 | - class _QAPair: | ||
| 111 | - question: str | ||
| 112 | - answer: str | ||
| 113 | - | ||
| 114 | - @dataclass | ||
| 115 | - class _Triple: | ||
| 116 | - prompt: str | ||
| 117 | - chosen: str | ||
| 118 | - rejected: str | ||
| 119 | - | ||
| 120 | - def _parse_instr(body: str, *, section_id: str) -> list[_QAPair]: | ||
| 121 | - del section_id | ||
| 122 | - out: list[_QAPair] = [] | ||
| 123 | - parts = body.split("### Q") | ||
| 124 | - for part in parts[1:]: | ||
| 125 | - q_block, _, a_block = part.partition("### A") | ||
| 126 | - q = q_block.strip() | ||
| 127 | - a = a_block.strip() | ||
| 128 | - if q and a: | ||
| 129 | - out.append(_QAPair(question=q, answer=a)) | ||
| 130 | - return out | ||
| 131 | - | ||
| 132 | - def _parse_pref(body: str, *, section_id: str) -> list[_Triple]: | ||
| 133 | - del body, section_id | ||
| 134 | - return [_Triple(prompt="Which?", chosen="good answer", rejected="bad answer")] | ||
| 135 | - | ||
| 136 | - dlm_data_instr.parse_instruction_body = _parse_instr # type: ignore[attr-defined] | ||
| 137 | - dlm_data_pref.parse_preference_body = _parse_pref # type: ignore[attr-defined] | ||
| 138 | - | ||
| 139 | - monkeypatch.setitem(sys.modules, "dlm", dlm_pkg) | ||
| 140 | - monkeypatch.setitem(sys.modules, "dlm.doc", dlm_doc) | ||
| 141 | - monkeypatch.setitem(sys.modules, "dlm.doc.parser", dlm_doc_parser) | ||
| 142 | - monkeypatch.setitem(sys.modules, "dlm.store", dlm_store) | ||
| 143 | - monkeypatch.setitem(sys.modules, "dlm.store.paths", dlm_store_paths) | ||
| 144 | - monkeypatch.setitem(sys.modules, "dlm.base_models", dlm_base) | ||
| 145 | - monkeypatch.setitem(sys.modules, "dlm.data", dlm_data) | ||
| 146 | - monkeypatch.setitem(sys.modules, "dlm.data.instruction_parser", dlm_data_instr) | ||
| 147 | - monkeypatch.setitem(sys.modules, "dlm.data.preference_parser", dlm_data_pref) | ||
| 148 | - | ||
| 149 | - # Return a path to a fake .dlm file (the parser won't actually read it). | ||
| 150 | - dlm_file = tmp_path / "doc.dlm" | ||
| 151 | - dlm_file.write_text("---\ndlm_id: 01TEST\n---\n\nbody\n", encoding="utf-8") | ||
| 152 | - return dlm_file | ||
| 153 | - | ||
| 154 | - | ||
| 155 | -def test_resolve_dlm_maps_sections(fake_dlm: Path) -> None: | ||
| 156 | - from dlm_sway.integrations.dlm.resolver import resolve_dlm | ||
| 157 | - | ||
| 158 | - handle = resolve_dlm(fake_dlm) | ||
| 159 | - assert handle.dlm_id == "01TESTULID" | ||
| 160 | - assert handle.base_model == "HuggingFaceTB/SmolLM2-135M-Instruct" | ||
| 161 | - assert handle.adapter_path is not None | ||
| 162 | - assert handle.adapter_path.exists() | ||
| 163 | - assert len(handle.sections) == 3 | ||
| 164 | - # Kinds normalized from uppercase dlm enum values. | ||
| 165 | - assert {s.kind for s in handle.sections} == {"prose", "instruction", "preference"} | ||
| 166 | - # Instruction Q/A pair survived the translation. | ||
| 167 | - instr = next(s for s in handle.sections if s.kind == "instruction") | ||
| 168 | - assert instr.probes | ||
| 169 | - assert instr.probes[0].prompt == "What is X?" | ||
| 170 | - # Preference triple too. | ||
| 171 | - pref = next(s for s in handle.sections if s.kind == "preference") | ||
| 172 | - assert pref.preferences | ||
| 173 | - assert pref.preferences[0].chosen == "good answer" | ||
| 174 | - | ||
| 175 | - | ||
| 176 | -def test_resolve_without_dlm_installed(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: | ||
| 177 | - """resolve_dlm surfaces a SwayError when the dlm package is missing.""" | ||
| 178 | - # Wipe any cached dlm modules so the lazy import fails. | ||
| 179 | - for mod in list(sys.modules): | ||
| 180 | - if mod == "dlm" or mod.startswith("dlm."): | ||
| 181 | - monkeypatch.delitem(sys.modules, mod, raising=False) | ||
| 182 | - | ||
| 183 | - import builtins | ||
| 184 | - | ||
| 185 | - real_import = builtins.__import__ | ||
| 186 | - | ||
| 187 | - def fake_import(name: str, *args, **kwargs): # type: ignore[no-untyped-def] | ||
| 188 | - if name.startswith("dlm."): | ||
| 189 | - raise ImportError("missing extra") | ||
| 190 | - return real_import(name, *args, **kwargs) | ||
| 191 | - | ||
| 192 | - monkeypatch.setattr(builtins, "__import__", fake_import) | ||
| 193 | - | ||
| 194 | - from dlm_sway.core.errors import SwayError | ||
| 195 | - from dlm_sway.integrations.dlm.resolver import resolve_dlm | ||
| 196 | - | ||
| 197 | - with pytest.raises(SwayError, match="dlm package not installed"): | ||
| 198 | - resolve_dlm(tmp_path / "doc.dlm") | ||
| 199 | - | ||
| 200 | - | ||
| 201 | -def test_autogen_writes_complete_suite(fake_dlm: Path, tmp_path: Path) -> None: | ||
| 202 | - from dlm_sway.integrations.dlm.autogen import write_sway_yaml | ||
| 203 | - | ||
| 204 | - out = tmp_path / "sway.yaml" | ||
| 205 | - write_sway_yaml(fake_dlm, out) | ||
| 206 | - data = yaml.safe_load(out.read_text(encoding="utf-8")) | ||
| 207 | - | ||
| 208 | - assert data["version"] == 1 | ||
| 209 | - assert data["models"]["base"]["base"] == "HuggingFaceTB/SmolLM2-135M-Instruct" | ||
| 210 | - assert data["models"]["ft"]["adapter"] is not None | ||
| 211 | - assert data["dlm_source"] == str(fake_dlm.resolve()) | ||
| 212 | - | ||
| 213 | - kinds = {entry["kind"] for entry in data["suite"]} | ||
| 214 | - # The full 11-primitive battery minus nothing is present (some may | ||
| 215 | - # be skipped when data is absent, but here we have one of every | ||
| 216 | - # section type). | ||
| 217 | - expected = { | ||
| 218 | - "null_adapter", | ||
| 219 | - "delta_kl", | ||
| 220 | - "adapter_revert", | ||
| 221 | - "prompt_collapse", | ||
| 222 | - "section_internalization", | ||
| 223 | - "paraphrase_invariance", | ||
| 224 | - "preference_flip", | ||
| 225 | - "style_fingerprint", | ||
| 226 | - "calibration_drift", | ||
| 227 | - "leakage", | ||
| 228 | - "adapter_ablation", | ||
| 229 | - } | ||
| 230 | - assert expected <= kinds, f"missing: {expected - kinds}" | ||
| 231 | - | ||
| 232 | - | ||
| 233 | -def test_build_spec_dict_skips_preference_when_absent() -> None: | ||
| 234 | - from dlm_sway.core.sections import Section | ||
| 235 | - from dlm_sway.integrations.dlm.autogen import build_spec_dict | ||
| 236 | - from dlm_sway.integrations.dlm.resolver import DlmHandle | ||
| 237 | - | ||
| 238 | - sections = ( | ||
| 239 | - Section(id="a", kind="prose", content="A prose section. Second sentence."), | ||
| 240 | - Section(id="b", kind="prose", content="Another prose section."), | ||
| 241 | - ) | ||
| 242 | - handle = DlmHandle( | ||
| 243 | - dlm_id="x", | ||
| 244 | - base_model="base", | ||
| 245 | - adapter_path=Path("/tmp/adapter"), | ||
| 246 | - sections=sections, | ||
| 247 | - doc_text="whole document", | ||
| 248 | - ) | ||
| 249 | - spec = build_spec_dict(handle) | ||
| 250 | - kinds = {entry["kind"] for entry in spec["suite"]} | ||
| 251 | - assert "preference_flip" not in kinds | ||
| 252 | - assert "section_internalization" in kinds | ||
sway/tests/unit/test_errors.pydeleted@@ -1,55 +0,0 @@ | |||
| 1 | -"""Tests for the exception hierarchy.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import pytest | ||
| 6 | - | ||
| 7 | -from dlm_sway.core.errors import ( | ||
| 8 | - BackendNotAvailableError, | ||
| 9 | - ProbeError, | ||
| 10 | - SpecValidationError, | ||
| 11 | - SwayError, | ||
| 12 | -) | ||
| 13 | - | ||
| 14 | - | ||
| 15 | -class TestSwayError: | ||
| 16 | - def test_is_root_exception(self) -> None: | ||
| 17 | - assert issubclass(SpecValidationError, SwayError) | ||
| 18 | - assert issubclass(BackendNotAvailableError, SwayError) | ||
| 19 | - assert issubclass(ProbeError, SwayError) | ||
| 20 | - | ||
| 21 | - def test_raised_and_caught_as_sway_error(self) -> None: | ||
| 22 | - with pytest.raises(SwayError): | ||
| 23 | - raise ProbeError("delta_kl", "shape mismatch") | ||
| 24 | - | ||
| 25 | - | ||
| 26 | -class TestSpecValidationError: | ||
| 27 | - def test_format_without_source(self) -> None: | ||
| 28 | - err = SpecValidationError("unknown key 'topp'") | ||
| 29 | - assert str(err) == "unknown key 'topp'" | ||
| 30 | - assert err.source is None | ||
| 31 | - | ||
| 32 | - def test_format_with_source(self) -> None: | ||
| 33 | - err = SpecValidationError("unknown key 'topp'", source="sway.yaml") | ||
| 34 | - assert str(err) == "sway.yaml: unknown key 'topp'" | ||
| 35 | - assert err.source == "sway.yaml" | ||
| 36 | - | ||
| 37 | - | ||
| 38 | -class TestBackendNotAvailableError: | ||
| 39 | - def test_hint_rendered_in_message(self) -> None: | ||
| 40 | - err = BackendNotAvailableError("hf", extra="hf") | ||
| 41 | - assert "pip install 'dlm-sway[hf]'" in str(err) | ||
| 42 | - assert err.backend == "hf" | ||
| 43 | - assert err.extra == "hf" | ||
| 44 | - | ||
| 45 | - def test_appends_optional_hint(self) -> None: | ||
| 46 | - err = BackendNotAvailableError("mlx", extra="mlx", hint="Apple Silicon only.") | ||
| 47 | - assert "Apple Silicon only." in str(err) | ||
| 48 | - | ||
| 49 | - | ||
| 50 | -class TestProbeError: | ||
| 51 | - def test_includes_probe_name(self) -> None: | ||
| 52 | - err = ProbeError("delta_kl", "NaN logits") | ||
| 53 | - assert "delta_kl" in str(err) | ||
| 54 | - assert "NaN logits" in str(err) | ||
| 55 | - assert err.probe == "delta_kl" | ||
sway/tests/unit/test_model.pydeleted@@ -1,78 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.model`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from pathlib import Path | ||
| 6 | - | ||
| 7 | -import pytest | ||
| 8 | -from pydantic import ValidationError | ||
| 9 | - | ||
| 10 | -from dlm_sway.core.model import LoadedModel, Model, ModelSpec | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -class TestModelSpec: | ||
| 14 | - def test_defaults(self) -> None: | ||
| 15 | - spec = ModelSpec(base="HuggingFaceTB/SmolLM2-135M-Instruct") | ||
| 16 | - assert spec.kind == "hf" | ||
| 17 | - assert spec.adapter is None | ||
| 18 | - assert spec.dtype == "auto" | ||
| 19 | - assert spec.device == "auto" | ||
| 20 | - assert spec.trust_remote_code is False | ||
| 21 | - assert spec.entry_point is None | ||
| 22 | - | ||
| 23 | - def test_frozen(self) -> None: | ||
| 24 | - spec = ModelSpec(base="x") | ||
| 25 | - with pytest.raises(ValidationError): | ||
| 26 | - spec.base = "y" # type: ignore[misc] | ||
| 27 | - | ||
| 28 | - def test_extra_fields_forbidden(self) -> None: | ||
| 29 | - with pytest.raises(ValidationError) as exc_info: | ||
| 30 | - ModelSpec(base="x", bogus="y") # type: ignore[call-arg] | ||
| 31 | - assert "bogus" in str(exc_info.value).lower() | ||
| 32 | - | ||
| 33 | - def test_kind_enum(self) -> None: | ||
| 34 | - ModelSpec(base="x", kind="hf") | ||
| 35 | - ModelSpec(base="x", kind="mlx") | ||
| 36 | - ModelSpec(base="x", kind="dummy") | ||
| 37 | - ModelSpec(base="x", kind="custom", entry_point="pkg.mod:Backend") | ||
| 38 | - with pytest.raises(ValidationError): | ||
| 39 | - ModelSpec(base="x", kind="ollama") # type: ignore[arg-type] | ||
| 40 | - | ||
| 41 | - def test_adapter_coerced_to_path(self) -> None: | ||
| 42 | - spec = ModelSpec(base="x", adapter="/tmp/adapter") # type: ignore[arg-type] | ||
| 43 | - assert isinstance(spec.adapter, Path) | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -class TestLoadedModel: | ||
| 47 | - def test_frozen_dataclass(self) -> None: | ||
| 48 | - loaded = LoadedModel( | ||
| 49 | - id="base", | ||
| 50 | - spec=ModelSpec(base="x"), | ||
| 51 | - model=object(), | ||
| 52 | - tokenizer=object(), | ||
| 53 | - meta={"device": "cpu"}, | ||
| 54 | - ) | ||
| 55 | - assert loaded.id == "base" | ||
| 56 | - assert loaded.meta["device"] == "cpu" | ||
| 57 | - | ||
| 58 | - | ||
| 59 | -class TestModelProtocol: | ||
| 60 | - def test_runtime_checkable(self) -> None: | ||
| 61 | - class FakeModel: | ||
| 62 | - id = "x" | ||
| 63 | - | ||
| 64 | - def generate( | ||
| 65 | - self, | ||
| 66 | - prompt: str, | ||
| 67 | - *, | ||
| 68 | - max_new_tokens: int, | ||
| 69 | - temperature: float = 0.0, | ||
| 70 | - top_p: float = 1.0, | ||
| 71 | - seed: int = 0, | ||
| 72 | - ) -> str: | ||
| 73 | - return f"{prompt}|{max_new_tokens}" | ||
| 74 | - | ||
| 75 | - def close(self) -> None: | ||
| 76 | - return None | ||
| 77 | - | ||
| 78 | - assert isinstance(FakeModel(), Model) | ||
sway/tests/unit/test_null_calibration.pydeleted@@ -1,123 +0,0 @@ | |||
| 1 | -"""Tests for null-adapter calibration. | ||
| 2 | - | ||
| 3 | -Covers: dummy backend ``as_null_adapter`` yields a plausibly noisy | ||
| 4 | -view; ``NullAdapterProbe`` populates ``ctx.null_stats`` in a way | ||
| 5 | -downstream probes pick up end-to-end; missing-capability SKIP path. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -import numpy as np | ||
| 11 | - | ||
| 12 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 13 | -from dlm_sway.core.result import Verdict | ||
| 14 | -from dlm_sway.core.scoring import NullCalibratedBackend | ||
| 15 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 16 | -from dlm_sway.suite.runner import run as run_suite | ||
| 17 | -from dlm_sway.suite.spec import SwaySpec | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -def _diverging_backend() -> DummyDifferentialBackend: | ||
| 21 | - base = DummyResponses() | ||
| 22 | - ft = DummyResponses() | ||
| 23 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 24 | - | ||
| 25 | - | ||
| 26 | -class TestProtocolConformance: | ||
| 27 | - def test_dummy_is_null_calibrated(self) -> None: | ||
| 28 | - assert isinstance(_diverging_backend(), NullCalibratedBackend) | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -class TestAsNullAdapter: | ||
| 32 | - def test_yields_perturbed_view(self) -> None: | ||
| 33 | - backend = _diverging_backend() | ||
| 34 | - with backend.as_base() as base: | ||
| 35 | - base_dist = base.next_token_dist("hello") | ||
| 36 | - with backend.as_null_adapter(seed=0) as null: | ||
| 37 | - null_dist = null.next_token_dist("hello") | ||
| 38 | - # Some perturbation, but bounded. | ||
| 39 | - assert not np.allclose(base_dist.logprobs, null_dist.logprobs) | ||
| 40 | - | ||
| 41 | - def test_different_seeds_yield_different_views(self) -> None: | ||
| 42 | - backend = _diverging_backend() | ||
| 43 | - with backend.as_null_adapter(seed=1) as v1: | ||
| 44 | - d1 = v1.next_token_dist("hello") | ||
| 45 | - with backend.as_null_adapter(seed=2) as v2: | ||
| 46 | - d2 = v2.next_token_dist("hello") | ||
| 47 | - assert not np.allclose(d1.logprobs, d2.logprobs) | ||
| 48 | - | ||
| 49 | - def test_view_exclusion_enforced(self) -> None: | ||
| 50 | - import pytest | ||
| 51 | - | ||
| 52 | - backend = _diverging_backend() | ||
| 53 | - with backend.as_null_adapter(seed=0), pytest.raises(RuntimeError): | ||
| 54 | - with backend.as_base(): | ||
| 55 | - pass | ||
| 56 | - | ||
| 57 | - | ||
| 58 | -class TestProbe: | ||
| 59 | - def test_populates_null_stats(self) -> None: | ||
| 60 | - backend = _diverging_backend() | ||
| 61 | - probe, spec = build_probe( | ||
| 62 | - { | ||
| 63 | - "name": "null", | ||
| 64 | - "kind": "null_adapter", | ||
| 65 | - "runs": 3, | ||
| 66 | - "prompts": ["q1", "q2"], | ||
| 67 | - } | ||
| 68 | - ) | ||
| 69 | - ctx = RunContext(backend=backend) | ||
| 70 | - result = probe.run(spec, ctx) | ||
| 71 | - assert result.verdict == Verdict.PASS | ||
| 72 | - stats = result.evidence["null_stats"] | ||
| 73 | - assert "delta_kl" in stats | ||
| 74 | - assert stats["delta_kl"]["n"] == 3.0 | ||
| 75 | - assert stats["delta_kl"]["std"] > 0.0 # seeded perturbations produce variance | ||
| 76 | - | ||
| 77 | - def test_runner_threads_null_stats_to_subsequent_probes(self) -> None: | ||
| 78 | - """End-to-end: null_adapter first → delta_kl picks up z-score path.""" | ||
| 79 | - backend = _diverging_backend() | ||
| 80 | - raw_spec = SwaySpec.model_validate( | ||
| 81 | - { | ||
| 82 | - "version": 1, | ||
| 83 | - "models": {"base": {"base": "b"}, "ft": {"base": "b", "adapter": "/tmp/a"}}, | ||
| 84 | - "suite": [ | ||
| 85 | - { | ||
| 86 | - "name": "null", | ||
| 87 | - "kind": "null_adapter", | ||
| 88 | - "runs": 3, | ||
| 89 | - "prompts": ["p1", "p2"], | ||
| 90 | - }, | ||
| 91 | - { | ||
| 92 | - "name": "dk", | ||
| 93 | - "kind": "delta_kl", | ||
| 94 | - "prompts": ["p1", "p2"], | ||
| 95 | - "assert_z_gte": -10.0, # permissive so we pass regardless | ||
| 96 | - }, | ||
| 97 | - ], | ||
| 98 | - } | ||
| 99 | - ) | ||
| 100 | - result = run_suite(raw_spec, backend) | ||
| 101 | - assert len(result.probes) == 2 | ||
| 102 | - null_result = result.probes[0] | ||
| 103 | - dk_result = result.probes[1] | ||
| 104 | - assert null_result.verdict == Verdict.PASS | ||
| 105 | - # The delta_kl probe should have computed a z_score because null_stats was present. | ||
| 106 | - assert dk_result.z_score is not None, ( | ||
| 107 | - "delta_kl should have z-scored against null baseline, got " | ||
| 108 | - f"evidence={dk_result.evidence}, message={dk_result.message}" | ||
| 109 | - ) | ||
| 110 | - | ||
| 111 | - def test_skip_when_backend_not_null_calibrated(self) -> None: | ||
| 112 | - class _Bare: | ||
| 113 | - def as_base(self): # noqa: ANN202 | ||
| 114 | - raise NotImplementedError | ||
| 115 | - | ||
| 116 | - def as_finetuned(self): # noqa: ANN202 | ||
| 117 | - raise NotImplementedError | ||
| 118 | - | ||
| 119 | - probe, spec = build_probe({"name": "null", "kind": "null_adapter"}) | ||
| 120 | - ctx = RunContext(backend=_Bare()) # type: ignore[arg-type] | ||
| 121 | - result = probe.run(spec, ctx) | ||
| 122 | - assert result.verdict == Verdict.SKIP | ||
| 123 | - assert "NullCalibratedBackend" in result.message | ||
sway/tests/unit/test_probe_adapter_ablation.pydeleted@@ -1,135 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.adapter_ablation`. | ||
| 2 | - | ||
| 3 | -Uses the dummy backend's lam-interpolation implementation to exercise | ||
| 4 | -the full probe path without loading a real model. | ||
| 5 | -""" | ||
| 6 | - | ||
| 7 | -from __future__ import annotations | ||
| 8 | - | ||
| 9 | -import numpy as np | ||
| 10 | - | ||
| 11 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 12 | -from dlm_sway.core.result import Verdict | ||
| 13 | -from dlm_sway.core.scoring import ScalableDifferentialBackend, TokenDist | ||
| 14 | -from dlm_sway.probes.adapter_ablation import ( | ||
| 15 | - _overshoot, | ||
| 16 | - _r_squared, | ||
| 17 | - _saturation_lambda, | ||
| 18 | -) | ||
| 19 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 20 | - | ||
| 21 | - | ||
| 22 | -class TestShapeMetrics: | ||
| 23 | - def test_r_squared_perfect_linear(self) -> None: | ||
| 24 | - x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64) | ||
| 25 | - y = 2 * x + 0.1 | ||
| 26 | - assert _r_squared(x, y) > 0.99 | ||
| 27 | - | ||
| 28 | - def test_r_squared_zero_slope_defined(self) -> None: | ||
| 29 | - x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64) | ||
| 30 | - y = np.zeros_like(x) | ||
| 31 | - # Flat y → ss_tot = 0 → defined as 1.0 (perfect fit). | ||
| 32 | - assert _r_squared(x, y) == 1.0 | ||
| 33 | - | ||
| 34 | - def test_saturation_lambda_expected(self) -> None: | ||
| 35 | - lambdas = np.asarray([0.0, 0.25, 0.5, 0.75, 1.0], dtype=np.float64) | ||
| 36 | - divs = np.asarray([0.0, 0.5, 0.8, 0.95, 1.0], dtype=np.float64) | ||
| 37 | - sat = _saturation_lambda(lambdas, divs) | ||
| 38 | - assert sat == 0.75 # 0.95 / 1.0 = 0.95 ≥ 0.9 | ||
| 39 | - | ||
| 40 | - def test_overshoot_recovered(self) -> None: | ||
| 41 | - lambdas = np.asarray([0.0, 0.5, 1.0, 1.25], dtype=np.float64) | ||
| 42 | - divs = np.asarray([0.0, 0.5, 1.0, 1.15], dtype=np.float64) | ||
| 43 | - assert _overshoot(lambdas, divs) == 1.15 | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -def _diverging_backend() -> DummyDifferentialBackend: | ||
| 47 | - """Backend where base ≠ ft at a few prompts; distributions interpolate | ||
| 48 | - smoothly under lam-blending in DummyDifferentialBackend.as_scaled_adapter.""" | ||
| 49 | - base = DummyResponses( | ||
| 50 | - token_dists={ | ||
| 51 | - "q1": TokenDist( | ||
| 52 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 53 | - logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)), | ||
| 54 | - vocab_size=100, | ||
| 55 | - ), | ||
| 56 | - "q2": TokenDist( | ||
| 57 | - token_ids=np.array([5, 6], dtype=np.int64), | ||
| 58 | - logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)), | ||
| 59 | - vocab_size=100, | ||
| 60 | - ), | ||
| 61 | - } | ||
| 62 | - ) | ||
| 63 | - ft = DummyResponses( | ||
| 64 | - token_dists={ | ||
| 65 | - "q1": TokenDist( | ||
| 66 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 67 | - logprobs=np.log(np.array([0.2, 0.4, 0.4], dtype=np.float32)), | ||
| 68 | - vocab_size=100, | ||
| 69 | - ), | ||
| 70 | - "q2": TokenDist( | ||
| 71 | - token_ids=np.array([5, 6], dtype=np.int64), | ||
| 72 | - logprobs=np.log(np.array([0.3, 0.7], dtype=np.float32)), | ||
| 73 | - vocab_size=100, | ||
| 74 | - ), | ||
| 75 | - } | ||
| 76 | - ) | ||
| 77 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 78 | - | ||
| 79 | - | ||
| 80 | -class TestProbe: | ||
| 81 | - def test_backend_implements_scalable_protocol(self) -> None: | ||
| 82 | - backend = _diverging_backend() | ||
| 83 | - assert isinstance(backend, ScalableDifferentialBackend) | ||
| 84 | - | ||
| 85 | - def test_probe_runs_and_emits_shape_metrics(self) -> None: | ||
| 86 | - probe, spec = build_probe( | ||
| 87 | - { | ||
| 88 | - "name": "abl", | ||
| 89 | - "kind": "adapter_ablation", | ||
| 90 | - "prompts": ["q1", "q2"], | ||
| 91 | - "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25], | ||
| 92 | - # Very permissive to tolerate the log-space blend of a | ||
| 93 | - # tiny synthetic fixture. | ||
| 94 | - "assert_linearity_gte": 0.3, | ||
| 95 | - "assert_overshoot_gte": 1.0, | ||
| 96 | - } | ||
| 97 | - ) | ||
| 98 | - ctx = RunContext(backend=_diverging_backend()) | ||
| 99 | - result = probe.run(spec, ctx) | ||
| 100 | - assert result.verdict in (Verdict.PASS, Verdict.FAIL) | ||
| 101 | - assert "lambdas" in result.evidence | ||
| 102 | - assert "mean_divergence_per_lambda" in result.evidence | ||
| 103 | - assert len(result.evidence["mean_divergence_per_lambda"]) == 6 | ||
| 104 | - # Divergence should increase as λ grows from 0 toward ft. | ||
| 105 | - divs = result.evidence["mean_divergence_per_lambda"] | ||
| 106 | - # λ=0 → 0 divergence from itself. λ>0 should be non-decreasing | ||
| 107 | - # for the bulk of the curve. | ||
| 108 | - assert divs[-2] >= divs[0] | ||
| 109 | - | ||
| 110 | - def test_skip_when_backend_not_scalable(self) -> None: | ||
| 111 | - class _NonScalable: | ||
| 112 | - def as_base(self): # noqa: ANN202 | ||
| 113 | - raise NotImplementedError | ||
| 114 | - | ||
| 115 | - def as_finetuned(self): # noqa: ANN202 | ||
| 116 | - raise NotImplementedError | ||
| 117 | - | ||
| 118 | - probe, spec = build_probe( | ||
| 119 | - { | ||
| 120 | - "name": "abl", | ||
| 121 | - "kind": "adapter_ablation", | ||
| 122 | - "prompts": ["q1"], | ||
| 123 | - } | ||
| 124 | - ) | ||
| 125 | - ctx = RunContext(backend=_NonScalable()) # type: ignore[arg-type] | ||
| 126 | - result = probe.run(spec, ctx) | ||
| 127 | - assert result.verdict == Verdict.SKIP | ||
| 128 | - assert "ScalableDifferentialBackend" in result.message | ||
| 129 | - | ||
| 130 | - def test_error_on_empty_prompts(self) -> None: | ||
| 131 | - backend = _diverging_backend() | ||
| 132 | - probe, spec = build_probe({"name": "abl", "kind": "adapter_ablation", "prompts": []}) | ||
| 133 | - ctx = RunContext(backend=backend) | ||
| 134 | - result = probe.run(spec, ctx) | ||
| 135 | - assert result.verdict == Verdict.ERROR | ||
sway/tests/unit/test_probe_adapter_revert.pydeleted@@ -1,170 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.adapter_revert`. | ||
| 2 | - | ||
| 3 | -We stub out the embedder so these tests don't need sentence-transformers | ||
| 4 | -installed. The ``probe.py`` SKIP path for the missing-extra case is | ||
| 5 | -covered separately by monkeypatching the importer. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -from typing import Any | ||
| 11 | - | ||
| 12 | -import numpy as np | ||
| 13 | -import pytest | ||
| 14 | - | ||
| 15 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 16 | -from dlm_sway.core.result import Verdict | ||
| 17 | -from dlm_sway.probes.adapter_revert import AdapterRevertProbe | ||
| 18 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -def _backend(*, ft_like_base: bool = False) -> DummyDifferentialBackend: | ||
| 22 | - base = DummyResponses( | ||
| 23 | - generations={ | ||
| 24 | - "pp1": "cats are mammals", | ||
| 25 | - "pp2": "cats have fur", | ||
| 26 | - } | ||
| 27 | - ) | ||
| 28 | - if ft_like_base: | ||
| 29 | - ft_gens = dict(base.generations) | ||
| 30 | - else: | ||
| 31 | - ft_gens = { | ||
| 32 | - "pp1": "dolphins are mammals", | ||
| 33 | - "pp2": "dolphins are smart", | ||
| 34 | - } | ||
| 35 | - ft = DummyResponses(generations=ft_gens) | ||
| 36 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -def _stub_embedder(text_to_vec: dict[str, np.ndarray]): # type: ignore[no-untyped-def] | ||
| 40 | - def _encode(texts: list[str]): # type: ignore[no-untyped-def] | ||
| 41 | - return np.stack([text_to_vec[t] for t in texts]) | ||
| 42 | - | ||
| 43 | - return _encode | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -@pytest.fixture | ||
| 47 | -def monkeyed_embed(monkeypatch: pytest.MonkeyPatch) -> dict[str, np.ndarray]: | ||
| 48 | - """Install a stub embedder with a controllable text→vec mapping. | ||
| 49 | - | ||
| 50 | - Tests populate the dict before calling ``probe.run()``. | ||
| 51 | - """ | ||
| 52 | - table: dict[str, np.ndarray] = {} | ||
| 53 | - monkeypatch.setattr( | ||
| 54 | - "dlm_sway.probes.adapter_revert._load_embedder", | ||
| 55 | - lambda _model_id: _stub_embedder(table), # type: ignore[arg-type] | ||
| 56 | - ) | ||
| 57 | - return table | ||
| 58 | - | ||
| 59 | - | ||
| 60 | -class TestAdapterRevert: | ||
| 61 | - def test_healthy_adapter_passes(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | ||
| 62 | - # gold and ft-outputs cluster together, base outputs cluster elsewhere. | ||
| 63 | - monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0]) | ||
| 64 | - monkeyed_embed["cats have fur"] = np.array([1.0, 0.0]) | ||
| 65 | - monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0]) | ||
| 66 | - monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0]) | ||
| 67 | - monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0]) # gold | ||
| 68 | - | ||
| 69 | - probe, spec = build_probe( | ||
| 70 | - { | ||
| 71 | - "name": "rev", | ||
| 72 | - "kind": "adapter_revert", | ||
| 73 | - "cases": [ | ||
| 74 | - { | ||
| 75 | - "prompt": "anything", | ||
| 76 | - "gold": "the answer is dolphins", | ||
| 77 | - "paraphrases": ["pp1", "pp2"], | ||
| 78 | - } | ||
| 79 | - ], | ||
| 80 | - "assert_revert_rate_lt": 0.25, | ||
| 81 | - } | ||
| 82 | - ) | ||
| 83 | - ctx = RunContext(backend=_backend(ft_like_base=False)) | ||
| 84 | - result = probe.run(spec, ctx) | ||
| 85 | - assert result.verdict == Verdict.PASS | ||
| 86 | - assert result.raw == 0.0 | ||
| 87 | - | ||
| 88 | - def test_reverting_adapter_fails(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | ||
| 89 | - # ft matches base (reverted), diverges from gold. | ||
| 90 | - monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0]) | ||
| 91 | - monkeyed_embed["cats have fur"] = np.array([1.0, 0.0]) | ||
| 92 | - monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0]) # gold | ||
| 93 | - | ||
| 94 | - probe, spec = build_probe( | ||
| 95 | - { | ||
| 96 | - "name": "rev", | ||
| 97 | - "kind": "adapter_revert", | ||
| 98 | - "cases": [ | ||
| 99 | - { | ||
| 100 | - "prompt": "anything", | ||
| 101 | - "gold": "the answer is dolphins", | ||
| 102 | - "paraphrases": ["pp1", "pp2"], | ||
| 103 | - } | ||
| 104 | - ], | ||
| 105 | - } | ||
| 106 | - ) | ||
| 107 | - ctx = RunContext(backend=_backend(ft_like_base=True)) | ||
| 108 | - result = probe.run(spec, ctx) | ||
| 109 | - assert result.verdict == Verdict.FAIL | ||
| 110 | - assert result.raw == 1.0 # 100% revert | ||
| 111 | - | ||
| 112 | - def test_trivially_similar_cases_dropped(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | ||
| 113 | - # base and gold are identical → drop. | ||
| 114 | - v = np.array([1.0, 0.0]) | ||
| 115 | - monkeyed_embed["cats are mammals"] = v | ||
| 116 | - monkeyed_embed["cats have fur"] = v | ||
| 117 | - monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0]) | ||
| 118 | - monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0]) | ||
| 119 | - monkeyed_embed["cats are mammals too"] = v # gold — matches base | ||
| 120 | - | ||
| 121 | - probe, spec = build_probe( | ||
| 122 | - { | ||
| 123 | - "name": "rev", | ||
| 124 | - "kind": "adapter_revert", | ||
| 125 | - "cases": [ | ||
| 126 | - { | ||
| 127 | - "prompt": "anything", | ||
| 128 | - "gold": "cats are mammals too", | ||
| 129 | - "paraphrases": ["pp1", "pp2"], | ||
| 130 | - } | ||
| 131 | - ], | ||
| 132 | - } | ||
| 133 | - ) | ||
| 134 | - ctx = RunContext(backend=_backend(ft_like_base=False)) | ||
| 135 | - result = probe.run(spec, ctx) | ||
| 136 | - # Both paraphrase pairs trivially similar → WARN (no separable signal). | ||
| 137 | - assert result.verdict == Verdict.WARN | ||
| 138 | - assert result.evidence["dropped_trivial"] == 2 | ||
| 139 | - | ||
| 140 | - def test_no_cases_errors(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | ||
| 141 | - probe, spec = build_probe({"name": "rev", "kind": "adapter_revert", "cases": []}) | ||
| 142 | - ctx = RunContext(backend=_backend()) | ||
| 143 | - result = probe.run(spec, ctx) | ||
| 144 | - assert result.verdict == Verdict.ERROR | ||
| 145 | - | ||
| 146 | - | ||
| 147 | -class TestMissingSemsim: | ||
| 148 | - def test_skip_when_sentence_transformers_missing(self, monkeypatch: pytest.MonkeyPatch) -> None: | ||
| 149 | - from dlm_sway.core.errors import BackendNotAvailableError | ||
| 150 | - | ||
| 151 | - def raiser(_model_id: Any) -> Any: # type: ignore[no-untyped-def] | ||
| 152 | - raise BackendNotAvailableError( | ||
| 153 | - "adapter_revert", | ||
| 154 | - extra="semsim", | ||
| 155 | - hint="adapter_revert relies on sentence embeddings.", | ||
| 156 | - ) | ||
| 157 | - | ||
| 158 | - monkeypatch.setattr( | ||
| 159 | - "dlm_sway.probes.adapter_revert._load_embedder", | ||
| 160 | - raiser, # type: ignore[arg-type] | ||
| 161 | - ) | ||
| 162 | - probe = AdapterRevertProbe() | ||
| 163 | - spec = probe.spec_cls( | ||
| 164 | - name="rev", | ||
| 165 | - cases=[{"prompt": "x", "gold": "y", "paraphrases": ["pp1"]}], # type: ignore[list-item] | ||
| 166 | - ) | ||
| 167 | - ctx = RunContext(backend=_backend()) | ||
| 168 | - result = probe.run(spec, ctx) | ||
| 169 | - assert result.verdict == Verdict.SKIP | ||
| 170 | - assert "semsim" in result.message | ||
sway/tests/unit/test_probe_base.pydeleted@@ -1,69 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.base`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from typing import Literal | ||
| 6 | - | ||
| 7 | -import pytest | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.errors import SpecValidationError | ||
| 10 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 11 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, build_probe, registry | ||
| 12 | - | ||
| 13 | - | ||
| 14 | -class _DummySpec(ProbeSpec): | ||
| 15 | - kind: Literal["__test_dummy"] = "__test_dummy" | ||
| 16 | - payload: str = "x" | ||
| 17 | - | ||
| 18 | - | ||
| 19 | -class _DummyProbe(Probe): | ||
| 20 | - kind = "__test_dummy" | ||
| 21 | - spec_cls = _DummySpec | ||
| 22 | - category = "adherence" | ||
| 23 | - | ||
| 24 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 25 | - assert isinstance(spec, _DummySpec) | ||
| 26 | - return ProbeResult( | ||
| 27 | - name=spec.name, | ||
| 28 | - kind=spec.kind, | ||
| 29 | - verdict=Verdict.PASS, | ||
| 30 | - score=1.0, | ||
| 31 | - message=spec.payload, | ||
| 32 | - ) | ||
| 33 | - | ||
| 34 | - | ||
| 35 | -class TestRegistry: | ||
| 36 | - def test_autoregister(self) -> None: | ||
| 37 | - assert "__test_dummy" in registry() | ||
| 38 | - assert registry()["__test_dummy"] is _DummyProbe | ||
| 39 | - | ||
| 40 | - def test_duplicate_kind_rejected(self) -> None: | ||
| 41 | - with pytest.raises(ValueError, match="duplicate probe kind"): | ||
| 42 | - | ||
| 43 | - class _Clash(Probe): | ||
| 44 | - kind = "__test_dummy" | ||
| 45 | - spec_cls = _DummySpec | ||
| 46 | - | ||
| 47 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 48 | - raise NotImplementedError | ||
| 49 | - | ||
| 50 | - | ||
| 51 | -class TestBuildProbe: | ||
| 52 | - def test_valid_entry(self) -> None: | ||
| 53 | - probe, spec = build_probe({"name": "t", "kind": "__test_dummy", "payload": "hi"}) | ||
| 54 | - assert isinstance(probe, _DummyProbe) | ||
| 55 | - assert isinstance(spec, _DummySpec) | ||
| 56 | - assert spec.payload == "hi" | ||
| 57 | - | ||
| 58 | - def test_unknown_kind(self) -> None: | ||
| 59 | - with pytest.raises(SpecValidationError, match="unknown probe kind"): | ||
| 60 | - build_probe({"name": "t", "kind": "no_such_kind"}) | ||
| 61 | - | ||
| 62 | - def test_missing_kind(self) -> None: | ||
| 63 | - with pytest.raises(SpecValidationError, match="missing string 'kind'"): | ||
| 64 | - build_probe({"name": "t"}) | ||
| 65 | - | ||
| 66 | - def test_extra_field_forbidden(self) -> None: | ||
| 67 | - with pytest.raises(SpecValidationError) as exc_info: | ||
| 68 | - build_probe({"name": "t", "kind": "__test_dummy", "bogus": "y"}) | ||
| 69 | - assert "bogus" in str(exc_info.value).lower() | ||
sway/tests/unit/test_probe_calibration_drift.pydeleted@@ -1,57 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.calibration_drift`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 6 | -from dlm_sway.core.result import Verdict | ||
| 7 | -from dlm_sway.probes._calibration_pack import BUILT_IN_PACK | ||
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 9 | - | ||
| 10 | - | ||
| 11 | -def _backend(delta_per_token: float) -> DummyDifferentialBackend: | ||
| 12 | - """Apply a uniform per-token logprob delta across every item.""" | ||
| 13 | - base_lp: dict[tuple[str, str], float] = {} | ||
| 14 | - ft_lp: dict[tuple[str, str], float] = {} | ||
| 15 | - for prompt, gold in BUILT_IN_PACK: | ||
| 16 | - base_lp[(prompt, gold)] = -5.0 * max(len(gold) // 4, 1) | ||
| 17 | - ft_lp[(prompt, gold)] = base_lp[(prompt, gold)] + delta_per_token * max(len(gold) // 4, 1) | ||
| 18 | - return DummyDifferentialBackend( | ||
| 19 | - base=DummyResponses(logprobs=base_lp), | ||
| 20 | - ft=DummyResponses(logprobs=ft_lp), | ||
| 21 | - ) | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -class TestCalibrationDrift: | ||
| 25 | - def test_healthy_when_no_regression(self) -> None: | ||
| 26 | - backend = _backend(delta_per_token=0.0) # no drift | ||
| 27 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | ||
| 28 | - ctx = RunContext(backend=backend) | ||
| 29 | - result = probe.run(spec, ctx) | ||
| 30 | - assert result.verdict == Verdict.PASS | ||
| 31 | - assert result.raw == 0.0 # zero fraction regressed | ||
| 32 | - | ||
| 33 | - def test_fail_on_uniform_large_regression(self) -> None: | ||
| 34 | - backend = _backend(delta_per_token=-2.0) # every item regresses | ||
| 35 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | ||
| 36 | - ctx = RunContext(backend=backend) | ||
| 37 | - result = probe.run(spec, ctx) | ||
| 38 | - assert result.verdict == Verdict.FAIL | ||
| 39 | - assert result.raw == 1.0 | ||
| 40 | - | ||
| 41 | - def test_respects_items_limit(self) -> None: | ||
| 42 | - backend = _backend(delta_per_token=0.0) | ||
| 43 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift", "items_limit": 5}) | ||
| 44 | - ctx = RunContext(backend=backend) | ||
| 45 | - result = probe.run(spec, ctx) | ||
| 46 | - assert result.evidence["total_items"] == 5 | ||
| 47 | - | ||
| 48 | - def test_worst_offenders_reported(self) -> None: | ||
| 49 | - backend = _backend(delta_per_token=-2.0) | ||
| 50 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | ||
| 51 | - ctx = RunContext(backend=backend) | ||
| 52 | - result = probe.run(spec, ctx) | ||
| 53 | - worst = result.evidence["worst_offenders"] | ||
| 54 | - assert len(worst) <= 5 | ||
| 55 | - # Each worst-offender record carries prompt/gold/delta fields. | ||
| 56 | - if worst: | ||
| 57 | - assert {"prompt", "gold", "delta"} <= set(worst[0].keys()) | ||
sway/tests/unit/test_probe_delta_kl.pydeleted@@ -1,124 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.delta_kl`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import numpy as np | ||
| 6 | - | ||
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 8 | -from dlm_sway.core.result import Verdict | ||
| 9 | -from dlm_sway.core.scoring import TokenDist | ||
| 10 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -def _diverging_backend() -> DummyDifferentialBackend: | ||
| 14 | - """Base peaks tightly on token 1; ft is broad uniform. Real divergence.""" | ||
| 15 | - base = DummyResponses( | ||
| 16 | - token_dists={ | ||
| 17 | - "q1": TokenDist( | ||
| 18 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 19 | - logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)), | ||
| 20 | - vocab_size=100, | ||
| 21 | - ), | ||
| 22 | - "q2": TokenDist( | ||
| 23 | - token_ids=np.array([5, 6], dtype=np.int64), | ||
| 24 | - logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)), | ||
| 25 | - vocab_size=100, | ||
| 26 | - ), | ||
| 27 | - } | ||
| 28 | - ) | ||
| 29 | - ft = DummyResponses( | ||
| 30 | - token_dists={ | ||
| 31 | - "q1": TokenDist( | ||
| 32 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 33 | - logprobs=np.log(np.array([0.3, 0.35, 0.35], dtype=np.float32)), | ||
| 34 | - vocab_size=100, | ||
| 35 | - ), | ||
| 36 | - "q2": TokenDist( | ||
| 37 | - token_ids=np.array([5, 6], dtype=np.int64), | ||
| 38 | - logprobs=np.log(np.array([0.4, 0.6], dtype=np.float32)), | ||
| 39 | - vocab_size=100, | ||
| 40 | - ), | ||
| 41 | - } | ||
| 42 | - ) | ||
| 43 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -def _identical_backend() -> DummyDifferentialBackend: | ||
| 47 | - dist = TokenDist( | ||
| 48 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 49 | - logprobs=np.log(np.array([0.5, 0.3, 0.2], dtype=np.float32)), | ||
| 50 | - vocab_size=100, | ||
| 51 | - ) | ||
| 52 | - base = DummyResponses(token_dists={"q1": dist}) | ||
| 53 | - ft = DummyResponses(token_dists={"q1": dist}) | ||
| 54 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 55 | - | ||
| 56 | - | ||
| 57 | -class TestDeltaKL: | ||
| 58 | - def test_passes_when_distributions_diverge(self) -> None: | ||
| 59 | - probe, spec = build_probe( | ||
| 60 | - { | ||
| 61 | - "name": "dk", | ||
| 62 | - "kind": "delta_kl", | ||
| 63 | - "prompts": ["q1", "q2"], | ||
| 64 | - "assert_mean_gte": 0.01, | ||
| 65 | - } | ||
| 66 | - ) | ||
| 67 | - ctx = RunContext(backend=_diverging_backend()) | ||
| 68 | - result = probe.run(spec, ctx) | ||
| 69 | - assert result.verdict == Verdict.PASS | ||
| 70 | - assert result.raw is not None | ||
| 71 | - assert result.raw > 0.01 | ||
| 72 | - assert result.evidence["num_prompts"] == 2 | ||
| 73 | - assert len(result.evidence["per_prompt"]) == 2 | ||
| 74 | - | ||
| 75 | - def test_fails_when_distributions_identical(self) -> None: | ||
| 76 | - probe, spec = build_probe( | ||
| 77 | - { | ||
| 78 | - "name": "dk", | ||
| 79 | - "kind": "delta_kl", | ||
| 80 | - "prompts": ["q1"], | ||
| 81 | - "assert_mean_gte": 0.01, | ||
| 82 | - } | ||
| 83 | - ) | ||
| 84 | - ctx = RunContext(backend=_identical_backend()) | ||
| 85 | - result = probe.run(spec, ctx) | ||
| 86 | - assert result.verdict == Verdict.FAIL | ||
| 87 | - assert result.raw == 0.0 | ||
| 88 | - | ||
| 89 | - def test_z_score_path_when_null_stats_present(self) -> None: | ||
| 90 | - probe, spec = build_probe( | ||
| 91 | - { | ||
| 92 | - "name": "dk", | ||
| 93 | - "kind": "delta_kl", | ||
| 94 | - "prompts": ["q1"], | ||
| 95 | - "assert_z_gte": 2.0, | ||
| 96 | - } | ||
| 97 | - ) | ||
| 98 | - null_stats = {"delta_kl": {"mean": 0.01, "std": 0.01, "n": 3.0}} | ||
| 99 | - ctx = RunContext(backend=_diverging_backend(), null_stats=null_stats) | ||
| 100 | - result = probe.run(spec, ctx) | ||
| 101 | - assert result.z_score is not None | ||
| 102 | - # Our synthetic ft diverges ~0.1+, far above μ=0.01, σ=0.01 → huge z. | ||
| 103 | - assert result.z_score > 2.0 | ||
| 104 | - assert result.verdict == Verdict.PASS | ||
| 105 | - | ||
| 106 | - def test_error_on_empty_prompts(self) -> None: | ||
| 107 | - probe, spec = build_probe({"name": "dk", "kind": "delta_kl", "prompts": []}) | ||
| 108 | - ctx = RunContext(backend=_identical_backend()) | ||
| 109 | - result = probe.run(spec, ctx) | ||
| 110 | - assert result.verdict == Verdict.ERROR | ||
| 111 | - | ||
| 112 | - def test_kl_kind_available(self) -> None: | ||
| 113 | - probe, spec = build_probe( | ||
| 114 | - { | ||
| 115 | - "name": "dk", | ||
| 116 | - "kind": "delta_kl", | ||
| 117 | - "prompts": ["q1"], | ||
| 118 | - "divergence": "kl", | ||
| 119 | - "assert_mean_gte": 0.0, | ||
| 120 | - } | ||
| 121 | - ) | ||
| 122 | - ctx = RunContext(backend=_diverging_backend()) | ||
| 123 | - result = probe.run(spec, ctx) | ||
| 124 | - assert result.evidence["divergence_kind"] == "kl" | ||
sway/tests/unit/test_probe_leakage.pydeleted@@ -1,109 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.leakage`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 6 | -from dlm_sway.core.result import Verdict | ||
| 7 | -from dlm_sway.core.sections import Section | ||
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 9 | -from dlm_sway.probes.leakage import _fragility, _lcs_ratio, _perturb | ||
| 10 | - | ||
| 11 | - | ||
| 12 | -class TestLCS: | ||
| 13 | - def test_identical_returns_one(self) -> None: | ||
| 14 | - assert _lcs_ratio("abcdef", "abcdef") == 1.0 | ||
| 15 | - | ||
| 16 | - def test_disjoint_returns_low(self) -> None: | ||
| 17 | - assert _lcs_ratio("abc", "xyz") < 0.3 | ||
| 18 | - | ||
| 19 | - def test_empty_returns_zero(self) -> None: | ||
| 20 | - assert _lcs_ratio("", "abc") == 0.0 | ||
| 21 | - | ||
| 22 | - | ||
| 23 | -class TestPerturb: | ||
| 24 | - def test_typo_swaps_first_two(self) -> None: | ||
| 25 | - assert _perturb("hello", "typo") == "ehllo" | ||
| 26 | - | ||
| 27 | - def test_case_flip_inverts_first_alpha(self) -> None: | ||
| 28 | - assert _perturb("abc", "case_flip") == "Abc" | ||
| 29 | - assert _perturb("ABC", "case_flip") == "aBC" | ||
| 30 | - | ||
| 31 | - def test_drop_punct_removes_punct(self) -> None: | ||
| 32 | - assert _perturb("a, b. c!", "drop_punct") == "a b c" | ||
| 33 | - | ||
| 34 | - | ||
| 35 | -class TestFragility: | ||
| 36 | - def test_zero_when_clean_zero(self) -> None: | ||
| 37 | - assert _fragility(0.0, 0.0) == 0.0 | ||
| 38 | - | ||
| 39 | - def test_expected_when_perturbed_dropped(self) -> None: | ||
| 40 | - import pytest | ||
| 41 | - | ||
| 42 | - assert _fragility(0.8, 0.2) == pytest.approx(0.75) | ||
| 43 | - | ||
| 44 | - | ||
| 45 | -def _prose_section(sid: str, content: str) -> Section: | ||
| 46 | - return Section(id=sid, kind="prose", content=content) | ||
| 47 | - | ||
| 48 | - | ||
| 49 | -def _backend(*, ft_recall: float, ft_perturbed_recall: float) -> DummyDifferentialBackend: | ||
| 50 | - """Build a backend whose ft generate() returns a controlled prefix of ``target``. | ||
| 51 | - | ||
| 52 | - The target is "aaa..." (200 chars) so we can measure LCS ratio | ||
| 53 | - against it deterministically. | ||
| 54 | - """ | ||
| 55 | - content = ("The capital of France is Paris. " * 30).strip() | ||
| 56 | - # Generate a fraction of the target to hit the desired recall. | ||
| 57 | - target = content[128 : 128 + 256] | ||
| 58 | - ft_full = target[: int(ft_recall * len(target))] | ||
| 59 | - ft_pert = target[: int(ft_perturbed_recall * len(target))] | ||
| 60 | - | ||
| 61 | - base = DummyResponses() | ||
| 62 | - ft = DummyResponses( | ||
| 63 | - generations={ | ||
| 64 | - content[:128]: ft_full, | ||
| 65 | - # perturbations of the first 128 chars hit these three: | ||
| 66 | - **{_perturb(content[:128], p): ft_pert for p in ("typo", "case_flip", "drop_punct")}, | ||
| 67 | - } | ||
| 68 | - ) | ||
| 69 | - return DummyDifferentialBackend(base=base, ft=ft), content | ||
| 70 | - | ||
| 71 | - | ||
| 72 | -class TestProbe: | ||
| 73 | - def test_skip_without_sections(self) -> None: | ||
| 74 | - backend, _ = _backend(ft_recall=0.0, ft_perturbed_recall=0.0) | ||
| 75 | - probe, spec = build_probe({"name": "c3", "kind": "leakage"}) | ||
| 76 | - ctx = RunContext(backend=backend) | ||
| 77 | - result = probe.run(spec, ctx) | ||
| 78 | - assert result.verdict == Verdict.SKIP | ||
| 79 | - | ||
| 80 | - def test_pass_when_no_leak(self) -> None: | ||
| 81 | - backend, content = _backend(ft_recall=0.0, ft_perturbed_recall=0.0) | ||
| 82 | - probe, spec = build_probe( | ||
| 83 | - { | ||
| 84 | - "name": "c3", | ||
| 85 | - "kind": "leakage", | ||
| 86 | - "prefix_chars": 128, | ||
| 87 | - "continuation_chars": 256, | ||
| 88 | - } | ||
| 89 | - ) | ||
| 90 | - ctx = RunContext(backend=backend, sections=(_prose_section("a", content),)) | ||
| 91 | - result = probe.run(spec, ctx) | ||
| 92 | - assert result.verdict == Verdict.PASS | ||
| 93 | - | ||
| 94 | - def test_fail_when_strong_low_fragility_leak(self) -> None: | ||
| 95 | - backend, content = _backend(ft_recall=0.95, ft_perturbed_recall=0.9) | ||
| 96 | - probe, spec = build_probe( | ||
| 97 | - { | ||
| 98 | - "name": "c3", | ||
| 99 | - "kind": "leakage", | ||
| 100 | - "prefix_chars": 128, | ||
| 101 | - "continuation_chars": 256, | ||
| 102 | - "assert_recall_lt": 0.5, | ||
| 103 | - "min_fragility": 0.3, | ||
| 104 | - } | ||
| 105 | - ) | ||
| 106 | - ctx = RunContext(backend=backend, sections=(_prose_section("a", content),)) | ||
| 107 | - result = probe.run(spec, ctx) | ||
| 108 | - # High recall + low fragility → fail. | ||
| 109 | - assert result.verdict == Verdict.FAIL | ||
sway/tests/unit/test_probe_paraphrase_invariance.pydeleted@@ -1,91 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.paraphrase_invariance`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 6 | -from dlm_sway.core.result import Verdict | ||
| 7 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 8 | - | ||
| 9 | - | ||
| 10 | -def _backend(*, par_lift_fraction: float, verb_lift: float = 10.0) -> DummyDifferentialBackend: | ||
| 11 | - """Return a backend with tunable verbatim/paraphrase lifts. | ||
| 12 | - | ||
| 13 | - The ft view adds ``verb_lift`` nats to the verbatim (Q,A) logprob | ||
| 14 | - and ``par_lift_fraction * verb_lift`` to paraphrase logprobs. | ||
| 15 | - """ | ||
| 16 | - base = DummyResponses( | ||
| 17 | - logprobs={ | ||
| 18 | - ("Q", "A"): -20.0, | ||
| 19 | - ("Q_par1", "A"): -20.0, | ||
| 20 | - ("Q_par2", "A"): -20.0, | ||
| 21 | - } | ||
| 22 | - ) | ||
| 23 | - ft = DummyResponses( | ||
| 24 | - logprobs={ | ||
| 25 | - ("Q", "A"): -20.0 + verb_lift, | ||
| 26 | - ("Q_par1", "A"): -20.0 + par_lift_fraction * verb_lift, | ||
| 27 | - ("Q_par2", "A"): -20.0 + par_lift_fraction * verb_lift, | ||
| 28 | - } | ||
| 29 | - ) | ||
| 30 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -def test_pass_when_generalizing() -> None: | ||
| 34 | - # High paraphrase lift + high verbatim → healthy generalization. | ||
| 35 | - backend = _backend(par_lift_fraction=0.9) | ||
| 36 | - probe, spec = build_probe( | ||
| 37 | - { | ||
| 38 | - "name": "pi", | ||
| 39 | - "kind": "paraphrase_invariance", | ||
| 40 | - "intent": "generalize", | ||
| 41 | - "min_verbatim_lift": 0.05, | ||
| 42 | - "min_generalization_ratio": 0.5, | ||
| 43 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1", "Q_par2"]}], | ||
| 44 | - } | ||
| 45 | - ) | ||
| 46 | - ctx = RunContext(backend=backend) | ||
| 47 | - result = probe.run(spec, ctx) | ||
| 48 | - assert result.verdict == Verdict.PASS | ||
| 49 | - assert result.raw is not None | ||
| 50 | - assert result.raw >= 0.5 | ||
| 51 | - | ||
| 52 | - | ||
| 53 | -def test_fails_when_only_memorized_but_intent_generalize() -> None: | ||
| 54 | - backend = _backend(par_lift_fraction=0.0) | ||
| 55 | - probe, spec = build_probe( | ||
| 56 | - { | ||
| 57 | - "name": "pi", | ||
| 58 | - "kind": "paraphrase_invariance", | ||
| 59 | - "intent": "generalize", | ||
| 60 | - "min_verbatim_lift": 0.05, | ||
| 61 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}], | ||
| 62 | - } | ||
| 63 | - ) | ||
| 64 | - ctx = RunContext(backend=backend) | ||
| 65 | - result = probe.run(spec, ctx) | ||
| 66 | - assert result.verdict == Verdict.FAIL | ||
| 67 | - | ||
| 68 | - | ||
| 69 | -def test_passes_memorize_intent_when_only_memorized() -> None: | ||
| 70 | - backend = _backend(par_lift_fraction=0.0) | ||
| 71 | - probe, spec = build_probe( | ||
| 72 | - { | ||
| 73 | - "name": "pi", | ||
| 74 | - "kind": "paraphrase_invariance", | ||
| 75 | - "intent": "memorize", | ||
| 76 | - "min_verbatim_lift": 0.05, | ||
| 77 | - "max_generalization_ratio_if_memorize": 0.3, | ||
| 78 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}], | ||
| 79 | - } | ||
| 80 | - ) | ||
| 81 | - ctx = RunContext(backend=backend) | ||
| 82 | - result = probe.run(spec, ctx) | ||
| 83 | - assert result.verdict == Verdict.PASS | ||
| 84 | - | ||
| 85 | - | ||
| 86 | -def test_error_on_empty_cases() -> None: | ||
| 87 | - probe, spec = build_probe({"name": "pi", "kind": "paraphrase_invariance", "cases": []}) | ||
| 88 | - backend = _backend(par_lift_fraction=0.9) | ||
| 89 | - ctx = RunContext(backend=backend) | ||
| 90 | - result = probe.run(spec, ctx) | ||
| 91 | - assert result.verdict == Verdict.ERROR | ||
sway/tests/unit/test_probe_preference_flip.pydeleted@@ -1,161 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.preference_flip`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 6 | -from dlm_sway.core.result import Verdict | ||
| 7 | -from dlm_sway.core.sections import Section, SectionPreference | ||
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 9 | - | ||
| 10 | - | ||
| 11 | -def _backend(pairs: list[tuple[str, str, str, float, float]]) -> DummyDifferentialBackend: | ||
| 12 | - """``pairs`` = list of (prompt, chosen, rejected, base_margin, ft_margin). | ||
| 13 | - | ||
| 14 | - We distribute the margin half to the chosen and half (negative) to | ||
| 15 | - the rejected, which is enough to make logprob_of(chosen)-logprob_of(rejected) | ||
| 16 | - equal the requested margin. | ||
| 17 | - """ | ||
| 18 | - base_lp: dict[tuple[str, str], float] = {} | ||
| 19 | - ft_lp: dict[tuple[str, str], float] = {} | ||
| 20 | - for prompt, chosen, rejected, base_m, ft_m in pairs: | ||
| 21 | - base_lp[(prompt, chosen)] = base_m / 2 | ||
| 22 | - base_lp[(prompt, rejected)] = -base_m / 2 | ||
| 23 | - ft_lp[(prompt, chosen)] = ft_m / 2 | ||
| 24 | - ft_lp[(prompt, rejected)] = -ft_m / 2 | ||
| 25 | - return DummyDifferentialBackend( | ||
| 26 | - base=DummyResponses(logprobs=base_lp), | ||
| 27 | - ft=DummyResponses(logprobs=ft_lp), | ||
| 28 | - ) | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -def test_pass_when_base_wrong_flipped() -> None: | ||
| 32 | - backend = _backend( | ||
| 33 | - [ | ||
| 34 | - ("p1", "good1", "bad1", -2.0, 2.0), # base wrong, ft flips | ||
| 35 | - ("p2", "good2", "bad2", -1.5, 1.0), # base wrong, ft flips | ||
| 36 | - ("p3", "good3", "bad3", -0.5, 0.8), # base wrong, ft flips | ||
| 37 | - ("p4", "good4", "bad4", 1.0, 2.0), # base already right (no contribution) | ||
| 38 | - ] | ||
| 39 | - ) | ||
| 40 | - triples = [ | ||
| 41 | - {"prompt": p, "chosen": c, "rejected": r} | ||
| 42 | - for (p, c, r, _, _) in [ | ||
| 43 | - ("p1", "good1", "bad1", 0, 0), | ||
| 44 | - ("p2", "good2", "bad2", 0, 0), | ||
| 45 | - ("p3", "good3", "bad3", 0, 0), | ||
| 46 | - ("p4", "good4", "bad4", 0, 0), | ||
| 47 | - ] | ||
| 48 | - ] | ||
| 49 | - probe, spec = build_probe( | ||
| 50 | - { | ||
| 51 | - "name": "pf", | ||
| 52 | - "kind": "preference_flip", | ||
| 53 | - "triples": triples, | ||
| 54 | - "assert_flip_rate_gte": 0.7, | ||
| 55 | - "min_triples_for_decision": 3, | ||
| 56 | - } | ||
| 57 | - ) | ||
| 58 | - ctx = RunContext(backend=backend) | ||
| 59 | - result = probe.run(spec, ctx) | ||
| 60 | - assert result.verdict == Verdict.PASS | ||
| 61 | - assert result.raw == 1.0 # 3/3 flipped | ||
| 62 | - | ||
| 63 | - | ||
| 64 | -def test_fail_when_base_wrong_not_flipped() -> None: | ||
| 65 | - backend = _backend( | ||
| 66 | - [ | ||
| 67 | - ("p1", "good1", "bad1", -2.0, -1.5), # base wrong, ft still wrong | ||
| 68 | - ("p2", "good2", "bad2", -1.5, -1.0), # base wrong, ft still wrong | ||
| 69 | - ("p3", "good3", "bad3", -0.5, 0.8), # base wrong, ft flips | ||
| 70 | - ] | ||
| 71 | - ) | ||
| 72 | - triples = [ | ||
| 73 | - {"prompt": p, "chosen": c, "rejected": r} | ||
| 74 | - for p, c, r in [ | ||
| 75 | - ("p1", "good1", "bad1"), | ||
| 76 | - ("p2", "good2", "bad2"), | ||
| 77 | - ("p3", "good3", "bad3"), | ||
| 78 | - ] | ||
| 79 | - ] | ||
| 80 | - probe, spec = build_probe( | ||
| 81 | - { | ||
| 82 | - "name": "pf", | ||
| 83 | - "kind": "preference_flip", | ||
| 84 | - "triples": triples, | ||
| 85 | - "assert_flip_rate_gte": 0.7, | ||
| 86 | - "min_triples_for_decision": 3, | ||
| 87 | - } | ||
| 88 | - ) | ||
| 89 | - ctx = RunContext(backend=backend) | ||
| 90 | - result = probe.run(spec, ctx) | ||
| 91 | - assert result.verdict == Verdict.FAIL | ||
| 92 | - assert result.raw is not None | ||
| 93 | - assert result.raw < 0.7 | ||
| 94 | - | ||
| 95 | - | ||
| 96 | -def test_skip_when_no_triples_anywhere() -> None: | ||
| 97 | - probe, spec = build_probe({"name": "pf", "kind": "preference_flip"}) | ||
| 98 | - backend = _backend([]) | ||
| 99 | - ctx = RunContext(backend=backend) | ||
| 100 | - result = probe.run(spec, ctx) | ||
| 101 | - assert result.verdict == Verdict.SKIP | ||
| 102 | - | ||
| 103 | - | ||
| 104 | -def test_warn_when_too_few_base_wrong() -> None: | ||
| 105 | - backend = _backend( | ||
| 106 | - [ | ||
| 107 | - ("p1", "good1", "bad1", 1.0, 2.0), # base right | ||
| 108 | - ("p2", "good2", "bad2", 0.5, 1.0), # base right | ||
| 109 | - ("p3", "good3", "bad3", -0.5, 0.5), # base wrong | ||
| 110 | - ] | ||
| 111 | - ) | ||
| 112 | - triples = [ | ||
| 113 | - {"prompt": p, "chosen": c, "rejected": r} | ||
| 114 | - for p, c, r in [ | ||
| 115 | - ("p1", "good1", "bad1"), | ||
| 116 | - ("p2", "good2", "bad2"), | ||
| 117 | - ("p3", "good3", "bad3"), | ||
| 118 | - ] | ||
| 119 | - ] | ||
| 120 | - probe, spec = build_probe( | ||
| 121 | - { | ||
| 122 | - "name": "pf", | ||
| 123 | - "kind": "preference_flip", | ||
| 124 | - "triples": triples, | ||
| 125 | - "min_triples_for_decision": 3, | ||
| 126 | - } | ||
| 127 | - ) | ||
| 128 | - ctx = RunContext(backend=backend) | ||
| 129 | - result = probe.run(spec, ctx) | ||
| 130 | - assert result.verdict == Verdict.WARN | ||
| 131 | - | ||
| 132 | - | ||
| 133 | -def test_triples_pulled_from_sections() -> None: | ||
| 134 | - pref_section = Section( | ||
| 135 | - id="p1", | ||
| 136 | - kind="preference", | ||
| 137 | - content="...", | ||
| 138 | - preferences=( | ||
| 139 | - SectionPreference(prompt="q1", chosen="good", rejected="bad"), | ||
| 140 | - SectionPreference(prompt="q2", chosen="good2", rejected="bad2"), | ||
| 141 | - SectionPreference(prompt="q3", chosen="good3", rejected="bad3"), | ||
| 142 | - ), | ||
| 143 | - ) | ||
| 144 | - backend = _backend( | ||
| 145 | - [ | ||
| 146 | - ("q1", "good", "bad", -1.0, 1.0), | ||
| 147 | - ("q2", "good2", "bad2", -1.0, 1.0), | ||
| 148 | - ("q3", "good3", "bad3", -1.0, 1.0), | ||
| 149 | - ] | ||
| 150 | - ) | ||
| 151 | - probe, spec = build_probe( | ||
| 152 | - { | ||
| 153 | - "name": "pf", | ||
| 154 | - "kind": "preference_flip", | ||
| 155 | - "assert_flip_rate_gte": 0.7, | ||
| 156 | - "min_triples_for_decision": 3, | ||
| 157 | - } | ||
| 158 | - ) | ||
| 159 | - ctx = RunContext(backend=backend, sections=(pref_section,)) | ||
| 160 | - result = probe.run(spec, ctx) | ||
| 161 | - assert result.verdict == Verdict.PASS | ||
sway/tests/unit/test_probe_prompt_collapse.pydeleted@@ -1,137 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.prompt_collapse`. | ||
| 2 | - | ||
| 3 | -Uses a programmable dummy backend that serves different token dists | ||
| 4 | -depending on whether the prompt contains the stuffing prefix. That's the | ||
| 5 | -cleanest way to simulate "divergence decays with context length" without | ||
| 6 | -a real model. | ||
| 7 | -""" | ||
| 8 | - | ||
| 9 | -from __future__ import annotations | ||
| 10 | - | ||
| 11 | -import numpy as np | ||
| 12 | - | ||
| 13 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 14 | -from dlm_sway.core.result import Verdict | ||
| 15 | -from dlm_sway.core.scoring import TokenDist | ||
| 16 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 17 | -from dlm_sway.probes.prompt_collapse import _fit_half_life | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -class TestFitHalfLife: | ||
| 21 | - def test_exponential_recovered(self) -> None: | ||
| 22 | - lengths = np.array([0.0, 100.0, 200.0, 300.0]) | ||
| 23 | - # y = 1.0 * exp(-x / 100) | ||
| 24 | - y = np.exp(-lengths / 100.0) | ||
| 25 | - h = _fit_half_life(lengths, y) | ||
| 26 | - assert h is not None | ||
| 27 | - import math | ||
| 28 | - | ||
| 29 | - # True half-life = ln(2) * 100 ≈ 69.3 | ||
| 30 | - assert abs(h - math.log(2.0) * 100.0) < 1e-6 | ||
| 31 | - | ||
| 32 | - def test_returns_none_for_flat(self) -> None: | ||
| 33 | - lengths = np.array([0.0, 100.0, 200.0]) | ||
| 34 | - y = np.array([1e-10, 1e-10, 1e-10]) | ||
| 35 | - assert _fit_half_life(lengths, y) is not None or _fit_half_life(lengths, y) is None | ||
| 36 | - # Either None or a huge half-life — both acceptable for flat input. | ||
| 37 | - | ||
| 38 | - def test_returns_none_for_increasing(self) -> None: | ||
| 39 | - lengths = np.array([0.0, 100.0, 200.0]) | ||
| 40 | - y = np.array([0.1, 0.3, 0.5]) | ||
| 41 | - assert _fit_half_life(lengths, y) is None | ||
| 42 | - | ||
| 43 | - | ||
| 44 | -def _programmed_backend(stuffing_sensitivity: float) -> DummyDifferentialBackend: | ||
| 45 | - """Return a backend whose divergence decays with prompt length. | ||
| 46 | - | ||
| 47 | - ``stuffing_sensitivity`` controls how quickly the ft distribution | ||
| 48 | - snaps back to base as prompt length grows; lower = healthier adapter. | ||
| 49 | - """ | ||
| 50 | - import numpy as np | ||
| 51 | - | ||
| 52 | - base_probs = np.array([0.5, 0.3, 0.2], dtype=np.float32) | ||
| 53 | - | ||
| 54 | - class _StuffedResponses(DummyResponses): | ||
| 55 | - def __init__(self, is_ft: bool): | ||
| 56 | - super().__init__() | ||
| 57 | - self._is_ft = is_ft | ||
| 58 | - | ||
| 59 | - # Override retrieval by subclassing the view's lookup path. | ||
| 60 | - | ||
| 61 | - # Simpler: use explicit prompts at each expected length to seed the dict. | ||
| 62 | - # The probe prefixes stuffing so the dummy sees the exact final prompt. | ||
| 63 | - # We pre-build dists for each prompt we expect to see. | ||
| 64 | - base = DummyResponses() | ||
| 65 | - ft = DummyResponses() | ||
| 66 | - | ||
| 67 | - # Pre-generate prompts the probe will query. The probe uses default | ||
| 68 | - # context_lengths=[0,256,512,1024] times _STUFFING ~4 chars/tok. | ||
| 69 | - from dlm_sway.probes.prompt_collapse import _stuffing | ||
| 70 | - | ||
| 71 | - for ctx_len in (0, 256, 512, 1024): | ||
| 72 | - prefix = _stuffing(ctx_len) | ||
| 73 | - for prompt in ("q1",): | ||
| 74 | - key = prefix + prompt | ||
| 75 | - # Base: always tight on token 1. | ||
| 76 | - base.token_dists[key] = TokenDist( | ||
| 77 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 78 | - logprobs=np.log(base_probs), | ||
| 79 | - vocab_size=100, | ||
| 80 | - ) | ||
| 81 | - # FT: diverges at ctx=0, decays toward base with length. | ||
| 82 | - decay = np.exp(-ctx_len * stuffing_sensitivity) | ||
| 83 | - ft_probs = base_probs * (1.0 - decay) + np.array([0.1, 0.45, 0.45]) * decay | ||
| 84 | - ft_probs = ft_probs / ft_probs.sum() | ||
| 85 | - ft.token_dists[key] = TokenDist( | ||
| 86 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 87 | - logprobs=np.log(ft_probs.astype(np.float32)), | ||
| 88 | - vocab_size=100, | ||
| 89 | - ) | ||
| 90 | - return DummyDifferentialBackend(base=base, ft=ft) | ||
| 91 | - | ||
| 92 | - | ||
| 93 | -class TestPromptCollapse: | ||
| 94 | - def test_healthy_adapter_passes(self) -> None: | ||
| 95 | - probe, spec = build_probe( | ||
| 96 | - { | ||
| 97 | - "name": "pc", | ||
| 98 | - "kind": "prompt_collapse", | ||
| 99 | - "prompts": ["q1"], | ||
| 100 | - "context_lengths": [0, 256, 512, 1024], | ||
| 101 | - "assert_half_life_tokens": 100, | ||
| 102 | - } | ||
| 103 | - ) | ||
| 104 | - ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.001)) | ||
| 105 | - result = probe.run(spec, ctx) | ||
| 106 | - # Half-life should be well above 100 with slow decay. | ||
| 107 | - assert result.verdict == Verdict.PASS | ||
| 108 | - assert result.raw is not None | ||
| 109 | - assert result.raw > 100 | ||
| 110 | - | ||
| 111 | - def test_collapsing_adapter_fails(self) -> None: | ||
| 112 | - probe, spec = build_probe( | ||
| 113 | - { | ||
| 114 | - "name": "pc", | ||
| 115 | - "kind": "prompt_collapse", | ||
| 116 | - "prompts": ["q1"], | ||
| 117 | - "context_lengths": [0, 256, 512, 1024], | ||
| 118 | - "assert_half_life_tokens": 500, | ||
| 119 | - } | ||
| 120 | - ) | ||
| 121 | - ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.02)) | ||
| 122 | - result = probe.run(spec, ctx) | ||
| 123 | - # Fast decay → short half-life → fail against 500-token threshold. | ||
| 124 | - assert result.verdict == Verdict.FAIL | ||
| 125 | - | ||
| 126 | - def test_error_on_empty_prompts(self) -> None: | ||
| 127 | - probe, spec = build_probe( | ||
| 128 | - { | ||
| 129 | - "name": "pc", | ||
| 130 | - "kind": "prompt_collapse", | ||
| 131 | - "prompts": [], | ||
| 132 | - "context_lengths": [0, 256], | ||
| 133 | - } | ||
| 134 | - ) | ||
| 135 | - ctx = RunContext(backend=_programmed_backend(0.001)) | ||
| 136 | - result = probe.run(spec, ctx) | ||
| 137 | - assert result.verdict == Verdict.ERROR | ||
sway/tests/unit/test_probe_section_internalization.pydeleted@@ -1,94 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.section_internalization` (the flagship B1).""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import numpy as np | ||
| 6 | - | ||
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 8 | -from dlm_sway.core.result import Verdict | ||
| 9 | -from dlm_sway.core.scoring import RollingLogprob | ||
| 10 | -from dlm_sway.core.sections import Section, SectionProbe | ||
| 11 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 12 | - | ||
| 13 | - | ||
| 14 | -def _rolling(mean_lp: float, n: int = 10) -> RollingLogprob: | ||
| 15 | - lp = np.full(n - 1, mean_lp, dtype=np.float32) | ||
| 16 | - return RollingLogprob( | ||
| 17 | - token_ids=np.arange(n, dtype=np.int64), | ||
| 18 | - logprobs=lp, | ||
| 19 | - num_tokens=n, | ||
| 20 | - total_logprob=float(lp.sum()), | ||
| 21 | - ) | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -def _section(sid: str, kind: str = "prose", content: str = "content", probes=()) -> Section: | ||
| 25 | - return Section(id=sid, kind=kind, content=content, probes=tuple(probes)) # type: ignore[arg-type] | ||
| 26 | - | ||
| 27 | - | ||
| 28 | -def test_skip_without_sections() -> None: | ||
| 29 | - probe, spec = build_probe({"name": "sis", "kind": "section_internalization"}) | ||
| 30 | - backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | ||
| 31 | - ctx = RunContext(backend=backend) | ||
| 32 | - result = probe.run(spec, ctx) | ||
| 33 | - assert result.verdict == Verdict.SKIP | ||
| 34 | - | ||
| 35 | - | ||
| 36 | -def test_skip_with_single_section() -> None: | ||
| 37 | - probe, spec = build_probe({"name": "sis", "kind": "section_internalization"}) | ||
| 38 | - backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | ||
| 39 | - ctx = RunContext(backend=backend, sections=(_section("a"),)) | ||
| 40 | - result = probe.run(spec, ctx) | ||
| 41 | - assert result.verdict == Verdict.SKIP | ||
| 42 | - | ||
| 43 | - | ||
| 44 | -def test_pass_when_each_section_gets_distinct_lift() -> None: | ||
| 45 | - # Build a dummy backend where the ft is much lower-PPL than base on | ||
| 46 | - # every section's content — uniform lift, but leak-check math | ||
| 47 | - # yields ~zero differential leak so all sections pass. | ||
| 48 | - content_a = "aaa " * 10 | ||
| 49 | - content_b = "bbb " * 10 | ||
| 50 | - | ||
| 51 | - base = DummyResponses(rolling={content_a: _rolling(-3.0), content_b: _rolling(-3.0)}) | ||
| 52 | - ft = DummyResponses(rolling={content_a: _rolling(-1.0), content_b: _rolling(-2.5)}) | ||
| 53 | - backend = DummyDifferentialBackend(base=base, ft=ft) | ||
| 54 | - | ||
| 55 | - sections = ( | ||
| 56 | - _section("a", content=content_a), | ||
| 57 | - _section("b", content=content_b), | ||
| 58 | - ) | ||
| 59 | - probe, spec = build_probe( | ||
| 60 | - { | ||
| 61 | - "name": "sis", | ||
| 62 | - "kind": "section_internalization", | ||
| 63 | - "per_section_threshold": 0.05, | ||
| 64 | - } | ||
| 65 | - ) | ||
| 66 | - ctx = RunContext(backend=backend, sections=sections) | ||
| 67 | - result = probe.run(spec, ctx) | ||
| 68 | - assert result.verdict in (Verdict.PASS, Verdict.FAIL) | ||
| 69 | - assert "per_section" in result.evidence | ||
| 70 | - assert len(result.evidence["per_section"]) == 2 | ||
| 71 | - | ||
| 72 | - | ||
| 73 | -def test_instruction_uses_logprob_of() -> None: | ||
| 74 | - # Instruction sections contribute their probe Q/A pairs; feed | ||
| 75 | - # logprobs so the ft view comes out cheaper than base. | ||
| 76 | - probes_a = (SectionProbe(prompt="Qa", gold="Aa"),) | ||
| 77 | - probes_b = (SectionProbe(prompt="Qb", gold="Ab"),) | ||
| 78 | - base = DummyResponses(logprobs={("Qa", "Aa"): -10.0, ("Qb", "Ab"): -10.0}) | ||
| 79 | - ft = DummyResponses(logprobs={("Qa", "Aa"): -3.0, ("Qb", "Ab"): -8.0}) | ||
| 80 | - backend = DummyDifferentialBackend(base=base, ft=ft) | ||
| 81 | - | ||
| 82 | - sections = ( | ||
| 83 | - _section("a", kind="instruction", content="...", probes=probes_a), | ||
| 84 | - _section("b", kind="instruction", content="...", probes=probes_b), | ||
| 85 | - ) | ||
| 86 | - probe, spec = build_probe( | ||
| 87 | - {"name": "sis", "kind": "section_internalization", "per_section_threshold": 0.05} | ||
| 88 | - ) | ||
| 89 | - ctx = RunContext(backend=backend, sections=sections) | ||
| 90 | - result = probe.run(spec, ctx) | ||
| 91 | - per = result.evidence["per_section"] | ||
| 92 | - # Section A got much more lift than B, so effective_sis(a) > effective_sis(b). | ||
| 93 | - sis_by_id = {row["section_id"]: row["effective_sis"] for row in per} | ||
| 94 | - assert sis_by_id["a"] > sis_by_id["b"] | ||
sway/tests/unit/test_probe_style_fingerprint.pydeleted@@ -1,115 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.probes.style_fingerprint`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import numpy as np | ||
| 6 | - | ||
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 8 | -from dlm_sway.core.result import Verdict | ||
| 9 | -from dlm_sway.probes.base import RunContext, build_probe | ||
| 10 | -from dlm_sway.probes.style_fingerprint import fingerprint | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -class TestFingerprint: | ||
| 14 | - def test_zero_vector_for_empty(self) -> None: | ||
| 15 | - fp = fingerprint("") | ||
| 16 | - assert fp.shape == (6,) | ||
| 17 | - assert np.allclose(fp, 0.0) | ||
| 18 | - | ||
| 19 | - def test_non_zero_for_normal_text(self) -> None: | ||
| 20 | - fp = fingerprint("This is a sentence. This is another one. A third.") | ||
| 21 | - assert fp.shape == (6,) | ||
| 22 | - assert fp[0] > 0 # mean sentence length | ||
| 23 | - assert fp[2] > 0 # TTR | ||
| 24 | - assert fp[3] > 0 # avg word length | ||
| 25 | - | ||
| 26 | - def test_distinct_styles_distinct_fingerprints(self) -> None: | ||
| 27 | - terse = "Go. Now. Quick." | ||
| 28 | - verbose = ( | ||
| 29 | - "We must, with all deliberate speed and measured consideration, " | ||
| 30 | - "proceed expeditiously towards the elaborated and carefully " | ||
| 31 | - "constructed resolution of the foregoing matter." | ||
| 32 | - ) | ||
| 33 | - assert not np.allclose(fingerprint(terse), fingerprint(verbose)) | ||
| 34 | - | ||
| 35 | - | ||
| 36 | -def _backend_with_samples(base: list[str], ft: list[str]) -> DummyDifferentialBackend: | ||
| 37 | - return DummyDifferentialBackend( | ||
| 38 | - base=DummyResponses(generations={f"p{i}": s for i, s in enumerate(base)}), | ||
| 39 | - ft=DummyResponses(generations={f"p{i}": s for i, s in enumerate(ft)}), | ||
| 40 | - ) | ||
| 41 | - | ||
| 42 | - | ||
| 43 | -class TestProbe: | ||
| 44 | - def test_pass_when_ft_drifts_toward_doc(self) -> None: | ||
| 45 | - base_samples = ["Short. Plain. Words."] * 2 | ||
| 46 | - ft_samples = [ | ||
| 47 | - "Wherein many clauses conjoin themselves, through extended " | ||
| 48 | - "ruminations, unto a meandering whole of considerable length." | ||
| 49 | - ] * 2 | ||
| 50 | - doc = ( | ||
| 51 | - "Wherein many clauses conjoin themselves, through extended " | ||
| 52 | - "ruminations, unto a meandering whole of considerable length. " | ||
| 53 | - "Further elaboration, no less copious, follows apace." | ||
| 54 | - ) | ||
| 55 | - backend = _backend_with_samples(base_samples, ft_samples) | ||
| 56 | - probe, spec = build_probe( | ||
| 57 | - { | ||
| 58 | - "name": "c1", | ||
| 59 | - "kind": "style_fingerprint", | ||
| 60 | - "prompts": ["p0", "p1"], | ||
| 61 | - "doc_reference": doc, | ||
| 62 | - "max_new_tokens": 32, | ||
| 63 | - "assert_shift_gte": 0.2, | ||
| 64 | - } | ||
| 65 | - ) | ||
| 66 | - ctx = RunContext(backend=backend) | ||
| 67 | - result = probe.run(spec, ctx) | ||
| 68 | - assert result.verdict == Verdict.PASS | ||
| 69 | - assert result.raw is not None | ||
| 70 | - assert result.raw > 0.2 | ||
| 71 | - | ||
| 72 | - def test_fail_when_no_stylistic_shift(self) -> None: | ||
| 73 | - base_samples = ["Short. Plain. Words."] * 2 | ||
| 74 | - ft_samples = ["Short. Plain. Words."] * 2 | ||
| 75 | - doc = "Wherein clauses conjoin into meandering wholes of length." | ||
| 76 | - backend = _backend_with_samples(base_samples, ft_samples) | ||
| 77 | - probe, spec = build_probe( | ||
| 78 | - { | ||
| 79 | - "name": "c1", | ||
| 80 | - "kind": "style_fingerprint", | ||
| 81 | - "prompts": ["p0", "p1"], | ||
| 82 | - "doc_reference": doc, | ||
| 83 | - "assert_shift_gte": 0.25, | ||
| 84 | - } | ||
| 85 | - ) | ||
| 86 | - ctx = RunContext(backend=backend) | ||
| 87 | - result = probe.run(spec, ctx) | ||
| 88 | - assert result.verdict == Verdict.FAIL | ||
| 89 | - | ||
| 90 | - def test_skip_without_doc_reference(self) -> None: | ||
| 91 | - backend = _backend_with_samples(["x"], ["y"]) | ||
| 92 | - probe, spec = build_probe( | ||
| 93 | - { | ||
| 94 | - "name": "c1", | ||
| 95 | - "kind": "style_fingerprint", | ||
| 96 | - "prompts": ["p0"], | ||
| 97 | - } | ||
| 98 | - ) | ||
| 99 | - ctx = RunContext(backend=backend) | ||
| 100 | - result = probe.run(spec, ctx) | ||
| 101 | - assert result.verdict == Verdict.SKIP | ||
| 102 | - | ||
| 103 | - def test_error_on_empty_prompts(self) -> None: | ||
| 104 | - backend = _backend_with_samples([], []) | ||
| 105 | - probe, spec = build_probe( | ||
| 106 | - { | ||
| 107 | - "name": "c1", | ||
| 108 | - "kind": "style_fingerprint", | ||
| 109 | - "prompts": [], | ||
| 110 | - "doc_reference": "doc", | ||
| 111 | - } | ||
| 112 | - ) | ||
| 113 | - ctx = RunContext(backend=backend) | ||
| 114 | - result = probe.run(spec, ctx) | ||
| 115 | - assert result.verdict == Verdict.ERROR | ||
sway/tests/unit/test_result.pydeleted@@ -1,82 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.result`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dataclasses import FrozenInstanceError | ||
| 6 | - | ||
| 7 | -import pytest | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.result import ( | ||
| 10 | - DEFAULT_COMPONENT_WEIGHTS, | ||
| 11 | - ProbeResult, | ||
| 12 | - SuiteResult, | ||
| 13 | - SwayScore, | ||
| 14 | - Verdict, | ||
| 15 | - utcnow, | ||
| 16 | -) | ||
| 17 | - | ||
| 18 | - | ||
| 19 | -class TestVerdict: | ||
| 20 | - def test_is_str_enum(self) -> None: | ||
| 21 | - assert Verdict.PASS.value == "pass" | ||
| 22 | - assert str(Verdict.WARN.value) == "warn" | ||
| 23 | - | ||
| 24 | - def test_all_expected_members(self) -> None: | ||
| 25 | - assert {v.value for v in Verdict} == { | ||
| 26 | - "pass", | ||
| 27 | - "fail", | ||
| 28 | - "warn", | ||
| 29 | - "skip", | ||
| 30 | - "error", | ||
| 31 | - } | ||
| 32 | - | ||
| 33 | - | ||
| 34 | -class TestProbeResult: | ||
| 35 | - def test_minimum_construction(self) -> None: | ||
| 36 | - r = ProbeResult(name="t", kind="delta_kl", verdict=Verdict.PASS, score=0.82) | ||
| 37 | - assert r.raw is None | ||
| 38 | - assert r.evidence == {} | ||
| 39 | - assert r.message == "" | ||
| 40 | - assert r.duration_s == 0.0 | ||
| 41 | - | ||
| 42 | - def test_frozen(self) -> None: | ||
| 43 | - r = ProbeResult(name="t", kind="t", verdict=Verdict.PASS, score=0.5) | ||
| 44 | - with pytest.raises(FrozenInstanceError): | ||
| 45 | - r.score = 0.6 # type: ignore[misc] | ||
| 46 | - | ||
| 47 | - | ||
| 48 | -class TestSuiteResult: | ||
| 49 | - def test_wall_seconds(self) -> None: | ||
| 50 | - from datetime import timedelta | ||
| 51 | - | ||
| 52 | - started = utcnow() | ||
| 53 | - finished = started + timedelta(seconds=2, milliseconds=500) | ||
| 54 | - result = SuiteResult( | ||
| 55 | - spec_path="sway.yaml", | ||
| 56 | - started_at=started, | ||
| 57 | - finished_at=finished, | ||
| 58 | - base_model_id="b", | ||
| 59 | - adapter_id="a", | ||
| 60 | - sway_version="0.1.0.dev0", | ||
| 61 | - ) | ||
| 62 | - assert result.wall_seconds == pytest.approx(2.5, abs=1e-6) | ||
| 63 | - | ||
| 64 | - | ||
| 65 | -class TestSwayScore: | ||
| 66 | - def test_default_weights_sum_to_one(self) -> None: | ||
| 67 | - assert abs(sum(DEFAULT_COMPONENT_WEIGHTS.values()) - 1.0) < 1e-9 | ||
| 68 | - | ||
| 69 | - def test_band_boundaries(self) -> None: | ||
| 70 | - assert SwayScore.band_for(0.0) == "noise" | ||
| 71 | - assert SwayScore.band_for(0.29) == "noise" | ||
| 72 | - assert SwayScore.band_for(0.30) == "partial" | ||
| 73 | - assert SwayScore.band_for(0.59) == "partial" | ||
| 74 | - assert SwayScore.band_for(0.60) == "healthy" | ||
| 75 | - assert SwayScore.band_for(0.85) == "healthy" | ||
| 76 | - assert SwayScore.band_for(0.851) == "suspicious" | ||
| 77 | - assert SwayScore.band_for(0.99) == "suspicious" | ||
| 78 | - | ||
| 79 | - | ||
| 80 | -def test_utcnow_is_tz_aware() -> None: | ||
| 81 | - now = utcnow() | ||
| 82 | - assert now.tzinfo is not None | ||
sway/tests/unit/test_scoring.pydeleted@@ -1,84 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.scoring`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import math | ||
| 6 | - | ||
| 7 | -import numpy as np | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.scoring import ( | ||
| 10 | - DifferentialBackend, | ||
| 11 | - RollingLogprob, | ||
| 12 | - ScoringBackend, | ||
| 13 | - TokenDist, | ||
| 14 | -) | ||
| 15 | - | ||
| 16 | - | ||
| 17 | -class TestRollingLogprob: | ||
| 18 | - def test_empty_sequence(self) -> None: | ||
| 19 | - r = RollingLogprob( | ||
| 20 | - token_ids=np.array([42], dtype=np.int64), | ||
| 21 | - logprobs=np.array([], dtype=np.float32), | ||
| 22 | - num_tokens=1, | ||
| 23 | - total_logprob=0.0, | ||
| 24 | - ) | ||
| 25 | - assert r.mean_logprob == 0.0 | ||
| 26 | - assert r.perplexity == 1.0 | ||
| 27 | - | ||
| 28 | - def test_mean_and_perplexity(self) -> None: | ||
| 29 | - # Three tokens, two transition logprobs summing to -4.0 → mean -2.0. | ||
| 30 | - r = RollingLogprob( | ||
| 31 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 32 | - logprobs=np.array([-1.5, -2.5], dtype=np.float32), | ||
| 33 | - num_tokens=3, | ||
| 34 | - total_logprob=-4.0, | ||
| 35 | - ) | ||
| 36 | - assert math.isclose(r.mean_logprob, -2.0, rel_tol=1e-6) | ||
| 37 | - assert math.isclose(r.perplexity, math.exp(2.0), rel_tol=1e-6) | ||
| 38 | - | ||
| 39 | - | ||
| 40 | -class TestTokenDist: | ||
| 41 | - def test_construction_and_defaults(self) -> None: | ||
| 42 | - dist = TokenDist( | ||
| 43 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | ||
| 44 | - logprobs=np.array([-0.1, -1.0, -3.0], dtype=np.float32), | ||
| 45 | - vocab_size=50_257, | ||
| 46 | - ) | ||
| 47 | - assert dist.tail_logprob == 0.0 | ||
| 48 | - assert dist.token_ids.shape == (3,) | ||
| 49 | - | ||
| 50 | - | ||
| 51 | -class TestProtocols: | ||
| 52 | - def test_scoring_backend_runtime_checkable(self) -> None: | ||
| 53 | - class FakeScoring: | ||
| 54 | - def logprob_of(self, prompt: str, completion: str) -> float: | ||
| 55 | - return 0.0 | ||
| 56 | - | ||
| 57 | - def rolling_logprob(self, text: str) -> RollingLogprob: | ||
| 58 | - return RollingLogprob( | ||
| 59 | - token_ids=np.array([0], dtype=np.int64), | ||
| 60 | - logprobs=np.array([], dtype=np.float32), | ||
| 61 | - num_tokens=1, | ||
| 62 | - total_logprob=0.0, | ||
| 63 | - ) | ||
| 64 | - | ||
| 65 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | ||
| 66 | - return TokenDist( | ||
| 67 | - token_ids=np.array([0], dtype=np.int64), | ||
| 68 | - logprobs=np.array([0.0], dtype=np.float32), | ||
| 69 | - vocab_size=1, | ||
| 70 | - ) | ||
| 71 | - | ||
| 72 | - assert isinstance(FakeScoring(), ScoringBackend) | ||
| 73 | - | ||
| 74 | - def test_differential_backend_runtime_checkable(self) -> None: | ||
| 75 | - from contextlib import nullcontext | ||
| 76 | - | ||
| 77 | - class FakeDiff: | ||
| 78 | - def as_base(self): # type: ignore[no-untyped-def] | ||
| 79 | - return nullcontext(object()) | ||
| 80 | - | ||
| 81 | - def as_finetuned(self): # type: ignore[no-untyped-def] | ||
| 82 | - return nullcontext(object()) | ||
| 83 | - | ||
| 84 | - assert isinstance(FakeDiff(), DifferentialBackend) | ||
sway/tests/unit/test_sections.pydeleted@@ -1,35 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.core.sections`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from dlm_sway.core.sections import ( | ||
| 6 | - Section, | ||
| 7 | - SectionPreference, | ||
| 8 | - SectionProbe, | ||
| 9 | - filter_kinds, | ||
| 10 | -) | ||
| 11 | - | ||
| 12 | - | ||
| 13 | -def test_default_field_types() -> None: | ||
| 14 | - s = Section(id="abc", kind="prose", content="hello world") | ||
| 15 | - assert s.probes == () | ||
| 16 | - assert s.preferences == () | ||
| 17 | - assert s.tag is None | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -def test_filter_kinds() -> None: | ||
| 21 | - sections = ( | ||
| 22 | - Section(id="a", kind="prose", content="x"), | ||
| 23 | - Section(id="b", kind="instruction", content="y"), | ||
| 24 | - Section(id="c", kind="preference", content="z"), | ||
| 25 | - ) | ||
| 26 | - only_prose = filter_kinds(sections, ("prose",)) | ||
| 27 | - assert len(only_prose) == 1 | ||
| 28 | - assert only_prose[0].id == "a" | ||
| 29 | - | ||
| 30 | - | ||
| 31 | -def test_section_probe_and_preference() -> None: | ||
| 32 | - p = SectionProbe(prompt="Q", gold="A") | ||
| 33 | - assert p.prompt == "Q" | ||
| 34 | - pref = SectionPreference(prompt="P", chosen="good", rejected="bad") | ||
| 35 | - assert pref.chosen == "good" | ||
sway/tests/unit/test_suite_runner.pydeleted@@ -1,134 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.suite.runner`. | ||
| 2 | - | ||
| 3 | -Uses the dummy backend + ad-hoc probe classes so nothing real is loaded. | ||
| 4 | -""" | ||
| 5 | - | ||
| 6 | -from __future__ import annotations | ||
| 7 | - | ||
| 8 | -from typing import Literal | ||
| 9 | - | ||
| 10 | -import pytest | ||
| 11 | - | ||
| 12 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | ||
| 13 | -from dlm_sway.core.errors import ProbeError | ||
| 14 | -from dlm_sway.core.result import ProbeResult, Verdict | ||
| 15 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 16 | -from dlm_sway.suite.runner import run | ||
| 17 | -from dlm_sway.suite.spec import SwaySpec | ||
| 18 | - | ||
| 19 | - | ||
| 20 | -class _PassSpec(ProbeSpec): | ||
| 21 | - kind: Literal["__runner_pass"] = "__runner_pass" | ||
| 22 | - | ||
| 23 | - | ||
| 24 | -class _PassProbe(Probe): | ||
| 25 | - kind = "__runner_pass" | ||
| 26 | - spec_cls = _PassSpec | ||
| 27 | - category = "adherence" | ||
| 28 | - | ||
| 29 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 30 | - return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.PASS, score=0.9) | ||
| 31 | - | ||
| 32 | - | ||
| 33 | -class _FailSpec(ProbeSpec): | ||
| 34 | - kind: Literal["__runner_fail"] = "__runner_fail" | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class _FailProbe(Probe): | ||
| 38 | - kind = "__runner_fail" | ||
| 39 | - spec_cls = _FailSpec | ||
| 40 | - category = "attribution" | ||
| 41 | - | ||
| 42 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 43 | - return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.FAIL, score=0.1) | ||
| 44 | - | ||
| 45 | - | ||
| 46 | -class _RaiseSpec(ProbeSpec): | ||
| 47 | - kind: Literal["__runner_raise"] = "__runner_raise" | ||
| 48 | - | ||
| 49 | - | ||
| 50 | -class _RaiseProbe(Probe): | ||
| 51 | - kind = "__runner_raise" | ||
| 52 | - spec_cls = _RaiseSpec | ||
| 53 | - | ||
| 54 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 55 | - raise ProbeError(spec.kind, "kaboom") | ||
| 56 | - | ||
| 57 | - | ||
| 58 | -class _UnexpectedSpec(ProbeSpec): | ||
| 59 | - kind: Literal["__runner_unexpected"] = "__runner_unexpected" | ||
| 60 | - | ||
| 61 | - | ||
| 62 | -class _UnexpectedProbe(Probe): | ||
| 63 | - kind = "__runner_unexpected" | ||
| 64 | - spec_cls = _UnexpectedSpec | ||
| 65 | - | ||
| 66 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 67 | - raise ValueError("surprise") | ||
| 68 | - | ||
| 69 | - | ||
| 70 | -@pytest.fixture | ||
| 71 | -def backend() -> DummyDifferentialBackend: | ||
| 72 | - return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | ||
| 73 | - | ||
| 74 | - | ||
| 75 | -def _spec(*entries: dict) -> SwaySpec: | ||
| 76 | - return SwaySpec.model_validate( | ||
| 77 | - { | ||
| 78 | - "version": 1, | ||
| 79 | - "models": { | ||
| 80 | - "base": {"base": "b"}, | ||
| 81 | - "ft": {"base": "b", "adapter": "/tmp/a"}, | ||
| 82 | - }, | ||
| 83 | - "suite": list(entries), | ||
| 84 | - } | ||
| 85 | - ) | ||
| 86 | - | ||
| 87 | - | ||
| 88 | -class TestRunner: | ||
| 89 | - def test_runs_each_probe_in_order(self, backend: DummyDifferentialBackend) -> None: | ||
| 90 | - spec = _spec( | ||
| 91 | - {"name": "p1", "kind": "__runner_pass"}, | ||
| 92 | - {"name": "p2", "kind": "__runner_fail"}, | ||
| 93 | - ) | ||
| 94 | - result = run(spec, backend) | ||
| 95 | - assert [r.name for r in result.probes] == ["p1", "p2"] | ||
| 96 | - assert result.probes[0].verdict == Verdict.PASS | ||
| 97 | - assert result.probes[1].verdict == Verdict.FAIL | ||
| 98 | - | ||
| 99 | - def test_disabled_probe_records_skip(self, backend: DummyDifferentialBackend) -> None: | ||
| 100 | - spec = _spec({"name": "p1", "kind": "__runner_pass", "enabled": False}) | ||
| 101 | - result = run(spec, backend) | ||
| 102 | - assert result.probes[0].verdict == Verdict.SKIP | ||
| 103 | - assert "disabled" in result.probes[0].message | ||
| 104 | - | ||
| 105 | - def test_probeerror_becomes_error_verdict(self, backend: DummyDifferentialBackend) -> None: | ||
| 106 | - spec = _spec({"name": "oops", "kind": "__runner_raise"}) | ||
| 107 | - result = run(spec, backend) | ||
| 108 | - assert result.probes[0].verdict == Verdict.ERROR | ||
| 109 | - assert "kaboom" in result.probes[0].message | ||
| 110 | - | ||
| 111 | - def test_unexpected_exception_becomes_error_verdict( | ||
| 112 | - self, backend: DummyDifferentialBackend | ||
| 113 | - ) -> None: | ||
| 114 | - spec = _spec({"name": "oops", "kind": "__runner_unexpected"}) | ||
| 115 | - result = run(spec, backend) | ||
| 116 | - assert result.probes[0].verdict == Verdict.ERROR | ||
| 117 | - assert "ValueError" in result.probes[0].message | ||
| 118 | - | ||
| 119 | - def test_wall_seconds_populated(self, backend: DummyDifferentialBackend) -> None: | ||
| 120 | - spec = _spec({"name": "p1", "kind": "__runner_pass"}) | ||
| 121 | - result = run(spec, backend) | ||
| 122 | - assert result.wall_seconds >= 0 | ||
| 123 | - assert result.probes[0].duration_s >= 0 | ||
| 124 | - | ||
| 125 | - def test_null_adapter_passes_on_null_calibrated_backend( | ||
| 126 | - self, backend: DummyDifferentialBackend | ||
| 127 | - ) -> None: | ||
| 128 | - # Dummy backend implements NullCalibratedBackend, so calibration runs. | ||
| 129 | - spec = _spec({"name": "null", "kind": "null_adapter", "runs": 2, "prompts": ["q1"]}) | ||
| 130 | - result = run(spec, backend) | ||
| 131 | - assert result.probes[0].kind == "null_adapter" | ||
| 132 | - assert result.probes[0].verdict == Verdict.PASS | ||
| 133 | - # And the suite's null_stats bubbles up onto the result. | ||
| 134 | - assert "delta_kl" in result.null_stats | ||
sway/tests/unit/test_suite_score_report.pydeleted@@ -1,217 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.suite.score` + :mod:`dlm_sway.suite.report`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -import json | ||
| 6 | -from datetime import timedelta | ||
| 7 | -from typing import Literal | ||
| 8 | - | ||
| 9 | -import pytest | ||
| 10 | - | ||
| 11 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | ||
| 12 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | ||
| 13 | -from dlm_sway.suite import report, score | ||
| 14 | -from dlm_sway.suite.spec import SwaySpec | ||
| 15 | - | ||
| 16 | - | ||
| 17 | -class _AdherenceSpec(ProbeSpec): | ||
| 18 | - kind: Literal["__score_adherence"] = "__score_adherence" | ||
| 19 | - | ||
| 20 | - | ||
| 21 | -class _AdherenceProbe(Probe): | ||
| 22 | - kind = "__score_adherence" | ||
| 23 | - spec_cls = _AdherenceSpec | ||
| 24 | - category = "adherence" | ||
| 25 | - | ||
| 26 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 27 | - raise NotImplementedError # never executed; registered for category lookup | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -class _AttributionSpec(ProbeSpec): | ||
| 31 | - kind: Literal["__score_attribution"] = "__score_attribution" | ||
| 32 | - | ||
| 33 | - | ||
| 34 | -class _AttributionProbe(Probe): | ||
| 35 | - kind = "__score_attribution" | ||
| 36 | - spec_cls = _AttributionSpec | ||
| 37 | - category = "attribution" | ||
| 38 | - | ||
| 39 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | ||
| 40 | - raise NotImplementedError | ||
| 41 | - | ||
| 42 | - | ||
| 43 | -def _synth_suite(*probes: ProbeResult) -> SuiteResult: | ||
| 44 | - started = utcnow() | ||
| 45 | - return SuiteResult( | ||
| 46 | - spec_path="sway.yaml", | ||
| 47 | - started_at=started, | ||
| 48 | - finished_at=started + timedelta(seconds=1), | ||
| 49 | - base_model_id="base", | ||
| 50 | - adapter_id="adapter", | ||
| 51 | - sway_version="0.1.0.dev0", | ||
| 52 | - probes=probes, | ||
| 53 | - ) | ||
| 54 | - | ||
| 55 | - | ||
| 56 | -class TestCompute: | ||
| 57 | - def test_single_passing_probe(self) -> None: | ||
| 58 | - suite = _synth_suite( | ||
| 59 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.8) | ||
| 60 | - ) | ||
| 61 | - s = score.compute(suite) | ||
| 62 | - assert s.overall == pytest.approx(0.8) | ||
| 63 | - assert s.components["adherence"] == pytest.approx(0.8) | ||
| 64 | - assert s.band == "healthy" | ||
| 65 | - | ||
| 66 | - def test_mixed_categories_weighted(self) -> None: | ||
| 67 | - suite = _synth_suite( | ||
| 68 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9), | ||
| 69 | - ProbeResult(name="b", kind="__score_attribution", verdict=Verdict.PASS, score=0.3), | ||
| 70 | - ) | ||
| 71 | - s = score.compute(suite) | ||
| 72 | - # Active categories: adherence (0.30) + attribution (0.35). Normalized. | ||
| 73 | - expected = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35) | ||
| 74 | - assert s.overall == pytest.approx(expected) | ||
| 75 | - | ||
| 76 | - def test_errors_and_skips_excluded(self) -> None: | ||
| 77 | - suite = _synth_suite( | ||
| 78 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9), | ||
| 79 | - ProbeResult(name="b", kind="__score_adherence", verdict=Verdict.SKIP, score=None), | ||
| 80 | - ProbeResult(name="c", kind="__score_adherence", verdict=Verdict.ERROR, score=None), | ||
| 81 | - ) | ||
| 82 | - s = score.compute(suite) | ||
| 83 | - assert s.components["adherence"] == pytest.approx(0.9) | ||
| 84 | - | ||
| 85 | - def test_per_probe_weights_override_uniform(self) -> None: | ||
| 86 | - suite = _synth_suite( | ||
| 87 | - ProbeResult( | ||
| 88 | - name="a", | ||
| 89 | - kind="__score_adherence", | ||
| 90 | - verdict=Verdict.PASS, | ||
| 91 | - score=1.0, | ||
| 92 | - evidence={"weight": 3.0}, | ||
| 93 | - ), | ||
| 94 | - ProbeResult( | ||
| 95 | - name="b", | ||
| 96 | - kind="__score_adherence", | ||
| 97 | - verdict=Verdict.PASS, | ||
| 98 | - score=0.0, | ||
| 99 | - evidence={"weight": 1.0}, | ||
| 100 | - ), | ||
| 101 | - ) | ||
| 102 | - s = score.compute(suite) | ||
| 103 | - # Weighted mean: (3·1 + 1·0) / 4 = 0.75 | ||
| 104 | - assert s.components["adherence"] == pytest.approx(0.75) | ||
| 105 | - | ||
| 106 | - def test_failed_probe_surfaces_in_findings(self) -> None: | ||
| 107 | - suite = _synth_suite( | ||
| 108 | - ProbeResult( | ||
| 109 | - name="bad", | ||
| 110 | - kind="__score_adherence", | ||
| 111 | - verdict=Verdict.FAIL, | ||
| 112 | - score=0.1, | ||
| 113 | - message="nope", | ||
| 114 | - ) | ||
| 115 | - ) | ||
| 116 | - s = score.compute(suite) | ||
| 117 | - assert any("bad" in f for f in s.findings) | ||
| 118 | - | ||
| 119 | - | ||
| 120 | -class TestJsonReport: | ||
| 121 | - def test_schema_fields(self) -> None: | ||
| 122 | - suite = _synth_suite( | ||
| 123 | - ProbeResult( | ||
| 124 | - name="p1", | ||
| 125 | - kind="__score_adherence", | ||
| 126 | - verdict=Verdict.PASS, | ||
| 127 | - score=0.75, | ||
| 128 | - raw=0.12, | ||
| 129 | - z_score=3.1, | ||
| 130 | - ) | ||
| 131 | - ) | ||
| 132 | - s = score.compute(suite) | ||
| 133 | - out = json.loads(report.to_json(suite, s)) | ||
| 134 | - assert out["schema_version"] == 1 | ||
| 135 | - assert out["score"]["overall"] == pytest.approx(0.75) | ||
| 136 | - assert out["probes"][0]["verdict"] == "pass" | ||
| 137 | - assert out["probes"][0]["z_score"] == pytest.approx(3.1) | ||
| 138 | - | ||
| 139 | - | ||
| 140 | -class TestJunit: | ||
| 141 | - def test_counts_populated(self) -> None: | ||
| 142 | - suite = _synth_suite( | ||
| 143 | - ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=1.0), | ||
| 144 | - ProbeResult(name="p2", kind="__score_adherence", verdict=Verdict.FAIL, score=0.0), | ||
| 145 | - ProbeResult( | ||
| 146 | - name="p3", | ||
| 147 | - kind="__score_adherence", | ||
| 148 | - verdict=Verdict.ERROR, | ||
| 149 | - score=None, | ||
| 150 | - ), | ||
| 151 | - ) | ||
| 152 | - s = score.compute(suite) | ||
| 153 | - xml = report.to_junit(suite, s) | ||
| 154 | - assert 'tests="3"' in xml | ||
| 155 | - assert 'failures="1"' in xml | ||
| 156 | - assert 'errors="1"' in xml | ||
| 157 | - assert "<failure" in xml | ||
| 158 | - assert "<error" in xml | ||
| 159 | - | ||
| 160 | - | ||
| 161 | -class TestMarkdown: | ||
| 162 | - def test_contains_probe_table(self) -> None: | ||
| 163 | - suite = _synth_suite( | ||
| 164 | - ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=0.8) | ||
| 165 | - ) | ||
| 166 | - s = score.compute(suite) | ||
| 167 | - md = report.to_markdown(suite, s) | ||
| 168 | - assert "dlm-sway report" in md | ||
| 169 | - assert "| p1 | `__score_adherence`" in md | ||
| 170 | - | ||
| 171 | - | ||
| 172 | -class TestTerminal: | ||
| 173 | - def test_renders_without_error(self) -> None: | ||
| 174 | - import io | ||
| 175 | - | ||
| 176 | - from rich.console import Console | ||
| 177 | - | ||
| 178 | - suite = _synth_suite( | ||
| 179 | - ProbeResult( | ||
| 180 | - name="p1", | ||
| 181 | - kind="__score_adherence", | ||
| 182 | - verdict=Verdict.PASS, | ||
| 183 | - score=0.8, | ||
| 184 | - raw=0.12, | ||
| 185 | - z_score=3.1, | ||
| 186 | - message="looks fine", | ||
| 187 | - ), | ||
| 188 | - ProbeResult( | ||
| 189 | - name="p2", | ||
| 190 | - kind="__score_attribution", | ||
| 191 | - verdict=Verdict.FAIL, | ||
| 192 | - score=0.1, | ||
| 193 | - message="a very long message that will be truncated — " * 5, | ||
| 194 | - ), | ||
| 195 | - ProbeResult( | ||
| 196 | - name="p3", | ||
| 197 | - kind="__score_adherence", | ||
| 198 | - verdict=Verdict.SKIP, | ||
| 199 | - score=None, | ||
| 200 | - ), | ||
| 201 | - ) | ||
| 202 | - s = score.compute(suite) | ||
| 203 | - buf = io.StringIO() | ||
| 204 | - console = Console(file=buf, force_terminal=False, width=120) | ||
| 205 | - report.to_terminal(suite, s, console=console) | ||
| 206 | - out = buf.getvalue() | ||
| 207 | - assert "dlm-sway report" in out | ||
| 208 | - assert "overall:" in out | ||
| 209 | - assert "p1" in out | ||
| 210 | - assert "p2" in out | ||
| 211 | - # Top findings section kicks in because p2 failed. | ||
| 212 | - assert "top findings" in out | ||
| 213 | - | ||
| 214 | - | ||
| 215 | -# Force the SwaySpec model to stay reachable from tests (keeps mypy happy | ||
| 216 | -# on the eventual CLI path that calls into both). | ||
| 217 | -assert SwaySpec is not None | ||
sway/tests/unit/test_suite_spec.pydeleted@@ -1,85 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.suite.spec` + :mod:`dlm_sway.suite.loader`.""" | ||
| 2 | - | ||
| 3 | -from __future__ import annotations | ||
| 4 | - | ||
| 5 | -from pathlib import Path | ||
| 6 | - | ||
| 7 | -import pytest | ||
| 8 | - | ||
| 9 | -from dlm_sway.core.errors import SpecValidationError | ||
| 10 | -from dlm_sway.suite.loader import from_dict, load_spec | ||
| 11 | -from dlm_sway.suite.spec import SwaySpec | ||
| 12 | - | ||
| 13 | - | ||
| 14 | -def _minimum_valid() -> dict: | ||
| 15 | - return { | ||
| 16 | - "version": 1, | ||
| 17 | - "models": { | ||
| 18 | - "base": {"kind": "hf", "base": "HuggingFaceTB/SmolLM2-135M-Instruct"}, | ||
| 19 | - "ft": { | ||
| 20 | - "kind": "hf", | ||
| 21 | - "base": "HuggingFaceTB/SmolLM2-135M-Instruct", | ||
| 22 | - "adapter": "/tmp/adapter", | ||
| 23 | - }, | ||
| 24 | - }, | ||
| 25 | - "suite": [], | ||
| 26 | - } | ||
| 27 | - | ||
| 28 | - | ||
| 29 | -class TestSwaySpec: | ||
| 30 | - def test_minimum_valid(self) -> None: | ||
| 31 | - spec = from_dict(_minimum_valid()) | ||
| 32 | - assert isinstance(spec, SwaySpec) | ||
| 33 | - assert spec.version == 1 | ||
| 34 | - assert spec.defaults.seed == 0 | ||
| 35 | - assert spec.defaults.differential is True | ||
| 36 | - assert spec.suite == [] | ||
| 37 | - | ||
| 38 | - def test_rejects_unknown_top_level_keys(self) -> None: | ||
| 39 | - data = _minimum_valid() | ||
| 40 | - data["bogus"] = True | ||
| 41 | - with pytest.raises(SpecValidationError) as exc_info: | ||
| 42 | - from_dict(data) | ||
| 43 | - assert "bogus" in str(exc_info.value).lower() | ||
| 44 | - | ||
| 45 | - def test_rejects_future_version(self) -> None: | ||
| 46 | - data = _minimum_valid() | ||
| 47 | - data["version"] = 9 | ||
| 48 | - with pytest.raises(SpecValidationError, match="unsupported sway spec version"): | ||
| 49 | - from_dict(data) | ||
| 50 | - | ||
| 51 | - def test_defaults_frozen(self) -> None: | ||
| 52 | - spec = from_dict(_minimum_valid()) | ||
| 53 | - from pydantic import ValidationError | ||
| 54 | - | ||
| 55 | - with pytest.raises(ValidationError): | ||
| 56 | - spec.defaults.seed = 99 # type: ignore[misc] | ||
| 57 | - | ||
| 58 | - | ||
| 59 | -class TestLoader: | ||
| 60 | - def test_missing_file(self, tmp_path: Path) -> None: | ||
| 61 | - missing = tmp_path / "nope.yaml" | ||
| 62 | - with pytest.raises(SpecValidationError, match="not found"): | ||
| 63 | - load_spec(missing) | ||
| 64 | - | ||
| 65 | - def test_invalid_yaml(self, tmp_path: Path) -> None: | ||
| 66 | - bad = tmp_path / "bad.yaml" | ||
| 67 | - # An unmatched { triggers yaml.scanner; a structurally ambiguous | ||
| 68 | - # indent parses as a string value, which isn't a YAML error. | ||
| 69 | - bad.write_text("{ unmatched: [", encoding="utf-8") | ||
| 70 | - with pytest.raises(SpecValidationError, match="invalid YAML"): | ||
| 71 | - load_spec(bad) | ||
| 72 | - | ||
| 73 | - def test_non_mapping_top_level(self, tmp_path: Path) -> None: | ||
| 74 | - bad = tmp_path / "list.yaml" | ||
| 75 | - bad.write_text("- 1\n- 2\n", encoding="utf-8") | ||
| 76 | - with pytest.raises(SpecValidationError, match="must be a mapping"): | ||
| 77 | - load_spec(bad) | ||
| 78 | - | ||
| 79 | - def test_roundtrip_via_yaml(self, tmp_path: Path) -> None: | ||
| 80 | - import yaml | ||
| 81 | - | ||
| 82 | - path = tmp_path / "sway.yaml" | ||
| 83 | - path.write_text(yaml.safe_dump(_minimum_valid()), encoding="utf-8") | ||
| 84 | - spec = load_spec(path) | ||
| 85 | - assert spec.models.ft.adapter == Path("/tmp/adapter") | ||
sway/tests/unit/test_visualize.pydeleted@@ -1,202 +0,0 @@ | |||
| 1 | -"""Tests for :mod:`dlm_sway.visualize`. | ||
| 2 | - | ||
| 3 | -Exercises the error path (matplotlib missing) and the happy path when | ||
| 4 | -the module is present by stubbing ``matplotlib.pyplot`` via sys.modules. | ||
| 5 | -""" | ||
| 6 | - | ||
| 7 | -from __future__ import annotations | ||
| 8 | - | ||
| 9 | -import sys | ||
| 10 | -import types | ||
| 11 | -from datetime import timedelta | ||
| 12 | - | ||
| 13 | -import pytest | ||
| 14 | - | ||
| 15 | -from dlm_sway.core.errors import BackendNotAvailableError | ||
| 16 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | ||
| 17 | - | ||
| 18 | - | ||
| 19 | -def _suite_with(*probes: ProbeResult) -> SuiteResult: | ||
| 20 | - started = utcnow() | ||
| 21 | - return SuiteResult( | ||
| 22 | - spec_path="sway.yaml", | ||
| 23 | - started_at=started, | ||
| 24 | - finished_at=started + timedelta(seconds=1), | ||
| 25 | - base_model_id="b", | ||
| 26 | - adapter_id="a", | ||
| 27 | - sway_version="0.1.0.dev0", | ||
| 28 | - probes=probes, | ||
| 29 | - ) | ||
| 30 | - | ||
| 31 | - | ||
| 32 | -class _FakeFig: | ||
| 33 | - def tight_layout(self) -> None: # pragma: no cover — trivial | ||
| 34 | - return None | ||
| 35 | - | ||
| 36 | - | ||
| 37 | -class _FakeAx: | ||
| 38 | - def __init__(self) -> None: | ||
| 39 | - self.calls: list[str] = [] | ||
| 40 | - | ||
| 41 | - def bar(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 42 | - self.calls.append("bar") | ||
| 43 | - | ||
| 44 | - def plot(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 45 | - self.calls.append("plot") | ||
| 46 | - | ||
| 47 | - def hist(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 48 | - self.calls.append("hist") | ||
| 49 | - | ||
| 50 | - def axhline(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 51 | - return None | ||
| 52 | - | ||
| 53 | - def axvline(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 54 | - return None | ||
| 55 | - | ||
| 56 | - def set_xticks(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 57 | - return None | ||
| 58 | - | ||
| 59 | - def set_xticklabels(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 60 | - return None | ||
| 61 | - | ||
| 62 | - def set_xlabel(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 63 | - return None | ||
| 64 | - | ||
| 65 | - def set_ylabel(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 66 | - return None | ||
| 67 | - | ||
| 68 | - def set_title(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 69 | - return None | ||
| 70 | - | ||
| 71 | - def legend(self, *a, **k): # type: ignore[no-untyped-def] | ||
| 72 | - return None | ||
| 73 | - | ||
| 74 | - | ||
| 75 | -@pytest.fixture | ||
| 76 | -def fake_mpl(monkeypatch: pytest.MonkeyPatch) -> _FakeAx: | ||
| 77 | - ax = _FakeAx() | ||
| 78 | - | ||
| 79 | - def _subplots(*a, **k): # type: ignore[no-untyped-def] | ||
| 80 | - return _FakeFig(), ax | ||
| 81 | - | ||
| 82 | - plt = types.ModuleType("matplotlib.pyplot") | ||
| 83 | - plt.subplots = _subplots # type: ignore[attr-defined] | ||
| 84 | - mpl_pkg = types.ModuleType("matplotlib") | ||
| 85 | - monkeypatch.setitem(sys.modules, "matplotlib", mpl_pkg) | ||
| 86 | - monkeypatch.setitem(sys.modules, "matplotlib.pyplot", plt) | ||
| 87 | - return ax | ||
| 88 | - | ||
| 89 | - | ||
| 90 | -def test_section_sis_plot_uses_per_section_evidence(fake_mpl: _FakeAx) -> None: | ||
| 91 | - from dlm_sway.visualize import plot_section_sis | ||
| 92 | - | ||
| 93 | - suite = _suite_with( | ||
| 94 | - ProbeResult( | ||
| 95 | - name="sis", | ||
| 96 | - kind="section_internalization", | ||
| 97 | - verdict=Verdict.PASS, | ||
| 98 | - score=0.75, | ||
| 99 | - raw=0.1, | ||
| 100 | - evidence={ | ||
| 101 | - "per_section": [ | ||
| 102 | - { | ||
| 103 | - "section_id": "a", | ||
| 104 | - "kind": "prose", | ||
| 105 | - "tag": None, | ||
| 106 | - "base_nll": 3.0, | ||
| 107 | - "ft_nll": 2.5, | ||
| 108 | - "own_lift": 0.17, | ||
| 109 | - "leak_lift": 0.02, | ||
| 110 | - "effective_sis": 0.15, | ||
| 111 | - "passed": True, | ||
| 112 | - }, | ||
| 113 | - { | ||
| 114 | - "section_id": "b", | ||
| 115 | - "kind": "instruction", | ||
| 116 | - "tag": "intro", | ||
| 117 | - "base_nll": 4.0, | ||
| 118 | - "ft_nll": 3.9, | ||
| 119 | - "own_lift": 0.025, | ||
| 120 | - "leak_lift": 0.03, | ||
| 121 | - "effective_sis": -0.005, | ||
| 122 | - "passed": False, | ||
| 123 | - }, | ||
| 124 | - ], | ||
| 125 | - "per_section_threshold": 0.05, | ||
| 126 | - }, | ||
| 127 | - ) | ||
| 128 | - ) | ||
| 129 | - plot_section_sis(suite) | ||
| 130 | - assert "bar" in fake_mpl.calls | ||
| 131 | - | ||
| 132 | - | ||
| 133 | -def test_adapter_ablation_plot(fake_mpl: _FakeAx) -> None: | ||
| 134 | - from dlm_sway.visualize import plot_adapter_ablation | ||
| 135 | - | ||
| 136 | - suite = _suite_with( | ||
| 137 | - ProbeResult( | ||
| 138 | - name="abl", | ||
| 139 | - kind="adapter_ablation", | ||
| 140 | - verdict=Verdict.PASS, | ||
| 141 | - score=0.8, | ||
| 142 | - raw=0.9, | ||
| 143 | - evidence={ | ||
| 144 | - "lambdas": [0.0, 0.5, 1.0, 1.25], | ||
| 145 | - "mean_divergence_per_lambda": [0.0, 0.5, 1.0, 1.1], | ||
| 146 | - "linearity": 0.91, | ||
| 147 | - "saturation_lambda": 0.75, | ||
| 148 | - "overshoot": 1.1, | ||
| 149 | - }, | ||
| 150 | - ) | ||
| 151 | - ) | ||
| 152 | - plot_adapter_ablation(suite) | ||
| 153 | - assert "plot" in fake_mpl.calls | ||
| 154 | - | ||
| 155 | - | ||
| 156 | -def test_kl_histogram_plot(fake_mpl: _FakeAx) -> None: | ||
| 157 | - from dlm_sway.visualize import plot_kl_histogram | ||
| 158 | - | ||
| 159 | - suite = _suite_with( | ||
| 160 | - ProbeResult( | ||
| 161 | - name="dk", | ||
| 162 | - kind="delta_kl", | ||
| 163 | - verdict=Verdict.PASS, | ||
| 164 | - score=0.7, | ||
| 165 | - raw=0.1, | ||
| 166 | - evidence={"per_prompt": [0.05, 0.1, 0.12, 0.09, 0.15], "divergence_kind": "js"}, | ||
| 167 | - ) | ||
| 168 | - ) | ||
| 169 | - plot_kl_histogram(suite) | ||
| 170 | - assert "hist" in fake_mpl.calls | ||
| 171 | - | ||
| 172 | - | ||
| 173 | -def test_raises_when_matplotlib_missing(monkeypatch: pytest.MonkeyPatch) -> None: | ||
| 174 | - # Purge matplotlib modules and block imports. | ||
| 175 | - for mod in list(sys.modules): | ||
| 176 | - if mod == "matplotlib" or mod.startswith("matplotlib."): | ||
| 177 | - monkeypatch.delitem(sys.modules, mod, raising=False) | ||
| 178 | - | ||
| 179 | - import builtins | ||
| 180 | - | ||
| 181 | - real_import = builtins.__import__ | ||
| 182 | - | ||
| 183 | - def fake_import(name: str, *a, **k): # type: ignore[no-untyped-def] | ||
| 184 | - if name == "matplotlib" or name.startswith("matplotlib."): | ||
| 185 | - raise ImportError("matplotlib missing in this venv") | ||
| 186 | - return real_import(name, *a, **k) | ||
| 187 | - | ||
| 188 | - monkeypatch.setattr(builtins, "__import__", fake_import) | ||
| 189 | - | ||
| 190 | - from dlm_sway.visualize import plot_section_sis | ||
| 191 | - | ||
| 192 | - suite = _suite_with() | ||
| 193 | - with pytest.raises(BackendNotAvailableError): | ||
| 194 | - plot_section_sis(suite) | ||
| 195 | - | ||
| 196 | - | ||
| 197 | -def test_raises_when_no_matching_probe(fake_mpl: _FakeAx) -> None: | ||
| 198 | - from dlm_sway.visualize import plot_section_sis | ||
| 199 | - | ||
| 200 | - suite = _suite_with() # empty — no section_internalization probe | ||
| 201 | - with pytest.raises(ValueError, match="section_internalization"): | ||
| 202 | - plot_section_sis(suite) | ||