sway: convert in-tree subproject to git submodule pointing at tenseleyFlow/sway
- SHA: 72bb0030b72321dea3c66a2e6d7ce26e52c74550
- Parent: 9da4019
- Tree: e628ba5

.gitmodules (modified) @@ -5,3 +5,6 @@
| 5 | 5 | # `scripts/bump-llama-cpp.sh build` writes under vendor/llama.cpp/build/ |
| 6 | 6 | # which the submodule's own .gitignore covers. |
| 7 | 7 | ignore = untracked |
| 8 | +[submodule "sway"] | |
| 9 | + path = sway | |
| 10 | + url = https://github.com/tenseleyFlow/sway.git | |
sway (added) @@ -0,0 +1,1 @@
| 1 | +Subproject commit 98ad9417c94e1bbeb97cf5e553878d7953513f69 | |
sway/CHANGELOG.md (deleted) @@ -1,41 +0,0 @@
| 1 | -# Changelog | |
| 2 | - | |
| 3 | -## 0.1.0.dev0 — 2026-04-20 | |
| 4 | - | |
| 5 | -Initial pre-alpha. Full 11-primitive battery shipped. | |
| 6 | - | |
| 7 | -### Primitives | |
| 8 | - | |
| 9 | -- **Adherence** | |
| 10 | - - `delta_kl` — mean JS/KL divergence between base and fine-tuned next-token distributions (see the sketch after this list) | |
| 11 | - - `adapter_revert` — reversion under adversarial paraphrase (needs `sway-eval[semsim]`) | |
| 12 | - - `prompt_collapse` — exponential-decay fit of divergence over context length | |
| 13 | -- **Attribution** | |
| 14 | - - `section_internalization` *(flagship)* — per-section `effective_sis` with leak check | |
| 15 | - - `paraphrase_invariance` — memorization vs. generalization, intent-aware | |
| 16 | - - `preference_flip` — DPO/ORPO chosen/rejected margin inversion | |
| 17 | -- **Calibration** | |
| 18 | - - `style_fingerprint` — 6-dim numpy-only stylistic shift vs. document | |
| 19 | - - `calibration_drift` — general-knowledge regression on a packaged 30-item pack | |
| 20 | - - `leakage` — greedy LCS recall + perturbation fragility | |
| 21 | -- **Ablation** | |
| 22 | - - `adapter_ablation` *(signature primitive)* — λ-scaled divergence curve with linearity, saturation, overshoot metrics | |
| 23 | -- **Baseline** | |
| 24 | - - `null_adapter` — stats scaffolding for z-score calibration (implementation pending) | |
| 25 | - | |
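A minimal sketch of the `delta_kl` statistic named above, assuming dense probability vectors; the shipped primitive works on top-k `TokenDist` objects with a tail bucket, so the numbers and shapes here are illustrative only:

```python
import numpy as np

def js_divergence(p: np.ndarray, q: np.ndarray, eps: float = 1e-12) -> float:
    """Jensen-Shannon divergence (nats) between two probability vectors."""
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)

    def kl_to_mid(a: np.ndarray) -> float:
        return float(np.sum(a * (np.log(a + eps) - np.log(m + eps))))

    return 0.5 * kl_to_mid(p) + 0.5 * kl_to_mid(q)

# delta_kl = mean divergence between base and fine-tuned next-token
# distributions over a prompt set (toy numbers below).
base = [np.array([0.9, 0.05, 0.05]), np.array([0.6, 0.3, 0.1])]
ft = [np.array([0.4, 0.4, 0.2]), np.array([0.5, 0.3, 0.2])]
print(np.mean([js_divergence(p, q) for p, q in zip(base, ft)]))
```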
| 26 | -### Infrastructure | |
| 27 | - | |
| 28 | -- `DifferentialBackend` + `ScalableDifferentialBackend` protocols (sketched after this list) | |
| 29 | -- HuggingFace + PEFT backend with `disable_adapter` / `set_adapter` toggling and LoRA-scale mutation | |
| 30 | -- Dummy backend for unit tests (canned responses + linear-blend scalable mode) | |
| 31 | -- YAML spec loader, composite score (four-category weighted), rich terminal + JSON + JUnit + Markdown reports | |
| 32 | -- Typer CLI: `run`, `gate`, `check`, `diff`, `autogen`, `doctor`, `report` | |
| 33 | -- `.dlm` bridge (`dlm-sway[dlm]`): resolver + full-battery autogen | |
| 34 | -- Matplotlib visualizations (`dlm-sway[viz]`): SIS bar chart, ablation curve, KL histogram | |
| 35 | - | |
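The two protocols listed above live in `dlm_sway.core.scoring`, which isn't part of this diff. A rough sketch of their shape, inferred from how the backends below implement them; the real definitions may differ in detail:

```python
from contextlib import AbstractContextManager
from typing import Protocol, runtime_checkable

@runtime_checkable
class DifferentialBackend(Protocol):
    """Toggle one loaded model between its base and fine-tuned views."""
    def as_base(self) -> AbstractContextManager: ...
    def as_finetuned(self) -> AbstractContextManager: ...

@runtime_checkable
class ScalableDifferentialBackend(DifferentialBackend, Protocol):
    """Adds lambda-scaled and null-adapter views for ablation and z-scoring."""
    def as_scaled_adapter(self, lam: float) -> AbstractContextManager: ...
    def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> AbstractContextManager: ...
```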
| 36 | -### Known gaps | |
| 37 | - | |
| 38 | -- Null-adapter baseline is scaffolded but its HF-level materialization (building random-init LoRAs at matched rank) is not yet wired — probes fall back to fixed thresholds until the next milestone. | |
| 39 | -- Custom backend entry-point dispatch (`kind: custom`) is stubbed but not implemented. | |
| 40 | -- MLX backend is registered as a future-milestone target; all MLX paths raise `BackendNotAvailableError`. | |
| 41 | -- PyPI publication of the `dlm-sway` wheel is pending a clean CI release workflow. | |
sway/LICENSE (deleted) @@ -1,21 +0,0 @@
| 1 | -MIT License | |
| 2 | - | |
| 3 | -Copyright (c) 2026 Matt Wolffe | |
| 4 | - | |
| 5 | -Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 6 | -of this software and associated documentation files (the "Software"), to deal | |
| 7 | -in the Software without restriction, including without limitation the rights | |
| 8 | -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 9 | -copies of the Software, and to permit persons to whom the Software is | |
| 10 | -furnished to do so, subject to the following conditions: | |
| 11 | - | |
| 12 | -The above copyright notice and this permission notice shall be included in all | |
| 13 | -copies or substantial portions of the Software. | |
| 14 | - | |
| 15 | -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 16 | -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 17 | -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 18 | -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 19 | -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 20 | -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 21 | -SOFTWARE. | |
sway/README.md (deleted) @@ -1,101 +0,0 @@
| 1 | -# dlm-sway | |
| 2 | - | |
| 3 | -Differential testing for fine-tuned causal language models. | |
| 4 | - | |
| 5 | -**One question:** *did LoRA/QLoRA training actually change model behavior | |
| 6 | -in a meaningful way, or is the model just defaulting to the pretrained | |
| 7 | -base?* | |
| 8 | - | |
| 9 | -`dlm-sway` gives you a trustworthy, reproducible answer with eleven | |
| 10 | -purpose-built primitives, each z-scored against a null-adapter baseline. | |
| 11 | -No LLM judges. No external APIs. Deterministic on CPU where possible. | |
| 12 | - | |
| 13 | -## Install | |
| 14 | - | |
| 15 | -```bash | |
| 16 | -pip install "dlm-sway[hf]" # HuggingFace + PEFT backend | |
| 17 | -pip install "dlm-sway[hf,style,semsim]" # full primitive battery | |
| 18 | -pip install "dlm-sway[all]" # everything including optional viz | |
| 19 | -pip install "dlm-sway[dlm]" # auto-generate tests from a .dlm file | |
| 20 | -``` | |
| 21 | - | |
| 22 | -## 90-second smoke test | |
| 23 | - | |
| 24 | -```bash | |
| 25 | -dlm-sway check path/to/adapter --base HuggingFaceTB/SmolLM2-135M-Instruct | |
| 26 | -``` | |
| 27 | - | |
| 28 | -Outputs a verdict in under a minute on CPU for small models: *your | |
| 29 | -adapter is 4.2σ above noise* ✅ or *indistinguishable from a null | |
| 30 | -adapter* ❌. | |
| 31 | - | |
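The σ in that verdict is ordinary z-scoring against a null-adapter noise floor. A sketch of the arithmetic with made-up numbers; the real pipeline derives both sides from divergence probes:

```python
import numpy as np

def z_score(real: float, null_samples: list[float]) -> float:
    """How many standard deviations the real adapter sits above null-adapter noise."""
    null = np.asarray(null_samples, dtype=np.float64)
    return float((real - null.mean()) / max(null.std(ddof=1), 1e-12))

# One divergence for the real adapter vs. several random-init null adapters.
print(f"{z_score(0.42, [0.05, 0.07, 0.06, 0.04, 0.08]):.1f}σ above noise")
```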
| 32 | -## Full suite | |
| 33 | - | |
| 34 | -```yaml | |
| 35 | -# sway.yaml | |
| 36 | -version: 1 | |
| 37 | -models: | |
| 38 | - base: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct"} | |
| 39 | - ft: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct", | |
| 40 | - adapter: "./runs/adapter/v0003"} | |
| 41 | -suite: | |
| 42 | - - {name: knows_concept, kind: dir, | |
| 43 | - prompt: "The Dunning-Kruger effect describes", | |
| 44 | - target: " a cognitive bias where", | |
| 45 | - distractor: " a programming language"} | |
| 46 | - - {name: no_reversion, kind: adapter_revert, paraphrases: 4} | |
| 47 | - - {name: section_attribution, kind: section_internalization} | |
| 48 | -``` | |
| 49 | - | |
| 50 | -```bash | |
| 51 | -dlm-sway run sway.yaml # full report to terminal + JSON | |
| 52 | -dlm-sway gate sway.yaml --junit # CI-friendly; non-zero on fail | |
| 53 | -``` | |
| 54 | - | |
| 55 | -## Why it exists | |
| 56 | - | |
| 57 | -Standard benchmarks (MMLU, HellaSwag) ask *"how good is this model?"* | |
| 58 | -That's the wrong question after a targeted LoRA fine-tune on a small | |
| 59 | -user-authored document. The right question is *"did the adapter actually | |
| 60 | -move the model toward what I wrote?"* — and existing tools answer this | |
| 61 | -poorly. | |
| 62 | - | |
| 63 | -`dlm-sway` answers it directly via eleven primitives, ten of which | |
| 64 | -fall into four scored categories (the eleventh, `null_adapter`, is the baseline): | |
| 65 | - | |
| 66 | -| Category | Primitives | | |
| 67 | -|---------------|-------------------------------------------------------| | |
| 68 | -| Adherence | `delta_kl`, `adapter_revert`, `prompt_collapse` | | |
| 69 | -| Attribution | `section_internalization`, `paraphrase_invariance`, `preference_flip` | | |
| 70 | -| Calibration | `style_fingerprint`, `calibration_drift`, `leakage` | | |
| 71 | -| Ablation | `adapter_ablation` ← the signature primitive | | |
| 72 | - | |
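The composite verdict behind `run`/`gate` is a four-category weighted mean (per the changelog). The weights below are hypothetical; only the shape of the computation is meant:

```python
# Hypothetical weights; not the shipped defaults.
CATEGORY_WEIGHTS = {"adherence": 0.3, "attribution": 0.3, "calibration": 0.2, "ablation": 0.2}

def composite_score(scores: dict[str, float]) -> float:
    """Weighted mean over per-category scores normalized to [0, 1]."""
    total = sum(CATEGORY_WEIGHTS.values())
    return sum(w * scores[cat] for cat, w in CATEGORY_WEIGHTS.items()) / total

print(composite_score({"adherence": 0.8, "attribution": 0.9, "calibration": 0.7, "ablation": 0.85}))
```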
| 73 | -**The signature primitive.** `adapter_ablation` scales the LoRA additive | |
| 74 | -term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} and measures the divergence | |
| 75 | -curve. A healthy fine-tune shows a smooth, monotonic, non-saturated | |
| 76 | -response. A degenerate one shows a step function or an overshoot-then- | |
| 77 | -crash. Nobody else does this because nobody else gets this close to the | |
| 78 | -adapter math. | |
| 79 | - | |
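What "smooth, monotonic, non-saturated" means numerically: fit the divergence-vs-λ curve and derive shape metrics. A sketch with hypothetical metric definitions; the shipped formulas may differ:

```python
import numpy as np

LAMBDAS = np.array([0.0, 0.25, 0.5, 0.75, 1.0, 1.25])

def ablation_metrics(div: np.ndarray) -> dict[str, float]:
    slope, intercept = np.polyfit(LAMBDAS, div, 1)
    fitted = slope * LAMBDAS + intercept
    ss_res = float(np.sum((div - fitted) ** 2))
    ss_tot = float(np.sum((div - div.mean()) ** 2))
    return {
        # R² of a straight-line fit: near 1.0 for a healthy, linear response.
        "linearity": 1.0 - ss_res / max(ss_tot, 1e-12),
        # Growth past λ=1 relative to growth up to λ=1: near 0 means saturated.
        "saturation": float((div[-1] - div[-2]) / max(div[-2] - div[0], 1e-12)),
        # Divergence dropping after λ=1 signals overshoot-then-crash.
        "overshoot": float(max(div[-2] - div[-1], 0.0)),
    }

print(ablation_metrics(np.array([0.0, 0.11, 0.21, 0.33, 0.42, 0.50])))  # near-linear
```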
| 80 | -## The `.dlm` integration | |
| 81 | - | |
| 82 | -If you trained your adapter via the [DocumentLanguageModel | |
| 83 | -project](https://github.com/tenseleyFlow/DocumentLanguageModel), sway | |
| 84 | -can auto-generate a test suite from your document's sections: | |
| 85 | - | |
| 86 | -```bash | |
| 87 | -pip install "dlm-sway[hf,dlm]" | |
| 88 | -dlm-sway autogen path/to/doc.dlm -o sway.yaml | |
| 89 | -dlm-sway run sway.yaml | |
| 90 | -``` | |
| 91 | - | |
| 92 | -Per-section attribution tells you *which* parts of your document | |
| 93 | -actually moved the model — a kind of signal no other tool provides. | |
| 94 | - | |
| 95 | -## Status | |
| 96 | - | |
| 97 | -Pre-alpha. API will break. Version `0.1.0` is the first tag. | |
| 98 | - | |
| 99 | -## License | |
| 100 | - | |
| 101 | -MIT | |
sway/pyproject.toml (deleted) @@ -1,210 +0,0 @@
| 1 | -[project] | |
| 2 | -name = "dlm-sway" | |
| 3 | -version = "0.1.0.dev0" | |
| 4 | -description = "Differential testing for fine-tuned causal LMs: did LoRA/QLoRA training actually change behavior, or is the model defaulting to the pretrained base?" | |
| 5 | -readme = "README.md" | |
| 6 | -requires-python = ">=3.11" | |
| 7 | -license = { text = "MIT" } | |
| 8 | -authors = [{ name = "Matt Wolffe", email = "mfwolffe@outlook.com" }] | |
| 9 | -keywords = [ | |
| 10 | - "lora", | |
| 11 | - "qlora", | |
| 12 | - "peft", | |
| 13 | - "fine-tuning", | |
| 14 | - "evaluation", | |
| 15 | - "llm", | |
| 16 | - "differential-testing", | |
| 17 | -] | |
| 18 | -classifiers = [ | |
| 19 | - "Development Status :: 3 - Alpha", | |
| 20 | - "Intended Audience :: Developers", | |
| 21 | - "Intended Audience :: Science/Research", | |
| 22 | - "License :: OSI Approved :: MIT License", | |
| 23 | - "Programming Language :: Python :: 3", | |
| 24 | - "Programming Language :: Python :: 3.11", | |
| 25 | - "Programming Language :: Python :: 3.12", | |
| 26 | - "Topic :: Scientific/Engineering :: Artificial Intelligence", | |
| 27 | -] | |
| 28 | - | |
| 29 | -# Core deps: spec loading, orchestration, reporting. No torch — a user | |
| 30 | -# who only defines specs or writes a custom backend shouldn't pull 3 GB | |
| 31 | -# of CUDA wheels. | |
| 32 | -dependencies = [ | |
| 33 | - "pydantic>=2.9", | |
| 34 | - "pyyaml>=6.0", | |
| 35 | - "typer>=0.12", | |
| 36 | - "rich>=13.7", | |
| 37 | - "numpy>=1.26", | |
| 38 | - "packaging>=24.0", | |
| 39 | -] | |
| 40 | - | |
| 41 | -[project.optional-dependencies] | |
| 42 | -# HuggingFace + PEFT scoring backend. The canonical path. | |
| 43 | -hf = [ | |
| 44 | - "torch>=2.4", | |
| 45 | - "transformers>=4.45", | |
| 46 | - "peft>=0.13", | |
| 47 | - "safetensors>=0.4", | |
| 48 | -] | |
| 49 | -# Apple Silicon inference. Env markers keep `uv sync --extra mlx` a no-op | |
| 50 | -# on non-Apple hosts so Linux/CUDA contributors' wheel resolution stays | |
| 51 | -# sane. | |
| 52 | -mlx = [ | |
| 53 | - "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'", | |
| 54 | - "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'", | |
| 55 | -] | |
| 56 | -# Stylistic fingerprinting (C1). spaCy models pull at runtime via | |
| 57 | -# `python -m spacy download`. | |
| 58 | -style = [ | |
| 59 | - "spacy>=3.7", | |
| 60 | - "textstat>=0.7", | |
| 61 | - "nlpaug>=1.1", | |
| 62 | -] | |
| 63 | -# Semantic similarity (A2). MiniLM ~80 MB, CPU-friendly. | |
| 64 | -semsim = [ | |
| 65 | - "sentence-transformers>=3.0", | |
| 66 | -] | |
| 67 | -# Optional .dlm integration. Only imported inside dlm_sway.integrations.dlm. | |
| 68 | -dlm = [ | |
| 69 | - "dlm>=0.9", | |
| 70 | -] | |
| 71 | -# Visualization (P9). | |
| 72 | -viz = [ | |
| 73 | - "matplotlib>=3.8", | |
| 74 | -] | |
| 75 | -all = [ | |
| 76 | - "torch>=2.4", | |
| 77 | - "transformers>=4.45", | |
| 78 | - "peft>=0.13", | |
| 79 | - "safetensors>=0.4", | |
| 80 | - "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'", | |
| 81 | - "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'", | |
| 82 | - "spacy>=3.7", | |
| 83 | - "textstat>=0.7", | |
| 84 | - "nlpaug>=1.1", | |
| 85 | - "sentence-transformers>=3.0", | |
| 86 | - "matplotlib>=3.8", | |
| 87 | -] | |
| 88 | - | |
| 89 | -[project.scripts] | |
| 90 | -dlm-sway = "dlm_sway.cli.app:main" | |
| 91 | - | |
| 92 | -[project.urls] | |
| 93 | -Homepage = "https://github.com/tenseleyFlow/DocumentLanguageModel" | |
| 94 | -Issues = "https://github.com/tenseleyFlow/DocumentLanguageModel/issues" | |
| 95 | - | |
| 96 | -[dependency-groups] | |
| 97 | -dev = [ | |
| 98 | - "pytest>=8.0", | |
| 99 | - "pytest-cov>=5.0", | |
| 100 | - "mypy>=1.11", | |
| 101 | - "ruff>=0.6", | |
| 102 | - "types-pyyaml>=6.0", | |
| 103 | - "hypothesis>=6.152.1", | |
| 104 | -] | |
| 105 | - | |
| 106 | -[build-system] | |
| 107 | -requires = ["hatchling"] | |
| 108 | -build-backend = "hatchling.build" | |
| 109 | - | |
| 110 | -[tool.hatch.build.targets.wheel] | |
| 111 | -packages = ["src/dlm_sway"] | |
| 112 | - | |
| 113 | -# -------- ruff -------- | |
| 114 | -[tool.ruff] | |
| 115 | -line-length = 100 | |
| 116 | -target-version = "py311" | |
| 117 | -src = ["src", "tests"] | |
| 118 | - | |
| 119 | -[tool.ruff.lint] | |
| 120 | -select = [ | |
| 121 | - "E", # pycodestyle errors | |
| 122 | - "F", # pyflakes | |
| 123 | - "W", # pycodestyle warnings | |
| 124 | - "I", # isort | |
| 125 | - "UP", # pyupgrade | |
| 126 | - "B", # bugbear | |
| 127 | - "N", # pep8-naming | |
| 128 | - "C4", # comprehensions | |
| 129 | - "SIM", # simplify | |
| 130 | - "PT", # pytest | |
| 131 | - "RET", # return | |
| 132 | - "ARG", # unused args | |
| 133 | - "PTH", # use pathlib | |
| 134 | - "TID", # tidy imports | |
| 135 | -] | |
| 136 | -ignore = [ | |
| 137 | - "E501", # handled by formatter | |
| 138 | -] | |
| 139 | - | |
| 140 | -[tool.ruff.lint.per-file-ignores] | |
| 141 | -"tests/**/*.py" = ["ARG", "PT011", "SIM117"] | |
| 142 | -# PyTorch's canonical `import torch.nn.functional as F` is universally | |
| 143 | -# read, so we allow the naming exception in the HF backend only. | |
| 144 | -"src/dlm_sway/backends/hf.py" = ["N812"] | |
| 145 | -# The .dlm bridge is the one place allowed to import the ``dlm`` package. | |
| 146 | -"src/dlm_sway/integrations/dlm/*.py" = ["TID251"] | |
| 147 | - | |
| 148 | -[tool.ruff.lint.flake8-tidy-imports.banned-api] | |
| 149 | -# Hard architectural boundary: the `dlm` package is only importable | |
| 150 | -# from inside the optional integration shim. This keeps dlm-sway | |
| 151 | -# usable for anyone with just a HuggingFace base + PEFT adapter. | |
| 152 | -"dlm".msg = "Import `dlm` only from dlm_sway.integrations.dlm (the optional extra)." | |
| 153 | - | |
| 154 | -[tool.ruff.format] | |
| 155 | -quote-style = "double" | |
| 156 | -indent-style = "space" | |
| 157 | - | |
| 158 | -# -------- mypy -------- | |
| 159 | -[tool.mypy] | |
| 160 | -strict = true | |
| 161 | -python_version = "3.11" | |
| 162 | -packages = ["dlm_sway"] | |
| 163 | -mypy_path = "src" | |
| 164 | -warn_return_any = true | |
| 165 | -warn_unused_ignores = true | |
| 166 | -warn_redundant_casts = true | |
| 167 | -no_implicit_optional = true | |
| 168 | -disallow_untyped_decorators = true | |
| 169 | -plugins = ["pydantic.mypy"] | |
| 170 | - | |
| 171 | -[tool.pydantic-mypy] | |
| 172 | -init_forbid_extra = true | |
| 173 | -init_typed = true | |
| 174 | -warn_required_dynamic_aliases = true | |
| 175 | - | |
| 176 | -# Stubless ML ecosystem packages. Narrow boundaries in backends/* import | |
| 177 | -# them explicitly; the rest of the codebase stays strict. | |
| 178 | -[[tool.mypy.overrides]] | |
| 179 | -module = [ | |
| 180 | - "torch", | |
| 181 | - "torch.*", | |
| 182 | - "transformers.*", | |
| 183 | - "peft.*", | |
| 184 | - "safetensors.*", | |
| 185 | - "mlx.*", | |
| 186 | - "mlx_lm.*", | |
| 187 | - "sentence_transformers.*", | |
| 188 | - "spacy.*", | |
| 189 | - "textstat.*", | |
| 190 | - "nlpaug.*", | |
| 191 | - "matplotlib", | |
| 192 | - "matplotlib.*", | |
| 193 | - "huggingface_hub.*", | |
| 194 | - "dlm.*", | |
| 195 | -] | |
| 196 | -ignore_missing_imports = true | |
| 197 | -disable_error_code = ["no-untyped-call"] | |
| 198 | - | |
| 199 | -# -------- pytest -------- | |
| 200 | -[tool.pytest.ini_options] | |
| 201 | -testpaths = ["tests"] | |
| 202 | -addopts = [ | |
| 203 | - "-ra", | |
| 204 | - "-m", "not slow and not gpu and not online", | |
| 205 | -] | |
| 206 | -markers = [ | |
| 207 | - "slow: expensive; deselected by default", | |
| 208 | - "gpu: requires CUDA; skipped on CPU/MPS runners", | |
| 209 | - "online: touches the network; skipped in offline CI", | |
| 210 | -] | |
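How a test opts into the deselected-by-default buckets configured above; re-select with e.g. `pytest -m slow`. The test name is hypothetical:

```python
import pytest

@pytest.mark.slow
def test_full_battery_on_real_adapter() -> None:
    """Deselected by default via addopts; runs under `pytest -m slow`."""
```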
sway/src/dlm_sway/__init__.py (deleted) @@ -1,42 +0,0 @@
| 1 | -"""dlm-sway — differential testing for fine-tuned causal language models.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from dlm_sway.core.errors import ( | |
| 6 | - BackendNotAvailableError, | |
| 7 | - ProbeError, | |
| 8 | - SpecValidationError, | |
| 9 | - SwayError, | |
| 10 | -) | |
| 11 | -from dlm_sway.core.model import LoadedModel, Model, ModelSpec | |
| 12 | -from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict | |
| 13 | -from dlm_sway.core.scoring import ( | |
| 14 | - DifferentialBackend, | |
| 15 | - NullCalibratedBackend, | |
| 16 | - RollingLogprob, | |
| 17 | - ScalableDifferentialBackend, | |
| 18 | - ScoringBackend, | |
| 19 | - TokenDist, | |
| 20 | -) | |
| 21 | - | |
| 22 | -__all__ = [ | |
| 23 | - "BackendNotAvailableError", | |
| 24 | - "DifferentialBackend", | |
| 25 | - "LoadedModel", | |
| 26 | - "Model", | |
| 27 | - "ModelSpec", | |
| 28 | - "NullCalibratedBackend", | |
| 29 | - "ProbeError", | |
| 30 | - "ProbeResult", | |
| 31 | - "RollingLogprob", | |
| 32 | - "ScalableDifferentialBackend", | |
| 33 | - "ScoringBackend", | |
| 34 | - "SpecValidationError", | |
| 35 | - "SuiteResult", | |
| 36 | - "SwayError", | |
| 37 | - "SwayScore", | |
| 38 | - "TokenDist", | |
| 39 | - "Verdict", | |
| 40 | -] | |
| 41 | - | |
| 42 | -__version__ = "0.1.0.dev0" | |
sway/src/dlm_sway/backends/__init__.py (deleted) @@ -1,118 +0,0 @@
| 1 | -"""Scoring backends: HuggingFace (``hf``), MLX (``mlx``), dummy, custom. | |
| 2 | - | |
| 3 | -Backends are constructed from a :class:`~dlm_sway.core.model.ModelSpec` | |
| 4 | -via :func:`build`. Heavy backends (HF, MLX) import their framework only | |
| 5 | -on construction so ``import dlm_sway`` stays cheap for users who only | |
| 6 | -touch the dummy backend or the spec loader. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from __future__ import annotations | |
| 10 | - | |
| 11 | -from pathlib import Path | |
| 12 | -from typing import TYPE_CHECKING | |
| 13 | - | |
| 14 | -from dlm_sway.core.errors import SpecValidationError | |
| 15 | -from dlm_sway.core.model import ModelSpec | |
| 16 | - | |
| 17 | -if TYPE_CHECKING: | |
| 18 | - from dlm_sway.core.scoring import DifferentialBackend | |
| 19 | - | |
| 20 | - | |
| 21 | -def build(base_spec: ModelSpec, *, adapter_path: Path | None = None) -> DifferentialBackend: | |
| 22 | - """Materialize a differential backend from a model spec. | |
| 23 | - | |
| 24 | - The adapter path typically comes from ``ft.adapter`` in the spec — | |
| 25 | - it's lifted to a keyword here so the same function can be used for | |
| 26 | - "differential" (base + adapter on one loaded model) or future | |
| 27 | - split-load paths. | |
| 28 | - """ | |
| 29 | - effective_adapter = adapter_path if adapter_path is not None else base_spec.adapter | |
| 30 | - | |
| 31 | - if base_spec.kind == "dummy": | |
| 32 | - # Dummy backend isn't really about the spec — it's for tests | |
| 33 | - # that pre-populate responses. Surface a loud error if someone | |
| 34 | - # tries to build it through the normal path. | |
| 35 | - raise SpecValidationError( | |
| 36 | - "kind='dummy' backends must be constructed directly via " | |
| 37 | - "DummyDifferentialBackend(base=..., ft=...); they cannot be " | |
| 38 | - "materialized from a ModelSpec." | |
| 39 | - ) | |
| 40 | - | |
| 41 | - if base_spec.kind == "hf": | |
| 42 | - if effective_adapter is None: | |
| 43 | - raise SpecValidationError( | |
| 44 | - "hf backend requires an adapter path (set `adapter:` on the ft model)" | |
| 45 | - ) | |
| 46 | - from dlm_sway.backends.hf import HuggingFaceDifferentialBackend | |
| 47 | - | |
| 48 | - return HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter) | |
| 49 | - | |
| 50 | - if base_spec.kind == "mlx": | |
| 51 | - if effective_adapter is None: | |
| 52 | - raise SpecValidationError( | |
| 53 | - "mlx backend requires an adapter path (set `adapter:` on the ft model; " | |
| 54 | - "must be an MLX .npz adapter — use dlm's peft→mlx converter if needed)" | |
| 55 | - ) | |
| 56 | - from dlm_sway.backends.mlx import MLXDifferentialBackend | |
| 57 | - | |
| 58 | - return MLXDifferentialBackend(base_spec=base_spec, adapter_path=effective_adapter) | |
| 59 | - | |
| 60 | - if base_spec.kind == "custom": | |
| 61 | - return _load_custom(base_spec, effective_adapter) | |
| 62 | - | |
| 63 | - raise SpecValidationError(f"unknown backend kind: {base_spec.kind!r}") | |
| 64 | - | |
| 65 | - | |
| 66 | -def _load_custom(base_spec: ModelSpec, adapter: Path | None) -> DifferentialBackend: | |
| 67 | - """Dispatch to a user-supplied backend via ``entry_point='pkg.mod:Name'``. | |
| 68 | - | |
| 69 | - The imported class is instantiated as ``Cls(base_spec=..., adapter_path=...)`` | |
| 70 | - — the same signature as :class:`dlm_sway.backends.hf.HuggingFaceDifferentialBackend` | |
| 71 | - so authors can model their implementation on the built-in. The | |
| 72 | - result is runtime-checked against :class:`DifferentialBackend` so | |
| 73 | - protocol violations fail at construction, not deep inside a probe. | |
| 74 | - """ | |
| 75 | - from dlm_sway.core.scoring import DifferentialBackend as DiffBackend | |
| 76 | - | |
| 77 | - entry = base_spec.entry_point | |
| 78 | - if not entry: | |
| 79 | - raise SpecValidationError( | |
| 80 | - "kind='custom' requires an entry_point of the form 'pkg.module:ClassName'" | |
| 81 | - ) | |
| 82 | - if ":" not in entry: | |
| 83 | - raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}") | |
| 84 | - module_path, _, class_name = entry.partition(":") | |
| 85 | - if not module_path or not class_name: | |
| 86 | - raise SpecValidationError(f"entry_point must be 'pkg.module:ClassName', got {entry!r}") | |
| 87 | - | |
| 88 | - import importlib | |
| 89 | - | |
| 90 | - try: | |
| 91 | - module = importlib.import_module(module_path) | |
| 92 | - except ImportError as exc: | |
| 93 | - raise SpecValidationError( | |
| 94 | - f"custom backend: cannot import module {module_path!r}: {exc}" | |
| 95 | - ) from exc | |
| 96 | - cls = getattr(module, class_name, None) | |
| 97 | - if cls is None: | |
| 98 | - raise SpecValidationError( | |
| 99 | - f"custom backend: module {module_path!r} has no attribute {class_name!r}" | |
| 100 | - ) | |
| 101 | - | |
| 102 | - try: | |
| 103 | - instance = cls(base_spec=base_spec, adapter_path=adapter) | |
| 104 | - except TypeError as exc: | |
| 105 | - raise SpecValidationError( | |
| 106 | - f"custom backend {entry!r} constructor signature mismatch: {exc}. " | |
| 107 | - "Expected Cls(base_spec: ModelSpec, adapter_path: Path | None)" | |
| 108 | - ) from exc | |
| 109 | - | |
| 110 | - if not isinstance(instance, DiffBackend): | |
| 111 | - raise SpecValidationError( | |
| 112 | - f"custom backend {entry!r} does not satisfy DifferentialBackend " | |
| 113 | - "(needs as_base() and as_finetuned() context managers)" | |
| 114 | - ) | |
| 115 | - return instance | |
| 116 | - | |
| 117 | - | |
| 118 | -__all__ = ["build"] | |
sway/src/dlm_sway/backends/dummy.py (deleted) @@ -1,257 +0,0 @@
| 1 | -"""In-memory backend for unit tests. | |
| 2 | - | |
| 3 | -Deterministic, torchless, and trivially fast. Tests pass canned responses | |
| 4 | -and canned score tables keyed by ``(mode, prompt, completion)``. The same | |
| 5 | -backend instance serves as both ``as_base`` and ``as_finetuned`` — it | |
| 6 | -switches an internal mode flag. | |
| 7 | - | |
| 8 | -Use it to drive every probe's unit test without loading a real model. | |
| 9 | -For integration tests against a real PEFT adapter, see | |
| 10 | -:class:`~dlm_sway.backends.hf.HuggingFaceDifferentialBackend`. | |
| 11 | -""" | |
| 12 | - | |
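A minimal usage sketch of the canned backend defined below; prompts and logprobs are toy numbers:

```python
from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses

base = DummyResponses(logprobs={("2+2=", " 4"): -4.0})
ft = DummyResponses(logprobs={("2+2=", " 4"): -0.5})
backend = DummyDifferentialBackend(base=base, ft=ft)

with backend.as_base() as m:
    lp_base = m.logprob_of("2+2=", " 4")   # -4.0
with backend.as_finetuned() as m:
    lp_ft = m.logprob_of("2+2=", " 4")     # -0.5
with backend.as_scaled_adapter(0.5) as m:
    lp_mid = m.logprob_of("2+2=", " 4")    # linear blend: 0.5*-4.0 + 0.5*-0.5 = -2.25
print(lp_ft - lp_base, lp_mid)
```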
| 13 | -from __future__ import annotations | |
| 14 | - | |
| 15 | -import math | |
| 16 | -from collections.abc import Iterator | |
| 17 | -from contextlib import contextmanager | |
| 18 | -from dataclasses import dataclass, field | |
| 19 | -from typing import Literal | |
| 20 | - | |
| 21 | -import numpy as np | |
| 22 | - | |
| 23 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | |
| 24 | - | |
| 25 | -Mode = Literal["base", "ft"] | |
| 26 | - | |
| 27 | - | |
| 28 | -@dataclass(slots=True) | |
| 29 | -class DummyResponses: | |
| 30 | - """Canned data for one mode (base or ft). | |
| 31 | - | |
| 32 | - Callers populate one of these per mode and hand both to | |
| 33 | - :class:`DummyDifferentialBackend`. | |
| 34 | - """ | |
| 35 | - | |
| 36 | - generations: dict[str, str] = field(default_factory=dict) | |
| 37 | - """Prompt → canned completion. Lookup is exact-match.""" | |
| 38 | - logprobs: dict[tuple[str, str], float] = field(default_factory=dict) | |
| 39 | - """``(prompt, completion) → sum logprob``. Default ``-10.0`` if missing.""" | |
| 40 | - rolling: dict[str, RollingLogprob] = field(default_factory=dict) | |
| 41 | - """Text → canned :class:`RollingLogprob`.""" | |
| 42 | - token_dists: dict[str, TokenDist] = field(default_factory=dict) | |
| 43 | - """Prompt → canned :class:`TokenDist`.""" | |
| 44 | - | |
| 45 | - | |
| 46 | -class _DummyView: | |
| 47 | - """The per-mode view yielded by ``as_base`` / ``as_finetuned``. | |
| 48 | - | |
| 49 | - Implements :class:`~dlm_sway.core.model.Model` *and* | |
| 50 | - :class:`~dlm_sway.core.scoring.ScoringBackend` — i.e. the | |
| 51 | - ``ScoringModel`` intersection. | |
| 52 | - """ | |
| 53 | - | |
| 54 | - def __init__(self, mode: Mode, responses: DummyResponses) -> None: | |
| 55 | - self.id = mode | |
| 56 | - self._mode: Mode = mode | |
| 57 | - self._r = responses | |
| 58 | - | |
| 59 | - # -- Model --------------------------------------------------------- | |
| 60 | - def generate( | |
| 61 | - self, | |
| 62 | - prompt: str, | |
| 63 | - *, | |
| 64 | - max_new_tokens: int, | |
| 65 | - temperature: float = 0.0, | |
| 66 | - top_p: float = 1.0, | |
| 67 | - seed: int = 0, | |
| 68 | - ) -> str: | |
| 69 | - del max_new_tokens, temperature, top_p, seed # canned; decoding is trivial. | |
| 70 | - try: | |
| 71 | - return self._r.generations[prompt] | |
| 72 | - except KeyError as exc: | |
| 73 | - raise KeyError( | |
| 74 | - f"dummy backend ({self._mode}): no canned generation for prompt {prompt!r}" | |
| 75 | - ) from exc | |
| 76 | - | |
| 77 | - def close(self) -> None: | |
| 78 | - return None | |
| 79 | - | |
| 80 | - # -- ScoringBackend ------------------------------------------------ | |
| 81 | - def logprob_of(self, prompt: str, completion: str) -> float: | |
| 82 | - return self._r.logprobs.get((prompt, completion), -10.0) | |
| 83 | - | |
| 84 | - def rolling_logprob(self, text: str) -> RollingLogprob: | |
| 85 | - if text in self._r.rolling: | |
| 86 | - return self._r.rolling[text] | |
| 87 | - # Synthesize a plausible rolling logprob so probes that just | |
| 88 | - # want a non-trivial value work without per-text configuration. | |
| 89 | - tokens = text.split() | |
| 90 | - n = max(len(tokens), 1) | |
| 91 | - per_tok = -2.0 if self._mode == "base" else -1.5 | |
| 92 | - return RollingLogprob( | |
| 93 | - token_ids=np.arange(n, dtype=np.int64), | |
| 94 | - logprobs=np.full(max(n - 1, 0), per_tok, dtype=np.float32), | |
| 95 | - num_tokens=n, | |
| 96 | - total_logprob=per_tok * max(n - 1, 0), | |
| 97 | - ) | |
| 98 | - | |
| 99 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | |
| 100 | - del top_k | |
| 101 | - if prompt in self._r.token_dists: | |
| 102 | - return self._r.token_dists[prompt] | |
| 103 | - # Synthesize a sharp base / broad ft distribution so divergence | |
| 104 | - # probes see a non-zero signal without hand-rolled data. | |
| 105 | - vocab = 1000 | |
| 106 | - k = 8 | |
| 107 | - if self._mode == "base": | |
| 108 | - lp = np.array([-0.1] + [-5.0] * (k - 1), dtype=np.float32) | |
| 109 | - else: | |
| 110 | - # More uniform mass across the top-k tokens. | |
| 111 | - lp = np.full(k, -math.log(k), dtype=np.float32) | |
| 112 | - return TokenDist( | |
| 113 | - token_ids=np.arange(k, dtype=np.int64), | |
| 114 | - logprobs=lp, | |
| 115 | - vocab_size=vocab, | |
| 116 | - tail_logprob=math.log1p(-float(np.exp(lp).sum())) if np.exp(lp).sum() < 1 else 0.0, | |
| 117 | - ) | |
| 118 | - | |
| 119 | - | |
| 120 | -class _NullView(_DummyView): | |
| 121 | - """A dummy view that perturbs the base distribution with seeded noise. | |
| 122 | - | |
| 123 | - Used by :meth:`DummyDifferentialBackend.as_null_adapter`. The | |
| 124 | - perturbation is small (matches an ``init_scale=0.02`` adapter) so | |
| 125 | - the null-vs-base divergence stays well below real-adapter territory | |
| 126 | - in probe tests. | |
| 127 | - """ | |
| 128 | - | |
| 129 | - def __init__(self, base_responses: DummyResponses, seed: int, init_scale: float) -> None: | |
| 130 | - super().__init__("base", base_responses) | |
| 131 | - self._seed = seed | |
| 132 | - self._init_scale = init_scale | |
| 133 | - | |
| 134 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | |
| 135 | - base_dist = super().next_token_dist(prompt, top_k=top_k) | |
| 136 | - rng = np.random.default_rng([self._seed, *prompt.encode()])  # avoid salted str hash(): not stable across runs | |
| 137 | - noise = rng.normal(0.0, self._init_scale, size=base_dist.logprobs.shape).astype(np.float32) | |
| 138 | - new_lp = base_dist.logprobs + noise | |
| 139 | - # Re-normalize (within the top-k slice) so a valid distribution comes back. | |
| 140 | - max_lp = new_lp.max() | |
| 141 | - new_probs = np.exp(new_lp - max_lp) | |
| 142 | - new_probs /= new_probs.sum() | |
| 143 | - return TokenDist( | |
| 144 | - token_ids=base_dist.token_ids, | |
| 145 | - logprobs=np.log(new_probs).astype(np.float32), | |
| 146 | - vocab_size=base_dist.vocab_size, | |
| 147 | - tail_logprob=base_dist.tail_logprob, | |
| 148 | - ) | |
| 149 | - | |
| 150 | - | |
| 151 | -class _InterpolatedView(_DummyView): | |
| 152 | - """A dummy view where logits/dists are a lam-blend of base and ft. | |
| 153 | - | |
| 154 | - Used by :meth:`DummyDifferentialBackend.as_scaled_adapter`. | |
| 155 | - Generation falls back to the ft view at lam>=0.5, base otherwise — | |
| 156 | - rounded because the dummy backend's generations are canned strings | |
| 157 | - with no notion of "how much". | |
| 158 | - """ | |
| 159 | - | |
| 160 | - def __init__( | |
| 161 | - self, | |
| 162 | - base_responses: DummyResponses, | |
| 163 | - ft_responses: DummyResponses, | |
| 164 | - lam: float, | |
| 165 | - ) -> None: | |
| 166 | - super().__init__( | |
| 167 | - "ft" if lam >= 0.5 else "base", ft_responses if lam >= 0.5 else base_responses | |
| 168 | - ) | |
| 169 | - self._base_r = base_responses | |
| 170 | - self._ft_r = ft_responses | |
| 171 | - self._lam = lam | |
| 172 | - | |
| 173 | - def logprob_of(self, prompt: str, completion: str) -> float: | |
| 174 | - base_v = self._base_r.logprobs.get((prompt, completion), -10.0) | |
| 175 | - ft_v = self._ft_r.logprobs.get((prompt, completion), -10.0) | |
| 176 | - return (1 - self._lam) * base_v + self._lam * ft_v | |
| 177 | - | |
| 178 | - def next_token_dist(self, prompt: str, *, top_k: int = 256): # type: ignore[no-untyped-def] | |
| 179 | - base_dist = _DummyView("base", self._base_r).next_token_dist(prompt, top_k=top_k) | |
| 180 | - ft_dist = _DummyView("ft", self._ft_r).next_token_dist(prompt, top_k=top_k) | |
| 181 | - # Both dists are on the same synthetic support when unseeded; blend | |
| 182 | - # their logprobs via log-space linear interpolation, which is a | |
| 183 | - # log-linear "tempered" mix and keeps normalization close enough. | |
| 184 | - lam = self._lam | |
| 185 | - blended_lp = (1 - lam) * base_dist.logprobs + lam * ft_dist.logprobs | |
| 186 | - return type(base_dist)( | |
| 187 | - token_ids=base_dist.token_ids, | |
| 188 | - logprobs=blended_lp, | |
| 189 | - vocab_size=base_dist.vocab_size, | |
| 190 | - tail_logprob=base_dist.tail_logprob, | |
| 191 | - ) | |
| 192 | - | |
| 193 | - | |
| 194 | -class DummyDifferentialBackend: | |
| 195 | - """Dummy implementation of | |
| 196 | - :class:`~dlm_sway.core.scoring.DifferentialBackend`. | |
| 197 | - | |
| 198 | - Construction takes one :class:`DummyResponses` per mode. The two | |
| 199 | - modes are mutually exclusive — the backend enforces that callers | |
| 200 | - exit one view before entering the other, catching bugs in probes | |
| 201 | - that hold a stale view across a toggle. | |
| 202 | - | |
| 203 | - Also implements | |
| 204 | - :class:`~dlm_sway.core.scoring.ScalableDifferentialBackend` with a | |
| 205 | - linear-blend between base and ft responses, so probes that need | |
| 206 | - ``as_scaled_adapter`` (N2 AdapterAblation) are unit-testable. | |
| 207 | - """ | |
| 208 | - | |
| 209 | - def __init__(self, *, base: DummyResponses, ft: DummyResponses) -> None: | |
| 210 | - self._base_r = base | |
| 211 | - self._ft_r = ft | |
| 212 | - self._base = _DummyView("base", base) | |
| 213 | - self._ft = _DummyView("ft", ft) | |
| 214 | - self._active: str | None = None | |
| 215 | - | |
| 216 | - @contextmanager | |
| 217 | - def as_base(self) -> Iterator[_DummyView]: | |
| 218 | - self._enter("base") | |
| 219 | - try: | |
| 220 | - yield self._base | |
| 221 | - finally: | |
| 222 | - self._exit() | |
| 223 | - | |
| 224 | - @contextmanager | |
| 225 | - def as_finetuned(self) -> Iterator[_DummyView]: | |
| 226 | - self._enter("ft") | |
| 227 | - try: | |
| 228 | - yield self._ft | |
| 229 | - finally: | |
| 230 | - self._exit() | |
| 231 | - | |
| 232 | - @contextmanager | |
| 233 | - def as_scaled_adapter(self, lam: float) -> Iterator[_DummyView]: | |
| 234 | - self._enter(f"scaled({lam})") | |
| 235 | - try: | |
| 236 | - yield _InterpolatedView(self._base_r, self._ft_r, lam) | |
| 237 | - finally: | |
| 238 | - self._exit() | |
| 239 | - | |
| 240 | - @contextmanager | |
| 241 | - def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_DummyView]: | |
| 242 | - self._enter(f"null({seed})") | |
| 243 | - try: | |
| 244 | - yield _NullView(self._base_r, seed=seed, init_scale=init_scale) | |
| 245 | - finally: | |
| 246 | - self._exit() | |
| 247 | - | |
| 248 | - def _enter(self, mode: str) -> None: | |
| 249 | - if self._active is not None: | |
| 250 | - raise RuntimeError( | |
| 251 | - f"DifferentialBackend view already active ({self._active!r}); " | |
| 252 | - f"exit the current view before entering {mode!r}." | |
| 253 | - ) | |
| 254 | - self._active = mode | |
| 255 | - | |
| 256 | - def _exit(self) -> None: | |
| 257 | - self._active = None | |
sway/src/dlm_sway/backends/hf.py (deleted) @@ -1,375 +0,0 @@
| 1 | -"""HuggingFace + PEFT differential backend. | |
| 2 | - | |
| 3 | -Loads the base once, attaches the LoRA adapter once, and toggles between | |
| 4 | -"base" and "fine-tuned" views on the same module via PEFT's | |
| 5 | -:meth:`~peft.PeftModel.disable_adapter` / :meth:`~peft.PeftModel.set_adapter`. | |
| 6 | - | |
| 7 | -This is the single most important backend in sway. Every numeric probe | |
| 8 | -benefits from the shared-weights toggle — memory is halved compared to | |
| 9 | -loading two copies, and KV-cache layouts stay aligned so pairwise KL math | |
| 10 | -is straight-forward. | |
| 11 | - | |
| 12 | -Heavy imports (``torch``, ``transformers``, ``peft``) are deferred until | |
| 13 | -``HuggingFaceDifferentialBackend`` is actually instantiated so | |
| 14 | -``import dlm_sway`` stays light for users of the dummy backend or spec | |
| 15 | -validation. | |
| 16 | -""" | |
| 17 | - | |
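Driving this backend directly looks roughly like the sketch below; probe code normally goes through `dlm_sway.backends.build`. The `ModelSpec` field names are inferred from the spec examples and loader code, and the model name and adapter path are illustrative:

```python
from pathlib import Path

from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
from dlm_sway.core.model import ModelSpec

spec = ModelSpec(kind="hf", base="HuggingFaceTB/SmolLM2-135M-Instruct")
backend = HuggingFaceDifferentialBackend(base_spec=spec, adapter_path=Path("./runs/adapter/v0003"))

prompt, completion = "The Dunning-Kruger effect describes", " a cognitive bias where"
with backend.as_base() as m:
    lp_base = m.logprob_of(prompt, completion)
with backend.as_finetuned() as m:
    lp_ft = m.logprob_of(prompt, completion)
print(f"adapter lift: {lp_ft - lp_base:+.3f} nats")
backend.close()
```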
| 18 | -from __future__ import annotations | |
| 19 | - | |
| 20 | -from collections.abc import Iterator | |
| 21 | -from contextlib import contextmanager | |
| 22 | -from dataclasses import dataclass | |
| 23 | -from pathlib import Path | |
| 24 | -from typing import TYPE_CHECKING, Any, Literal | |
| 25 | - | |
| 26 | -import numpy as np | |
| 27 | - | |
| 28 | -from dlm_sway.core.errors import BackendNotAvailableError, ProbeError | |
| 29 | -from dlm_sway.core.model import ModelSpec | |
| 30 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | |
| 31 | - | |
| 32 | -if TYPE_CHECKING: | |
| 33 | - from transformers import PreTrainedModel, PreTrainedTokenizerBase | |
| 34 | - | |
| 35 | - | |
| 36 | -Device = Literal["cuda", "mps", "cpu"] | |
| 37 | - | |
| 38 | - | |
| 39 | -def _detect_device() -> Device: | |
| 40 | - try: | |
| 41 | - import torch | |
| 42 | - except ImportError as exc: | |
| 43 | - raise BackendNotAvailableError("hf", extra="hf") from exc | |
| 44 | - if torch.cuda.is_available(): | |
| 45 | - return "cuda" | |
| 46 | - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): | |
| 47 | - return "mps" | |
| 48 | - return "cpu" | |
| 49 | - | |
| 50 | - | |
| 51 | -def _resolve_dtype(requested: str, device: Device) -> Any: | |
| 52 | - """Map the user's ``dtype`` preference to a torch dtype.""" | |
| 53 | - import torch # noqa: PLC0415 — lazy | |
| 54 | - | |
| 55 | - if requested == "fp16": | |
| 56 | - return torch.float16 | |
| 57 | - if requested == "bf16": | |
| 58 | - return torch.bfloat16 | |
| 59 | - if requested == "fp32": | |
| 60 | - return torch.float32 | |
| 61 | - # auto: bf16 on CUDA (Ampere+); fp16 on MPS; fp32 on CPU for numerical stability. | |
| 62 | - if device == "cuda" and torch.cuda.is_bf16_supported(): | |
| 63 | - return torch.bfloat16 | |
| 64 | - if device == "mps": | |
| 65 | - return torch.float16 | |
| 66 | - return torch.float32 | |
| 67 | - | |
| 68 | - | |
| 69 | -def _require_hf() -> tuple[Any, Any, Any]: | |
| 70 | - """Import torch + transformers + peft, raising a friendly error if missing.""" | |
| 71 | - try: | |
| 72 | - import torch | |
| 73 | - import transformers | |
| 74 | - except ImportError as exc: | |
| 75 | - raise BackendNotAvailableError("hf", extra="hf") from exc | |
| 76 | - try: | |
| 77 | - import peft | |
| 78 | - except ImportError as exc: | |
| 79 | - raise BackendNotAvailableError( | |
| 80 | - "hf", extra="hf", hint="peft is required for the adapter toggle." | |
| 81 | - ) from exc | |
| 82 | - return torch, transformers, peft | |
| 83 | - | |
| 84 | - | |
| 85 | -# --- the view object ------------------------------------------------------ | |
| 86 | - | |
| 87 | - | |
| 88 | -@dataclass(slots=True) | |
| 89 | -class _HFView: | |
| 90 | - """One side (base or ft) of a :class:`HuggingFaceDifferentialBackend`. | |
| 91 | - | |
| 92 | - Both sides reuse the same underlying module; the difference is | |
| 93 | - whether the adapter is active. | |
| 94 | - """ | |
| 95 | - | |
| 96 | - id: str | |
| 97 | - _model: Any | |
| 98 | - _tokenizer: Any | |
| 99 | - _device: str | |
| 100 | - _pad_token_id: int | |
| 101 | - | |
| 102 | - # -- Model --------------------------------------------------------- | |
| 103 | - def generate( | |
| 104 | - self, | |
| 105 | - prompt: str, | |
| 106 | - *, | |
| 107 | - max_new_tokens: int, | |
| 108 | - temperature: float = 0.0, | |
| 109 | - top_p: float = 1.0, | |
| 110 | - seed: int = 0, | |
| 111 | - ) -> str: | |
| 112 | - import torch | |
| 113 | - | |
| 114 | - torch.manual_seed(seed) | |
| 115 | - inputs = self._tokenizer(prompt, return_tensors="pt").to(self._device) | |
| 116 | - do_sample = temperature > 0.0 | |
| 117 | - gen_kwargs: dict[str, Any] = { | |
| 118 | - "max_new_tokens": max_new_tokens, | |
| 119 | - "do_sample": do_sample, | |
| 120 | - "pad_token_id": self._pad_token_id, | |
| 121 | - } | |
| 122 | - if do_sample: | |
| 123 | - gen_kwargs["temperature"] = temperature | |
| 124 | - gen_kwargs["top_p"] = top_p | |
| 125 | - with torch.inference_mode(): | |
| 126 | - out_ids = self._model.generate(**inputs, **gen_kwargs) | |
| 127 | - new_tokens = out_ids[0, inputs["input_ids"].shape[1] :] | |
| 128 | - return str(self._tokenizer.decode(new_tokens, skip_special_tokens=True)) | |
| 129 | - | |
| 130 | - def close(self) -> None: | |
| 131 | - return None | |
| 132 | - | |
| 133 | - # -- ScoringBackend ------------------------------------------------ | |
| 134 | - def logprob_of(self, prompt: str, completion: str) -> float: | |
| 135 | - import torch | |
| 136 | - import torch.nn.functional as F | |
| 137 | - | |
| 138 | - prompt_ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device) | |
| 139 | - full_ids = self._tokenizer(prompt + completion, return_tensors="pt").input_ids.to( | |
| 140 | - self._device | |
| 141 | - ) | |
| 142 | - if full_ids.shape[1] <= prompt_ids.shape[1]: | |
| 143 | - raise ProbeError( | |
| 144 | - "logprob_of", | |
| 145 | - f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})", | |
| 146 | - ) | |
| 147 | - target_ids = full_ids[:, prompt_ids.shape[1] :] | |
| 148 | - with torch.inference_mode(): | |
| 149 | - logits = self._model(full_ids).logits # (1, T, V) | |
| 150 | - # Align: logit at position t predicts token at t+1. We want | |
| 151 | - # predictions for the completion slice. | |
| 152 | - shift_logits = logits[:, prompt_ids.shape[1] - 1 : -1, :] # (1, C, V) | |
| 153 | - log_probs = F.log_softmax(shift_logits.float(), dim=-1) | |
| 154 | - gathered = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1) | |
| 155 | - return float(gathered.sum().item()) | |
| 156 | - | |
| 157 | - def rolling_logprob(self, text: str) -> RollingLogprob: | |
| 158 | - import torch | |
| 159 | - import torch.nn.functional as F | |
| 160 | - | |
| 161 | - ids = self._tokenizer(text, return_tensors="pt").input_ids.to(self._device) | |
| 162 | - if ids.shape[1] < 2: | |
| 163 | - return RollingLogprob( | |
| 164 | - token_ids=ids[0].cpu().numpy().astype(np.int64), | |
| 165 | - logprobs=np.array([], dtype=np.float32), | |
| 166 | - num_tokens=int(ids.shape[1]), | |
| 167 | - total_logprob=0.0, | |
| 168 | - ) | |
| 169 | - with torch.inference_mode(): | |
| 170 | - logits = self._model(ids).logits # (1, T, V) | |
| 171 | - log_probs = F.log_softmax(logits[:, :-1].float(), dim=-1) # predicts tokens 1..T | |
| 172 | - gathered = log_probs.gather(-1, ids[:, 1:].unsqueeze(-1)).squeeze(-1).squeeze(0) | |
| 173 | - return RollingLogprob( | |
| 174 | - token_ids=ids[0].cpu().numpy().astype(np.int64), | |
| 175 | - logprobs=gathered.cpu().numpy().astype(np.float32), | |
| 176 | - num_tokens=int(ids.shape[1]), | |
| 177 | - total_logprob=float(gathered.sum().item()), | |
| 178 | - ) | |
| 179 | - | |
| 180 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | |
| 181 | - import torch | |
| 182 | - import torch.nn.functional as F | |
| 183 | - | |
| 184 | - ids = self._tokenizer(prompt, return_tensors="pt").input_ids.to(self._device) | |
| 185 | - with torch.inference_mode(): | |
| 186 | - logits = self._model(ids).logits[:, -1, :] # (1, V) | |
| 187 | - log_probs = F.log_softmax(logits.float(), dim=-1).squeeze(0) | |
| 188 | - k = min(top_k, int(log_probs.shape[0])) | |
| 189 | - top = torch.topk(log_probs, k=k) | |
| 190 | - tail_mass = float(1.0 - torch.exp(top.values).sum().item()) | |
| 191 | - tail_logprob = float(np.log(max(tail_mass, 1e-12))) if tail_mass > 1e-12 else 0.0 | |
| 192 | - return TokenDist( | |
| 193 | - token_ids=top.indices.cpu().numpy().astype(np.int64), | |
| 194 | - logprobs=top.values.cpu().numpy().astype(np.float32), | |
| 195 | - vocab_size=int(log_probs.shape[0]), | |
| 196 | - tail_logprob=tail_logprob, | |
| 197 | - ) | |
| 198 | - | |
| 199 | - | |
| 200 | -# --- the backend ----------------------------------------------------------- | |
| 201 | - | |
| 202 | - | |
| 203 | -class HuggingFaceDifferentialBackend: | |
| 204 | - """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for HF+PEFT. | |
| 205 | - | |
| 206 | - The adapter toggle relies on | |
| 207 | - :meth:`peft.PeftModel.disable_adapter` producing a context where the | |
| 208 | - forward pass skips the LoRA deltas, and | |
| 209 | - :meth:`peft.PeftModel.set_adapter` (or just exiting the disable | |
| 210 | - context) re-enabling them. A dedicated sanity test asserts that | |
| 211 | - these actually change logits on a fixture. | |
| 212 | - """ | |
| 213 | - | |
| 214 | - def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None: | |
| 215 | - torch, transformers, peft = _require_hf() | |
| 216 | - self._torch = torch | |
| 217 | - self._spec = base_spec | |
| 218 | - self._adapter_path = Path(adapter_path).expanduser().resolve() | |
| 219 | - | |
| 220 | - device_str: Device = ( | |
| 221 | - _detect_device() if base_spec.device == "auto" else base_spec.device # type: ignore[assignment] | |
| 222 | - ) | |
| 223 | - self._device: str = device_str | |
| 224 | - dtype = _resolve_dtype(base_spec.dtype, device_str) | |
| 225 | - | |
| 226 | - tokenizer = transformers.AutoTokenizer.from_pretrained( | |
| 227 | - str(self._adapter_path) | |
| 228 | - if (self._adapter_path / "tokenizer_config.json").exists() | |
| 229 | - else base_spec.base, | |
| 230 | - trust_remote_code=base_spec.trust_remote_code, | |
| 231 | - ) | |
| 232 | - if tokenizer.pad_token_id is None: | |
| 233 | - tokenizer.pad_token = tokenizer.eos_token | |
| 234 | - | |
| 235 | - base_model = transformers.AutoModelForCausalLM.from_pretrained( | |
| 236 | - base_spec.base, | |
| 237 | - torch_dtype=dtype, | |
| 238 | - trust_remote_code=base_spec.trust_remote_code, | |
| 239 | - ) | |
| 240 | - base_model.to(self._device) | |
| 241 | - peft_model = peft.PeftModel.from_pretrained( | |
| 242 | - base_model, | |
| 243 | - str(self._adapter_path), | |
| 244 | - is_trainable=False, | |
| 245 | - ) | |
| 246 | - peft_model.eval() | |
| 247 | - | |
| 248 | - self._tokenizer: PreTrainedTokenizerBase = tokenizer | |
| 249 | - self._peft_model: PreTrainedModel = peft_model | |
| 250 | - self._pad_token_id: int = int(tokenizer.pad_token_id) | |
| 251 | - self._active: str | None = None | |
| 252 | - | |
| 253 | - # -- DifferentialBackend ------------------------------------------- | |
| 254 | - | |
| 255 | - @contextmanager | |
| 256 | - def as_base(self) -> Iterator[_HFView]: | |
| 257 | - self._enter("base") | |
| 258 | - try: | |
| 259 | - # peft.PeftModel.disable_adapter is a context manager; mypy | |
| 260 | - # mis-reads it as a Tensor on this transformers version. | |
| 261 | - with self._peft_model.disable_adapter(): # type: ignore[operator] | |
| 262 | - yield self._make_view("base") | |
| 263 | - finally: | |
| 264 | - self._exit() | |
| 265 | - | |
| 266 | - @contextmanager | |
| 267 | - def as_finetuned(self) -> Iterator[_HFView]: | |
| 268 | - self._enter("ft") | |
| 269 | - try: | |
| 270 | - yield self._make_view("ft") | |
| 271 | - finally: | |
| 272 | - self._exit() | |
| 273 | - | |
| 274 | - @contextmanager | |
| 275 | - def as_scaled_adapter(self, lam: float) -> Iterator[_HFView]: | |
| 276 | - """Temporarily multiply every LoRA layer's scaling factor by ``lam``. | |
| 277 | - | |
| 278 | - Works by walking the PEFT module tree and mutating each | |
| 279 | - ``LoraLayer.scaling[adapter_name]`` in place. The original | |
| 280 | - scalings are restored when the context exits — or when an | |
| 281 | - exception propagates, to keep the model in a sane state. | |
| 282 | - """ | |
| 283 | - self._enter(f"scaled({lam})") | |
| 284 | - # ``module`` is dynamic (peft LoraLayer subclass) — Any avoids | |
| 285 | - # mypy treating its ``.scaling`` as a Tensor when peft is loaded. | |
| 286 | - saved: list[tuple[Any, str, float]] = [] | |
| 287 | - try: | |
| 288 | - import peft # noqa: PLC0415 — already a hard dep of this backend | |
| 289 | - | |
| 290 | - lora_cls = getattr(peft.tuners.lora, "LoraLayer", None) | |
| 291 | - if lora_cls is None: | |
| 292 | - raise RuntimeError("peft.tuners.lora.LoraLayer not found; check peft>=0.13 pin") | |
| 293 | - for module in self._peft_model.modules(): | |
| 294 | - if not isinstance(module, lora_cls): | |
| 295 | - continue | |
| 296 | - scaling = getattr(module, "scaling", None) | |
| 297 | - if not isinstance(scaling, dict): | |
| 298 | - continue | |
| 299 | - for key, original in scaling.items(): | |
| 300 | - saved.append((module, key, float(original))) | |
| 301 | - scaling[key] = float(original) * lam | |
| 302 | - yield self._make_view(f"scaled_{lam:.2f}") | |
| 303 | - finally: | |
| 304 | - for module, key, original in saved: | |
| 305 | - module.scaling[key] = original | |
| 306 | - self._exit() | |
| 307 | - | |
| 308 | - @contextmanager | |
| 309 | - def as_null_adapter(self, seed: int, *, init_scale: float = 0.02) -> Iterator[_HFView]: | |
| 310 | - """Temporarily replace every LoRA ``A``/``B`` tensor with random noise. | |
| 311 | - | |
| 312 | - Same rank, alpha, and target modules as the real adapter — only | |
| 313 | - the weights differ. This is the denominator in every z-score | |
| 314 | - path: "how much signal does structural noise produce?" | |
| 315 | - | |
| 316 | - Implementation walks the PEFT module tree for ``lora_A``/``lora_B`` | |
| 317 | - parameters, saves a clone of each current value, overwrites in | |
| 318 | - place with a zero-mean Gaussian at ``init_scale``, and restores | |
| 319 | - on exit (including on exception). | |
| 320 | - """ | |
| 321 | - import torch | |
| 322 | - | |
| 323 | - self._enter(f"null({seed})") | |
| 324 | - gen = torch.Generator(device="cpu").manual_seed(int(seed)) | |
| 325 | - saved: list[tuple[torch.nn.Parameter, torch.Tensor]] = [] | |
| 326 | - try: | |
| 327 | - for pname, param in self._peft_model.named_parameters(): | |
| 328 | - if not any(key in pname for key in ("lora_A", "lora_B")): | |
| 329 | - continue | |
| 330 | - saved.append((param, param.detach().clone())) | |
| 331 | - with torch.no_grad(): | |
| 332 | - noise = torch.randn( | |
| 333 | - *param.shape, | |
| 334 | - generator=gen, | |
| 335 | - dtype=torch.float32, | |
| 336 | - ).to(dtype=param.dtype, device=param.device) | |
| 337 | - param.copy_(noise * init_scale) | |
| 338 | - yield self._make_view(f"null_{seed}") | |
| 339 | - finally: | |
| 340 | - with torch.no_grad(): | |
| 341 | - for param, original in saved: | |
| 342 | - param.copy_(original) | |
| 343 | - self._exit() | |
| 344 | - | |
| 345 | - def close(self) -> None: | |
| 346 | - """Release GPU memory. Safe to call more than once.""" | |
| 347 | - if getattr(self, "_peft_model", None) is not None: | |
| 348 | - del self._peft_model | |
| 349 | - if self._torch.cuda.is_available(): | |
| 350 | - self._torch.cuda.empty_cache() | |
| 351 | - | |
| 352 | - # -- internals ----------------------------------------------------- | |
| 353 | - | |
| 354 | - def _make_view(self, mode: str) -> _HFView: | |
| 355 | - return _HFView( | |
| 356 | - id=mode, | |
| 357 | - _model=self._peft_model, | |
| 358 | - _tokenizer=self._tokenizer, | |
| 359 | - _device=self._device, | |
| 360 | - _pad_token_id=self._pad_token_id, | |
| 361 | - ) | |
| 362 | - | |
| 363 | - def _enter(self, mode: str) -> None: | |
| 364 | - if self._active is not None: | |
| 365 | - raise RuntimeError( | |
| 366 | - f"HuggingFaceDifferentialBackend view {self._active!r} already active; " | |
| 367 | - f"exit it before entering {mode!r}." | |
| 368 | - ) | |
| 369 | - self._active = mode | |
| 370 | - | |
| 371 | - def _exit(self) -> None: | |
| 372 | - self._active = None | |
| 373 | - | |
| 374 | - | |
| 375 | -__all__ = ["HuggingFaceDifferentialBackend"] | |
sway/src/dlm_sway/backends/mlx.py (deleted) @@ -1,205 +0,0 @@
| 1 | -"""MLX backend for Apple Silicon (darwin-arm64). | |
| 2 | - | |
| 3 | -Partial implementation covering the common case: a PEFT adapter that's | |
| 4 | -already been converted to MLX's ``.npz`` format. Unlike the HF backend, | |
| 5 | -MLX has no runtime ``disable_adapter`` context — adapters get fused into | |
| 6 | -the linear layers at load time — so this backend keeps **both** a base | |
| 7 | -model and an adapted model in memory. Fine for the small (<3B) models | |
| 8 | -MLX is typically used with on Apple Silicon; document the cost clearly. | |
| 9 | - | |
| 10 | -If users point this backend at raw PEFT safetensors, ``mlx_lm.load`` | |
| 11 | -will refuse them with its own error. A future milestone can wire a | |
| 12 | -PEFT-→-MLX converter; for now the contract is "bring your own .npz". | |
| 13 | -""" | |
| 14 | - | |
| 15 | -from __future__ import annotations | |
| 16 | - | |
| 17 | -from collections.abc import Iterator | |
| 18 | -from contextlib import contextmanager | |
| 19 | -from dataclasses import dataclass | |
| 20 | -from pathlib import Path | |
| 21 | -from typing import TYPE_CHECKING, Any | |
| 22 | - | |
| 23 | -import numpy as np | |
| 24 | - | |
| 25 | -from dlm_sway.core.errors import BackendNotAvailableError, ProbeError | |
| 26 | -from dlm_sway.core.model import ModelSpec | |
| 27 | -from dlm_sway.core.scoring import RollingLogprob, TokenDist | |
| 28 | - | |
| 29 | -if TYPE_CHECKING: | |
| 30 | - pass | |
| 31 | - | |
| 32 | - | |
| 33 | -def _require_mlx() -> tuple[Any, Any]: | |
| 34 | - try: | |
| 35 | - import mlx.core as mx | |
| 36 | - import mlx_lm | |
| 37 | - except ImportError as exc: | |
| 38 | - raise BackendNotAvailableError( | |
| 39 | - "mlx", | |
| 40 | - extra="mlx", | |
| 41 | - hint="MLX backend needs mlx + mlx-lm on darwin-arm64.", | |
| 42 | - ) from exc | |
| 43 | - return mx, mlx_lm | |
| 44 | - | |
| 45 | - | |
| 46 | -@dataclass(slots=True) | |
| 47 | -class _MLXView: | |
| 48 | - """One side (base or ft) of the MLX backend. | |
| 49 | - | |
| 50 | - Both sides carry the same tokenizer (MLX stores it alongside the | |
| 51 | - converted model files, so sharing avoids double-loading). | |
| 52 | - """ | |
| 53 | - | |
| 54 | - id: str | |
| 55 | - _model: Any | |
| 56 | - _tokenizer: Any | |
| 57 | - | |
| 58 | - def generate( | |
| 59 | - self, | |
| 60 | - prompt: str, | |
| 61 | - *, | |
| 62 | - max_new_tokens: int, | |
| 63 | - temperature: float = 0.0, | |
| 64 | - top_p: float = 1.0, | |
| 65 | - seed: int = 0, | |
| 66 | - ) -> str: | |
| 67 | - del seed # mlx_lm.generate seeds via its own global state | |
| 68 | - _, mlx_lm = _require_mlx() | |
| 69 | - kwargs: dict[str, Any] = {"max_tokens": max_new_tokens, "verbose": False} | |
| 70 | - if temperature > 0.0: | |
| 71 | - kwargs["temp"] = temperature | |
| 72 | - kwargs["top_p"] = top_p | |
| 73 | - out = mlx_lm.generate(self._model, self._tokenizer, prompt=prompt, **kwargs) | |
| 74 | - return str(out) | |
| 75 | - | |
| 76 | - def close(self) -> None: | |
| 77 | - return None | |
| 78 | - | |
| 79 | - # -- ScoringBackend ------------------------------------------------ | |
| 80 | - | |
| 81 | - def _forward_logits(self, prompt: str) -> np.ndarray: | |
| 82 | - """Run the model once and return ``(seq_len, vocab)`` logits.""" | |
| 83 | - mx, _ = _require_mlx() | |
| 84 | - input_ids = self._tokenizer.encode(prompt) | |
| 85 | - tokens = mx.array(input_ids)[None, :] # (1, T) | |
| 86 | - out = self._model(tokens) | |
| 87 | - # mlx_lm models return an mx.array; convert to numpy for downstream math. | |
| 88 | - return np.asarray(out[0]) | |
| 89 | - | |
| 90 | - def logprob_of(self, prompt: str, completion: str) -> float: | |
| 91 | - input_ids = self._tokenizer.encode(prompt) | |
| 92 | - full_ids = self._tokenizer.encode(prompt + completion) | |
| 93 | - if len(full_ids) <= len(input_ids): | |
| 94 | - raise ProbeError( | |
| 95 | - "logprob_of", | |
| 96 | - f"completion tokenized to zero tokens (prompt={prompt!r}, completion={completion!r})", | |
| 97 | - ) | |
| 98 | - logits = self._forward_logits(prompt + completion) # (T, V) | |
| 99 | - # Position t predicts token t+1 — slice off the last row and the prompt span. | |
| 100 | - shift = logits[len(input_ids) - 1 : -1, :] | |
| 101 | - target_ids = np.asarray(full_ids[len(input_ids) :], dtype=np.int64) | |
| 102 | - log_probs = _log_softmax(shift.astype(np.float64), axis=-1) | |
| 103 | - gathered = log_probs[np.arange(len(target_ids)), target_ids] | |
| 104 | - return float(gathered.sum()) | |
| 105 | - | |
| 106 | - def rolling_logprob(self, text: str) -> RollingLogprob: | |
| 107 | - ids = self._tokenizer.encode(text) | |
| 108 | - if len(ids) < 2: | |
| 109 | - return RollingLogprob( | |
| 110 | - token_ids=np.asarray(ids, dtype=np.int64), | |
| 111 | - logprobs=np.array([], dtype=np.float32), | |
| 112 | - num_tokens=len(ids), | |
| 113 | - total_logprob=0.0, | |
| 114 | - ) | |
| 115 | - logits = self._forward_logits(text) | |
| 116 | - log_probs = _log_softmax(logits[:-1].astype(np.float64), axis=-1) | |
| 117 | - ids_arr = np.asarray(ids, dtype=np.int64) | |
| 118 | - gathered = log_probs[np.arange(len(ids) - 1), ids_arr[1:]] | |
| 119 | - return RollingLogprob( | |
| 120 | - token_ids=ids_arr, | |
| 121 | - logprobs=gathered.astype(np.float32), | |
| 122 | - num_tokens=len(ids), | |
| 123 | - total_logprob=float(gathered.sum()), | |
| 124 | - ) | |
| 125 | - | |
| 126 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | |
| 127 | - logits = self._forward_logits(prompt) | |
| 128 | - last_logits = logits[-1].astype(np.float64) | |
| 129 | - log_probs = _log_softmax(last_logits, axis=-1) | |
| 130 | - k = min(top_k, log_probs.shape[0]) | |
| 131 | - # np.argpartition for top-k then sort the partition. | |
| 132 | - part = np.argpartition(log_probs, -k)[-k:] | |
| 133 | - top_ids = part[np.argsort(log_probs[part])[::-1]] | |
| 134 | - top_lp = log_probs[top_ids] | |
| 135 | - tail_mass = float(1.0 - np.exp(top_lp).sum()) | |
| 136 | - tail_logprob = float(np.log(max(tail_mass, 1e-12))) if tail_mass > 1e-12 else 0.0 | |
| 137 | - return TokenDist( | |
| 138 | - token_ids=top_ids.astype(np.int64), | |
| 139 | - logprobs=top_lp.astype(np.float32), | |
| 140 | - vocab_size=int(log_probs.shape[0]), | |
| 141 | - tail_logprob=tail_logprob, | |
| 142 | - ) | |
| 143 | - | |
| 144 | - | |
| 145 | -class MLXDifferentialBackend: | |
| 146 | - """A :class:`~dlm_sway.core.scoring.DifferentialBackend` for MLX models. | |
| 147 | - | |
| 148 | - Loads two copies of the same base model — one bare, one with the | |
| 149 | -    adapter attached — because MLX has no runtime toggle. Memory cost: 2× | |
| 150 | - base weights. On typical Apple Silicon workloads with ≤3B models | |
| 151 | - this is acceptable. | |
| 152 | - """ | |
| 153 | - | |
| 154 | - def __init__(self, *, base_spec: ModelSpec, adapter_path: Path) -> None: | |
| 155 | - mx, mlx_lm = _require_mlx() | |
| 156 | - self._mx = mx | |
| 157 | - self._spec = base_spec | |
| 158 | - self._adapter_path = Path(adapter_path).expanduser().resolve() | |
| 159 | - | |
| 160 | - # Load bare base (no adapter). | |
| 161 | - self._base_model, self._tokenizer = mlx_lm.load(base_spec.base) | |
| 162 | - # Load ft with adapter attached. ``adapter_path`` is mlx_lm's kwarg. | |
| 163 | - self._ft_model, _ = mlx_lm.load(base_spec.base, adapter_path=str(self._adapter_path)) | |
| 164 | - self._active: str | None = None | |
| 165 | - | |
| 166 | - @contextmanager | |
| 167 | - def as_base(self) -> Iterator[_MLXView]: | |
| 168 | - self._enter("base") | |
| 169 | - try: | |
| 170 | - yield _MLXView(id="base", _model=self._base_model, _tokenizer=self._tokenizer) | |
| 171 | - finally: | |
| 172 | - self._exit() | |
| 173 | - | |
| 174 | - @contextmanager | |
| 175 | - def as_finetuned(self) -> Iterator[_MLXView]: | |
| 176 | - self._enter("ft") | |
| 177 | - try: | |
| 178 | - yield _MLXView(id="ft", _model=self._ft_model, _tokenizer=self._tokenizer) | |
| 179 | - finally: | |
| 180 | - self._exit() | |
| 181 | - | |
| 182 | - def close(self) -> None: | |
| 183 | - """MLX reclaims memory when references drop; nothing to do here.""" | |
| 184 | - return | |
| 185 | - | |
| 186 | - def _enter(self, mode: str) -> None: | |
| 187 | - if self._active is not None: | |
| 188 | - raise RuntimeError( | |
| 189 | - f"MLXDifferentialBackend view {self._active!r} already active; " | |
| 190 | - f"exit it before entering {mode!r}." | |
| 191 | - ) | |
| 192 | - self._active = mode | |
| 193 | - | |
| 194 | - def _exit(self) -> None: | |
| 195 | - self._active = None | |
| 196 | - | |
| 197 | - | |
| 198 | -def _log_softmax(x: np.ndarray, *, axis: int) -> np.ndarray: | |
| 199 | - x_max = np.max(x, axis=axis, keepdims=True) | |
| 200 | - y = x - x_max | |
| 201 | - log_sum = np.log(np.sum(np.exp(y), axis=axis, keepdims=True)) | |
| 202 | - return np.asarray(y - log_sum, dtype=np.float64) | |
| 203 | - | |
| 204 | - | |
| 205 | -__all__ = ["MLXDifferentialBackend"] | |
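The shift-by-one indexing in `logprob_of` (row *t* of the logits predicts token *t+1*) is the easiest place to go off by one. A minimal numpy sketch of just that slicing, with random logits standing in for a real forward pass (all values here are illustrative):

```python
import numpy as np

def log_softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    # Same max-subtraction trick as the module's _log_softmax.
    y = x - np.max(x, axis=axis, keepdims=True)
    return y - np.log(np.sum(np.exp(y), axis=axis, keepdims=True))

rng = np.random.default_rng(0)
full_ids = [3, 1, 4, 1, 5, 9]                  # prompt + completion token ids
prompt_len = 4                                 # first four ids are the prompt
logits = rng.normal(size=(len(full_ids), 16))  # stand-in for model output, (T, V)

# Rows prompt_len-1 .. T-2 are the positions that predict the completion tokens.
shift = logits[prompt_len - 1 : -1, :]
targets = np.asarray(full_ids[prompt_len:], dtype=np.int64)
lp = log_softmax(shift)[np.arange(len(targets)), targets]
print(float(lp.sum()))                         # completion logprob, in nats
```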
sway/src/dlm_sway/cli/__init__.pydeleted@@ -1,1 +0,0 @@ | ||
| 1 | -"""Command-line interface (entry point: ``dlm-sway``).""" | |
sway/src/dlm_sway/cli/app.pydeleted@@ -1,59 +0,0 @@ | ||
| 1 | -"""dlm-sway CLI entry point. | |
| 2 | - | |
| 3 | -``pip install dlm-sway`` installs this module's :func:`main` as the | |
| 4 | -``dlm-sway`` console script. Every subcommand is a thin wrapper around a | |
| 5 | -library-level function so the CLI surface mirrors what programmatic | |
| 6 | -callers get. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from __future__ import annotations | |
| 10 | - | |
| 11 | -import typer | |
| 12 | - | |
| 13 | -from dlm_sway import __version__ | |
| 14 | -from dlm_sway.cli import commands | |
| 15 | - | |
| 16 | -app = typer.Typer( | |
| 17 | - name="dlm-sway", | |
| 18 | - no_args_is_help=True, | |
| 19 | - add_completion=False, | |
| 20 | - help="Differential testing for fine-tuned causal language models.", | |
| 21 | -) | |
| 22 | - | |
| 23 | - | |
| 24 | -def _version_callback(value: bool) -> None: | |
| 25 | - if value: | |
| 26 | - typer.echo(f"dlm-sway {__version__}") | |
| 27 | - raise typer.Exit() | |
| 28 | - | |
| 29 | - | |
| 30 | -@app.callback() | |
| 31 | -def _root( | |
| 32 | - version: bool = typer.Option( # noqa: B008 — typer pattern | |
| 33 | - False, | |
| 34 | - "--version", | |
| 35 | - callback=_version_callback, | |
| 36 | - is_eager=True, | |
| 37 | - help="Print version and exit.", | |
| 38 | - ), | |
| 39 | -) -> None: | |
| 40 | - """Root callback; accepts ``--version``.""" | |
| 41 | - del version | |
| 42 | - | |
| 43 | - | |
| 44 | -app.command("run")(commands.run_cmd) | |
| 45 | -app.command("gate")(commands.gate_cmd) | |
| 46 | -app.command("check")(commands.check_cmd) | |
| 47 | -app.command("diff")(commands.diff_cmd) | |
| 48 | -app.command("autogen")(commands.autogen_cmd) | |
| 49 | -app.command("doctor")(commands.doctor_cmd) | |
| 50 | -app.command("report")(commands.report_cmd) | |
| 51 | - | |
| 52 | - | |
| 53 | -def main() -> None: | |
| 54 | - """Script entry point registered in :file:`pyproject.toml`.""" | |
| 55 | - app() | |
| 56 | - | |
| 57 | - | |
| 58 | -if __name__ == "__main__": | |
| 59 | - main() | |
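Because each subcommand is a thin wrapper, the app is also easy to drive in-process, e.g. from a test. A sketch using `typer.testing.CliRunner` (assumes the package is importable; the printed version is whatever `__version__` carries):

```python
from typer.testing import CliRunner

from dlm_sway.cli.app import app

runner = CliRunner()
result = runner.invoke(app, ["--version"])
assert result.exit_code == 0
print(result.output.strip())  # e.g. "dlm-sway 0.1.0.dev0"
```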
sway/src/dlm_sway/cli/commands.pydeleted@@ -1,396 +0,0 @@ | ||
| 1 | -"""Command implementations for the ``dlm-sway`` CLI. | |
| 2 | - | |
| 3 | -Each function here is wired to a subcommand in :mod:`dlm_sway.cli.app`. | |
| 4 | -Commands deliberately do as little as possible themselves — the real | |
| 5 | -work lives in :mod:`dlm_sway.suite`, :mod:`dlm_sway.backends`, and the | |
| 6 | -probes package. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from __future__ import annotations | |
| 10 | - | |
| 11 | -import json | |
| 12 | -import sys | |
| 13 | -from pathlib import Path | |
| 14 | -from typing import Annotated, Any | |
| 15 | - | |
| 16 | -import typer | |
| 17 | -from rich.console import Console | |
| 18 | - | |
| 19 | -from dlm_sway import __version__ | |
| 20 | -from dlm_sway.core.errors import SwayError | |
| 21 | -from dlm_sway.core.result import SuiteResult, SwayScore, Verdict | |
| 22 | - | |
| 23 | - | |
| 24 | -def run_cmd( | |
| 25 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | |
| 26 | - json_out: Annotated[ | |
| 27 | - Path | None, | |
| 28 | - typer.Option( | |
| 29 | - "--json", | |
| 30 | - "-j", | |
| 31 | - help="Write the JSON report to this path in addition to the terminal render.", | |
| 32 | - ), | |
| 33 | - ] = None, | |
| 34 | - markdown_out: Annotated[ | |
| 35 | - Path | None, | |
| 36 | - typer.Option("--markdown", "-m", help="Write a markdown report to this path."), | |
| 37 | - ] = None, | |
| 38 | -) -> None: | |
| 39 | - """Execute a suite and render a terminal report.""" | |
| 40 | - try: | |
| 41 | - result, score_obj = _execute_spec(spec) | |
| 42 | - except SwayError as exc: | |
| 43 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | |
| 44 | - raise typer.Exit(code=2) from exc | |
| 45 | - | |
| 46 | - from dlm_sway.suite import report | |
| 47 | - | |
| 48 | - console = Console() | |
| 49 | - report.to_terminal(result, score_obj, console=console) | |
| 50 | - | |
| 51 | - if json_out is not None: | |
| 52 | - json_out.write_text(report.to_json(result, score_obj), encoding="utf-8") | |
| 53 | - console.print(f"\n[dim]wrote JSON → {json_out}[/dim]") | |
| 54 | - if markdown_out is not None: | |
| 55 | - markdown_out.write_text(report.to_markdown(result, score_obj), encoding="utf-8") | |
| 56 | - console.print(f"[dim]wrote markdown → {markdown_out}[/dim]") | |
| 57 | - | |
| 58 | - | |
| 59 | -def gate_cmd( | |
| 60 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | |
| 61 | - junit_out: Annotated[ | |
| 62 | - Path | None, typer.Option("--junit", help="Write JUnit XML for CI ingestion.") | |
| 63 | - ] = None, | |
| 64 | - coverage_threshold: Annotated[ | |
| 65 | - float | None, | |
| 66 | - typer.Option( | |
| 67 | - "--threshold", | |
| 68 | - help="Override the spec's coverage_threshold. Exit non-zero below it.", | |
| 69 | - ), | |
| 70 | - ] = None, | |
| 71 | -) -> None: | |
| 72 | - """Execute a suite and exit non-zero on failure (CI gate).""" | |
| 73 | - try: | |
| 74 | - result, score_obj = _execute_spec(spec) | |
| 75 | - except SwayError as exc: | |
| 76 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | |
| 77 | - raise typer.Exit(code=2) from exc | |
| 78 | - | |
| 79 | - from dlm_sway.suite import report | |
| 80 | - from dlm_sway.suite.loader import load_spec as _load_spec | |
| 81 | - | |
| 82 | - console = Console() | |
| 83 | - report.to_terminal(result, score_obj, console=console) | |
| 84 | - | |
| 85 | - if junit_out is not None: | |
| 86 | - junit_out.write_text(report.to_junit(result, score_obj), encoding="utf-8") | |
| 87 | - console.print(f"[dim]wrote JUnit → {junit_out}[/dim]") | |
| 88 | - | |
| 89 | - threshold = ( | |
| 90 | - coverage_threshold | |
| 91 | - if coverage_threshold is not None | |
| 92 | - else _load_spec(spec).defaults.coverage_threshold | |
| 93 | - ) | |
| 94 | - has_failures = any(p.verdict == Verdict.FAIL for p in result.probes) | |
| 95 | - below_threshold = score_obj.overall < threshold | |
| 96 | - if has_failures or below_threshold: | |
| 97 | - console.print( | |
| 98 | - f"\n[red]gate FAILED[/red] — overall={score_obj.overall:.2f} < {threshold:.2f}" | |
| 99 | - if below_threshold | |
| 100 | - else "\n[red]gate FAILED[/red] — at least one probe reported FAIL" | |
| 101 | - ) | |
| 102 | - raise typer.Exit(code=1) | |
| 103 | - console.print(f"\n[green]gate passed[/green] — overall={score_obj.overall:.2f}") | |
| 104 | - | |
| 105 | - | |
| 106 | -def check_cmd( | |
| 107 | - adapter: Annotated[Path, typer.Argument(help="Path to a PEFT adapter directory.")], | |
| 108 | - base: Annotated[str, typer.Option("--base", help="HuggingFace base model id or local path.")], | |
| 109 | - prompts: Annotated[ | |
| 110 | - Path | None, | |
| 111 | - typer.Option( | |
| 112 | - "--prompts", | |
| 113 | - help="File with one prompt per line. Defaults to sway's built-in quick set.", | |
| 114 | - ), | |
| 115 | - ] = None, | |
| 116 | -) -> None: | |
| 117 | -    """Under-60-second smoke test: is this adapter doing anything at all? | |
| 118 | - | |
| 119 | - Runs A1 DeltaKL + C2 CalibrationDrift on a small prompt set. No | |
| 120 | - spec file required. | |
| 121 | - """ | |
| 122 | - from dlm_sway.backends import build as build_backend | |
| 123 | - from dlm_sway.core.model import ModelSpec | |
| 124 | - from dlm_sway.suite import report | |
| 125 | - from dlm_sway.suite.runner import run as run_suite | |
| 126 | - from dlm_sway.suite.score import compute as compute_score | |
| 127 | - from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec | |
| 128 | - | |
| 129 | - quick_prompts = _load_prompts(prompts) if prompts else _BUILTIN_QUICK_PROMPTS | |
| 130 | - | |
| 131 | - base_spec = ModelSpec(base=base, kind="hf") | |
| 132 | - ft_spec = ModelSpec(base=base, kind="hf", adapter=adapter) | |
| 133 | - spec = SwaySpec( | |
| 134 | - version=1, | |
| 135 | - models=SuiteModels(base=base_spec, ft=ft_spec), | |
| 136 | - defaults=SuiteDefaults(seed=0), | |
| 137 | - suite=[ | |
| 138 | - { | |
| 139 | - "name": "quick_delta_kl", | |
| 140 | - "kind": "delta_kl", | |
| 141 | - "prompts": list(quick_prompts), | |
| 142 | - "assert_mean_gte": 0.01, | |
| 143 | - }, | |
| 144 | - { | |
| 145 | - "name": "quick_calibration", | |
| 146 | - "kind": "calibration_drift", | |
| 147 | - "items_limit": 10, | |
| 148 | - }, | |
| 149 | - ], | |
| 150 | - ) | |
| 151 | - try: | |
| 152 | - backend = build_backend(ft_spec) | |
| 153 | - except SwayError as exc: | |
| 154 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | |
| 155 | - raise typer.Exit(code=2) from exc | |
| 156 | - | |
| 157 | - try: | |
| 158 | - result = run_suite(spec, backend, spec_path="<check>") | |
| 159 | - finally: | |
| 160 | - _close_if_possible(backend) | |
| 161 | - score_obj = compute_score(result) | |
| 162 | - report.to_terminal(result, score_obj, console=Console()) | |
| 163 | - | |
| 164 | - | |
| 165 | -def diff_cmd( | |
| 166 | - spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")], | |
| 167 | - adapter_a: Annotated[Path, typer.Option("--a", help="First adapter path.")], | |
| 168 | - adapter_b: Annotated[Path, typer.Option("--b", help="Second adapter path.")], | |
| 169 | -) -> None: | |
| 170 | - """Run the same suite against two adapters and show per-probe deltas.""" | |
| 171 | - from dlm_sway.backends import build as build_backend | |
| 172 | - from dlm_sway.suite.loader import load_spec | |
| 173 | - from dlm_sway.suite.runner import run as run_suite | |
| 174 | - from dlm_sway.suite.score import compute as compute_score | |
| 175 | - | |
| 176 | - sway_spec = load_spec(spec) | |
| 177 | - console = Console() | |
| 178 | - | |
| 179 | - def _score_for(adapter_path: Path) -> tuple[float, dict[str, float]]: | |
| 180 | - ft_spec = sway_spec.models.ft.model_copy(update={"adapter": adapter_path}) | |
| 181 | - backend = build_backend(ft_spec) | |
| 182 | - try: | |
| 183 | - result = run_suite(sway_spec, backend, spec_path=str(spec)) | |
| 184 | - finally: | |
| 185 | - _close_if_possible(backend) | |
| 186 | - scored = compute_score(result) | |
| 187 | - per_probe = {p.name: (p.score or 0.0) for p in result.probes} | |
| 188 | - return scored.overall, per_probe | |
| 189 | - | |
| 190 | - try: | |
| 191 | - overall_a, per_a = _score_for(adapter_a) | |
| 192 | - overall_b, per_b = _score_for(adapter_b) | |
| 193 | - except SwayError as exc: | |
| 194 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | |
| 195 | - raise typer.Exit(code=2) from exc | |
| 196 | - | |
| 197 | - console.print(f"[bold]overall[/bold] A: {overall_a:.2f} B: {overall_b:.2f}") | |
| 198 | - console.print() | |
| 199 | - console.print("[bold]per-probe[/bold] (A → B, Δ):") | |
| 200 | - for name in sorted(per_a.keys() | per_b.keys()): | |
| 201 | - a = per_a.get(name, 0.0) | |
| 202 | - b = per_b.get(name, 0.0) | |
| 203 | - delta = b - a | |
| 204 | -        # ``:+.2f`` already renders an explicit sign. | |
| 205 | -        console.print(f"  {name:<30} {a:.2f} → {b:.2f} ({delta:+.2f})") | |
| 206 | - | |
| 207 | - | |
| 208 | -def autogen_cmd( | |
| 209 | - dlm_path: Annotated[Path, typer.Argument(help="Path to a .dlm file.")], | |
| 210 | - out: Annotated[ | |
| 211 | - Path, | |
| 212 | - typer.Option("--out", "-o", help="Where to write the generated sway.yaml."), | |
| 213 | - ] = Path("sway.yaml"), | |
| 214 | -) -> None: | |
| 215 | - """Generate a sway.yaml from a .dlm file (requires dlm-sway[dlm]).""" | |
| 216 | - import importlib | |
| 217 | - | |
| 218 | - try: | |
| 219 | - autogen_mod = importlib.import_module("dlm_sway.integrations.dlm.autogen") | |
| 220 | - except ImportError as exc: | |
| 221 | - typer.secho( | |
| 222 | - "dlm integration not installed — run: pip install 'dlm-sway[dlm]'", | |
| 223 | - err=True, | |
| 224 | - fg=typer.colors.RED, | |
| 225 | - ) | |
| 226 | - raise typer.Exit(code=2) from exc | |
| 227 | - | |
| 228 | - try: | |
| 229 | - autogen_mod.write_sway_yaml(dlm_path, out) | |
| 230 | - except SwayError as exc: | |
| 231 | - typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED) | |
| 232 | - raise typer.Exit(code=2) from exc | |
| 233 | - | |
| 234 | - typer.echo(f"wrote {out}") | |
| 235 | - | |
| 236 | - | |
| 237 | -def doctor_cmd() -> None: | |
| 238 | - """Print backend availability and version info.""" | |
| 239 | - console = Console() | |
| 240 | - console.print(f"[bold]dlm-sway[/bold] {__version__}") | |
| 241 | - console.print(f" python: {sys.version.split()[0]}") | |
| 242 | - console.print(f" platform: {sys.platform}") | |
| 243 | - console.print() | |
| 244 | - | |
| 245 | - console.print("[bold]backends[/bold]") | |
| 246 | - console.print( | |
| 247 | - f" hf: {_probe_import('torch')} {_probe_import('transformers')} {_probe_import('peft')}" | |
| 248 | - ) | |
| 249 | - console.print(f" mlx: {_probe_import('mlx')} {_probe_import('mlx_lm')}") | |
| 250 | - console.print(f" semsim: {_probe_import('sentence_transformers')}") | |
| 251 | - console.print( | |
| 252 | - f" style+: {_probe_import('spacy')} {_probe_import('textstat')} {_probe_import('nlpaug')}" | |
| 253 | - ) | |
| 254 | - console.print(f" dlm: {_probe_import('dlm')}") | |
| 255 | - console.print(f" viz: {_probe_import('matplotlib')}") | |
| 256 | - | |
| 257 | - | |
| 258 | -def report_cmd( | |
| 259 | - result_json: Annotated[Path, typer.Argument(help="Path to a saved result JSON.")], | |
| 260 | - format: Annotated[ | |
| 261 | - str, typer.Option("--format", help="Output format: terminal, md, junit, json.") | |
| 262 | - ] = "terminal", | |
| 263 | -) -> None: | |
| 264 | - """Re-render a previously saved run (for history tracking / dashboards).""" | |
| 265 | - raw: dict[str, Any] = json.loads(result_json.read_text(encoding="utf-8")) | |
| 266 | - fmt = format.lower() | |
| 267 | - if fmt == "json": | |
| 268 | - typer.echo(json.dumps(raw, indent=2, sort_keys=True)) | |
| 269 | - return | |
| 270 | - if fmt in {"md", "markdown"}: | |
| 271 | - # A file-level re-render needs the dataclasses back; simplest is | |
| 272 | - # to synthesize a minimal markdown from the JSON directly. | |
| 273 | - typer.echo(_render_markdown_from_json(raw)) | |
| 274 | - return | |
| 275 | - if fmt == "junit": | |
| 276 | - typer.echo(_render_junit_from_json(raw)) | |
| 277 | - return | |
| 278 | - # Default: terminal-ish one-liner summary. | |
| 279 | - score: dict[str, Any] = raw.get("score", {}) | |
| 280 | - typer.echo(f"overall: {score.get('overall', 0.0):.2f} [{score.get('band', '?')}]") | |
| 281 | - probes: list[dict[str, Any]] = raw.get("probes", []) | |
| 282 | - for p in probes: | |
| 283 | - typer.echo( | |
| 284 | - f" {p['name']:<30} {p['verdict']:<6} " | |
| 285 | - f"{(p.get('score') or 0.0):.2f} {p.get('message', '')[:60]}" | |
| 286 | - ) | |
| 287 | - | |
| 288 | - | |
| 289 | -# -- helpers ----------------------------------------------------------- | |
| 290 | - | |
| 291 | - | |
| 292 | -_BUILTIN_QUICK_PROMPTS: tuple[str, ...] = ( | |
| 293 | - "The quick brown fox", | |
| 294 | - "Once upon a time", | |
| 295 | - "The answer to the question is", | |
| 296 | - "One important lesson is", | |
| 297 | - "In my opinion,", | |
| 298 | - "The first step is to", | |
| 299 | - "Remember that", | |
| 300 | - "A common mistake is", | |
| 301 | -) | |
| 302 | - | |
| 303 | - | |
| 304 | -def _load_prompts(path: Path) -> tuple[str, ...]: | |
| 305 | - return tuple( | |
| 306 | - line.strip() for line in path.read_text(encoding="utf-8").splitlines() if line.strip() | |
| 307 | - ) | |
| 308 | - | |
| 309 | - | |
| 310 | -def _execute_spec(path: Path) -> tuple[SuiteResult, SwayScore]: | |
| 311 | - """Load a spec, build a backend, run the suite, fold scores. Shared | |
| 312 | - by ``run`` and ``gate``. Picks up .dlm-derived sections when the | |
| 313 | - spec's ``dlm_source`` is set.""" | |
| 314 | - from dlm_sway.backends import build as build_backend | |
| 315 | - from dlm_sway.suite.loader import load_spec | |
| 316 | - from dlm_sway.suite.runner import run as run_suite | |
| 317 | - from dlm_sway.suite.score import compute as compute_score | |
| 318 | - | |
| 319 | - spec = load_spec(path) | |
| 320 | - sections = None | |
| 321 | - doc_text = None | |
| 322 | - if spec.dlm_source is not None: | |
| 323 | - import importlib | |
| 324 | - | |
| 325 | - try: | |
| 326 | - resolver = importlib.import_module("dlm_sway.integrations.dlm.resolver") | |
| 327 | - handle = resolver.resolve_dlm(Path(spec.dlm_source)) | |
| 328 | - sections = handle.sections | |
| 329 | - doc_text = handle.doc_text | |
| 330 | - except ImportError: | |
| 331 | - # Honoring dlm_source is best-effort — probes that need | |
| 332 | - # sections will SKIP with a pointer at the extra. | |
| 333 | - sections = None | |
| 334 | - backend = build_backend(spec.models.ft) | |
| 335 | - try: | |
| 336 | - result = run_suite(spec, backend, spec_path=str(path), sections=sections, doc_text=doc_text) | |
| 337 | - finally: | |
| 338 | - _close_if_possible(backend) | |
| 339 | - score_obj = compute_score(result) | |
| 340 | - return result, score_obj | |
| 341 | - | |
| 342 | - | |
| 343 | -def _close_if_possible(backend: object) -> None: | |
| 344 | - close = getattr(backend, "close", None) | |
| 345 | - if callable(close): | |
| 346 | - close() | |
| 347 | - | |
| 348 | - | |
| 349 | -def _probe_import(name: str) -> str: | |
| 350 | - import importlib | |
| 351 | - | |
| 352 | - try: | |
| 353 | - mod = importlib.import_module(name) | |
| 354 | - except ImportError: | |
| 355 | - return f"[red]{name}: missing[/red]" | |
| 356 | - ver = getattr(mod, "__version__", "installed") | |
| 357 | - return f"[green]{name}: {ver}[/green]" | |
| 358 | - | |
| 359 | - | |
| 360 | -def _render_markdown_from_json(raw: dict[str, Any]) -> str: | |
| 361 | - score: dict[str, Any] = raw.get("score", {}) | |
| 362 | - lines: list[str] = [ | |
| 363 | - "# dlm-sway report", | |
| 364 | - "", | |
| 365 | - f"**Overall:** {score.get('overall', 0.0):.2f} (`{score.get('band', '?')}`) ", | |
| 366 | - f"**Base:** `{raw.get('base_model_id', '?')}` ", | |
| 367 | - f"**Adapter:** `{raw.get('adapter_id', '?')}` ", | |
| 368 | - "", | |
| 369 | - "## Probes", | |
| 370 | - "", | |
| 371 | - "| name | kind | verdict | score |", | |
| 372 | - "|---|---|---|---:|", | |
| 373 | - ] | |
| 374 | - probes: list[dict[str, Any]] = raw.get("probes", []) | |
| 375 | - for p in probes: | |
| 376 | - lines.append( | |
| 377 | - f"| {p['name']} | `{p['kind']}` | {p['verdict']} | {(p.get('score') or 0.0):.2f} |" | |
| 378 | - ) | |
| 379 | - return "\n".join(lines) | |
| 380 | - | |
| 381 | - | |
| 382 | -def _render_junit_from_json(raw: dict[str, Any]) -> str: | |
| 383 | - """Minimal JUnit renderer from a saved JSON (useful for report --format junit).""" | |
| 384 | - import xml.etree.ElementTree as ET | |
| 385 | - | |
| 386 | - probes: list[dict[str, Any]] = raw.get("probes", []) | |
| 387 | - testsuite = ET.Element("testsuite", {"name": "dlm-sway", "tests": str(len(probes))}) | |
| 388 | - for p in probes: | |
| 389 | - tc = ET.SubElement(testsuite, "testcase", {"classname": p["kind"], "name": p["name"]}) | |
| 390 | - if p["verdict"] == "fail": | |
| 391 | - ET.SubElement(tc, "failure", {"message": p.get("message", "")}) | |
| 392 | - elif p["verdict"] == "error": | |
| 393 | - ET.SubElement(tc, "error", {"message": p.get("message", "")}) | |
| 394 | - elif p["verdict"] == "skip": | |
| 395 | - ET.SubElement(tc, "skipped", {"message": p.get("message", "")}) | |
| 396 | - return ET.tostring(testsuite, encoding="unicode") | |
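The verdict-to-element mapping in `_render_junit_from_json` is easiest to see against a concrete payload. A standalone, stdlib-only sketch with a hand-built (hypothetical) result dict:

```python
import xml.etree.ElementTree as ET

raw = {
    "probes": [
        {"name": "delta_kl_doc", "kind": "delta_kl", "verdict": "pass", "message": ""},
        {"name": "leak_check", "kind": "leakage", "verdict": "fail",
         "message": "LCS recall above threshold"},
    ]
}
suite = ET.Element("testsuite", {"name": "dlm-sway", "tests": str(len(raw["probes"]))})
for p in raw["probes"]:
    tc = ET.SubElement(suite, "testcase", {"classname": p["kind"], "name": p["name"]})
    if p["verdict"] == "fail":   # "error"/"skip" map to <error>/<skipped> analogously
        ET.SubElement(tc, "failure", {"message": p["message"]})
print(ET.tostring(suite, encoding="unicode"))
```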
sway/src/dlm_sway/core/__init__.pydeleted@@ -1,1 +0,0 @@ | ||
| 1 | -"""Core abstractions: protocols, results, errors, determinism.""" | |
sway/src/dlm_sway/core/determinism.pydeleted@@ -1,97 +0,0 @@ | ||
| 1 | -"""Deterministic-execution helper. | |
| 2 | - | |
| 3 | -Mirrors ``dlm.train.determinism.seed_everything`` so running the same | |
| 4 | -suite twice on the same host produces the same :class:`ProbeResult` | |
| 5 | -payloads. The dlm project treats determinism as a contract; sway takes | |
| 6 | -the same posture for scoring operations. | |
| 7 | - | |
| 8 | -Generation is allowed to use non-deterministic attention kernels when | |
| 9 | -``temperature > 0``, because a deterministic sampled generation is a | |
| 10 | -contradiction. Scoring (logprobs, rolling logprobs, next-token dists) | |
| 11 | -always runs under :func:`torch.use_deterministic_algorithms(True)`. | |
| 12 | -""" | |
| 13 | - | |
| 14 | -from __future__ import annotations | |
| 15 | - | |
| 16 | -import os | |
| 17 | -import random | |
| 18 | -from dataclasses import dataclass | |
| 19 | -from typing import Literal | |
| 20 | - | |
| 21 | -DeterminismClass = Literal["strict", "best_effort", "loose"] | |
| 22 | - | |
| 23 | - | |
| 24 | -@dataclass(frozen=True, slots=True) | |
| 25 | -class DeterminismSummary: | |
| 26 | - """What seeding actually accomplished, for logging in the report.""" | |
| 27 | - | |
| 28 | - class_: DeterminismClass | |
| 29 | - seed: int | |
| 30 | - notes: tuple[str, ...] = () | |
| 31 | - | |
| 32 | - | |
| 33 | -def seed_everything(seed: int, *, strict: bool = True) -> DeterminismSummary: | |
| 34 | - """Seed every RNG sway's probes touch and flip backend flags. | |
| 35 | - | |
| 36 | - Idempotent — safe to call repeatedly with the same seed. | |
| 37 | - | |
| 38 | - Parameters | |
| 39 | - ---------- | |
| 40 | - seed: | |
| 41 | - The seed. Callers typically use the value from ``sway.yaml``'s | |
| 42 | - ``defaults.seed`` (default 0). | |
| 43 | - strict: | |
| 44 | - If ``True`` (the default), request deterministic CUDA algorithms | |
| 45 | - and set ``CUBLAS_WORKSPACE_CONFIG``. Scoring probes need this; | |
| 46 | - generation-only runs can set it ``False``. | |
| 47 | - | |
| 48 | - Returns | |
| 49 | - ------- | |
| 50 | - :class:`DeterminismSummary` with a classification: | |
| 51 | - | |
| 52 | - - ``"strict"`` — deterministic algorithms active, no warnings. | |
| 53 | - - ``"best_effort"`` — platform doesn't support full determinism | |
| 54 | - (MPS, some CPU kernels). | |
| 55 | - - ``"loose"`` — seeded but deterministic algorithms refused. | |
| 56 | - """ | |
| 57 | - | |
| 58 | - notes: list[str] = [] | |
| 59 | - clazz: DeterminismClass = "best_effort" | |
| 60 | - | |
| 61 | - # Env vars must come first — torch reads them at cuBLAS init. | |
| 62 | - if strict: | |
| 63 | - os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8") | |
| 64 | - | |
| 65 | - random.seed(seed) | |
| 66 | - | |
| 67 | - # numpy is a hard dep; safe to seed unconditionally. | |
| 68 | - import numpy as np | |
| 69 | - | |
| 70 | - np.random.seed(seed) | |
| 71 | - | |
| 72 | - try: | |
| 73 | - import torch # noqa: PLC0415 — lazy: torch is an optional extra. | |
| 74 | - except ModuleNotFoundError: | |
| 75 | - notes.append("torch not installed; seeded python + numpy only") | |
| 76 | - return DeterminismSummary(class_="best_effort", seed=seed, notes=tuple(notes)) | |
| 77 | - | |
| 78 | - torch.manual_seed(seed) | |
| 79 | - if torch.cuda.is_available(): | |
| 80 | - torch.cuda.manual_seed_all(seed) | |
| 81 | - clazz = "strict" | |
| 82 | - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): | |
| 83 | - clazz = "best_effort" | |
| 84 | - notes.append("MPS: bit-identical across runs is best-effort") | |
| 85 | - else: | |
| 86 | - clazz = "best_effort" | |
| 87 | - notes.append("CPU-only backend: strict determinism depends on BLAS impl") | |
| 88 | - | |
| 89 | - if strict: | |
| 90 | - try: | |
| 91 | - torch.use_deterministic_algorithms(True, warn_only=True) | |
| 92 | - torch.backends.cudnn.benchmark = False | |
| 93 | - except Exception as exc: # noqa: BLE001 — torch raises a naked Exception | |
| 94 | - clazz = "loose" | |
| 95 | - notes.append(f"deterministic algorithms refused: {exc}") | |
| 96 | - | |
| 97 | - return DeterminismSummary(class_=clazz, seed=seed, notes=tuple(notes)) | |
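Usage is one call before any scoring work, with the returned summary logged into the report. A short sketch (what it prints depends on the host):

```python
from dlm_sway.core.determinism import seed_everything

summary = seed_everything(0, strict=True)
print(summary.class_)   # "strict" on CUDA; "best_effort" on MPS or CPU-only hosts
for note in summary.notes:
    print(" -", note)   # e.g. "CPU-only backend: strict determinism depends on BLAS impl"
```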
sway/src/dlm_sway/core/errors.pydeleted@@ -1,65 +0,0 @@ | ||
| 1 | -"""Exception hierarchy for dlm-sway. | |
| 2 | - | |
| 3 | -Every error sway raises inherits from :class:`SwayError` so callers can | |
| 4 | -catch the whole family with a single ``except``. Subclasses carry enough | |
| 5 | -context (spec paths, probe names, missing extras) for the CLI to render | |
| 6 | -actionable messages without the caller having to introspect an exception | |
| 7 | -chain. | |
| 8 | -""" | |
| 9 | - | |
| 10 | -from __future__ import annotations | |
| 11 | - | |
| 12 | - | |
| 13 | -class SwayError(Exception): | |
| 14 | - """Root of the dlm-sway exception hierarchy.""" | |
| 15 | - | |
| 16 | - | |
| 17 | -class SpecValidationError(SwayError): | |
| 18 | - """A ``sway.yaml`` (or equivalent) failed pydantic validation. | |
| 19 | - | |
| 20 | - Parameters | |
| 21 | - ---------- | |
| 22 | - message: | |
| 23 | - Human-readable summary of what went wrong. | |
| 24 | - source: | |
| 25 | - Path or identifier of the spec being validated, if known. | |
| 26 | - """ | |
| 27 | - | |
| 28 | - def __init__(self, message: str, *, source: str | None = None) -> None: | |
| 29 | - super().__init__(message) | |
| 30 | - self.source = source | |
| 31 | - | |
| 32 | - def __str__(self) -> str: | |
| 33 | - base = super().__str__() | |
| 34 | - return f"{self.source}: {base}" if self.source else base | |
| 35 | - | |
| 36 | - | |
| 37 | -class BackendNotAvailableError(SwayError): | |
| 38 | - """A requested backend's optional dependencies aren't installed. | |
| 39 | - | |
| 40 | - The CLI turns this into a pointed ``pip install dlm-sway[<extra>]`` | |
| 41 | - hint; programmatic callers can read :attr:`extra` directly. | |
| 42 | - """ | |
| 43 | - | |
| 44 | - def __init__(self, backend: str, *, extra: str, hint: str | None = None) -> None: | |
| 45 | - message = ( | |
| 46 | - f"backend {backend!r} unavailable — install the extra: pip install 'dlm-sway[{extra}]'" | |
| 47 | - ) | |
| 48 | - if hint: | |
| 49 | - message = f"{message}\n{hint}" | |
| 50 | - super().__init__(message) | |
| 51 | - self.backend = backend | |
| 52 | - self.extra = extra | |
| 53 | - | |
| 54 | - | |
| 55 | -class ProbeError(SwayError): | |
| 56 | - """A probe failed to *execute* (as opposed to failing its assertion). | |
| 57 | - | |
| 58 | - Distinct from a ``verdict=FAIL`` result — assertion failures are | |
| 59 | - normal and reported via :class:`ProbeResult`. This is for genuine | |
| 60 | - bugs: missing sections, mismatched tokenizers, NaN logits. | |
| 61 | - """ | |
| 62 | - | |
| 63 | - def __init__(self, probe: str, message: str) -> None: | |
| 64 | - super().__init__(f"probe {probe!r}: {message}") | |
| 65 | - self.probe = probe | |
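The single-`except` pattern the module docstring promises looks like this in practice; the raise below is synthetic, purely to show the machine-readable fields:

```python
from dlm_sway.core.errors import BackendNotAvailableError, SwayError

try:
    raise BackendNotAvailableError("mlx", extra="mlx")  # synthetic, for illustration
except SwayError as exc:
    # One handler catches the whole family; narrow only when fields are needed.
    if isinstance(exc, BackendNotAvailableError):
        print(exc.backend, exc.extra)  # enough to build a targeted pip-install hint
```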
sway/src/dlm_sway/core/model.pydeleted@@ -1,112 +0,0 @@ | ||
| 1 | -"""The :class:`Model` abstraction and :class:`ModelSpec` user-facing config. | |
| 2 | - | |
| 3 | -Probes operate on objects that satisfy :class:`Model` (for generation) | |
| 4 | -and :class:`~dlm_sway.core.scoring.ScoringBackend` (for logit-level | |
| 5 | -access). Backends return concrete instances of both — they are | |
| 6 | -deliberately separate Protocols because not every backend exposes logits | |
| 7 | -(e.g. an Ollama HTTP backend would implement ``Model`` but not | |
| 8 | -``ScoringBackend``). | |
| 9 | - | |
| 10 | -The user-facing surface is :class:`ModelSpec`, a pydantic model that | |
| 11 | -describes how to materialize a base + adapter pair. No ``.dlm`` | |
| 12 | -concepts live at this layer — those belong in | |
| 13 | -:mod:`dlm_sway.integrations.dlm`. | |
| 14 | -""" | |
| 15 | - | |
| 16 | -from __future__ import annotations | |
| 17 | - | |
| 18 | -from dataclasses import dataclass | |
| 19 | -from pathlib import Path | |
| 20 | -from typing import Any, Literal, Protocol, runtime_checkable | |
| 21 | - | |
| 22 | -from pydantic import BaseModel, ConfigDict, Field | |
| 23 | - | |
| 24 | -BackendKind = Literal["hf", "mlx", "dummy", "custom"] | |
| 25 | -"""Registered scoring-backend kinds. | |
| 26 | - | |
| 27 | -``custom`` is an escape hatch — the runner looks up an entry point when | |
| 28 | -it sees ``custom`` in a spec. | |
| 29 | -""" | |
| 30 | - | |
| 31 | - | |
| 32 | -class ModelSpec(BaseModel): | |
| 33 | - """How to materialize one model (base or fine-tuned).""" | |
| 34 | - | |
| 35 | - model_config = ConfigDict(extra="forbid", frozen=True) | |
| 36 | - | |
| 37 | - kind: BackendKind = "hf" | |
| 38 | - base: str | |
| 39 | - """HuggingFace repo id (``HuggingFaceTB/SmolLM2-135M-Instruct``) or | |
| 40 | - a local path to a model directory.""" | |
| 41 | - | |
| 42 | - adapter: Path | None = None | |
| 43 | - """Path to a PEFT adapter directory (containing ``adapter_config.json`` | |
| 44 | - and ``adapter_model.safetensors``). ``None`` → base-only model.""" | |
| 45 | - | |
| 46 | - dtype: Literal["auto", "fp16", "bf16", "fp32"] = "auto" | |
| 47 | - device: str = "auto" | |
| 48 | - """``"auto"`` chooses CUDA → MPS → CPU in that order.""" | |
| 49 | - | |
| 50 | - trust_remote_code: bool = False | |
| 51 | - """HuggingFace ``trust_remote_code`` passthrough. Off by default — | |
| 52 | - the user must opt in explicitly, matching sway's no-surprises | |
| 53 | - posture.""" | |
| 54 | - | |
| 55 | - entry_point: str | None = Field(default=None) | |
| 56 | - """Required when ``kind='custom'``. Import path like | |
| 57 | - ``mypkg.mybackend:MyBackend``.""" | |
| 58 | - | |
| 59 | - | |
| 60 | -@dataclass(frozen=True, slots=True) | |
| 61 | -class LoadedModel: | |
| 62 | - """A materialized model plus the tokenizer that produced it. | |
| 63 | - | |
| 64 | - Returned by backend ``load()`` methods. Probes usually don't touch | |
| 65 | - this directly — they go through the :class:`Model` / | |
| 66 | - :class:`~dlm_sway.core.scoring.ScoringBackend` Protocols. | |
| 67 | - """ | |
| 68 | - | |
| 69 | - id: str | |
| 70 | - """Stable handle: ``"base"`` or ``"ft"`` typically.""" | |
| 71 | - spec: ModelSpec | |
| 72 | - model: Any | |
| 73 | - """Framework-native handle (torch ``nn.Module``, MLX array module …). | |
| 74 | - | |
| 75 | - Typed as ``Any`` because the frameworks themselves ship unstubbed. | |
| 76 | - Backend implementations narrow this at their boundary.""" | |
| 77 | - tokenizer: Any | |
| 78 | - meta: dict[str, Any] | |
| 79 | - """Backend-captured metadata: device, dtype, adapter version, bytes | |
| 80 | - on disk, num trainable params. Surfaced in the suite report.""" | |
| 81 | - | |
| 82 | - | |
| 83 | -@runtime_checkable | |
| 84 | -class Model(Protocol): | |
| 85 | - """Minimum interface for text generation. | |
| 86 | - | |
| 87 | - Implemented by backend-wrapped model objects. Probes that need logits | |
| 88 | - also require :class:`~dlm_sway.core.scoring.ScoringBackend`. | |
| 89 | - """ | |
| 90 | - | |
| 91 | - id: str | |
| 92 | - | |
| 93 | - def generate( | |
| 94 | - self, | |
| 95 | - prompt: str, | |
| 96 | - *, | |
| 97 | - max_new_tokens: int, | |
| 98 | - temperature: float = 0.0, | |
| 99 | - top_p: float = 1.0, | |
| 100 | - seed: int = 0, | |
| 101 | - ) -> str: | |
| 102 | - """Generate a completion. | |
| 103 | - | |
| 104 | - Defaults (``temperature=0``, ``top_p=1``) are greedy-decode for | |
| 105 | - reproducibility. Callers wanting sampled output must pass | |
| 106 | - non-defaults *and* a seed. | |
| 107 | - """ | |
| 108 | - ... | |
| 109 | - | |
| 110 | - def close(self) -> None: | |
| 111 | - """Release any resources held by this model.""" | |
| 112 | - ... | |
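A sketch of declaring a base/fine-tuned pair and of the structural `Model` check (the repo id and adapter path are illustrative, not prescriptive):

```python
from pathlib import Path

from dlm_sway.core.model import Model, ModelSpec

base = ModelSpec(base="HuggingFaceTB/SmolLM2-135M-Instruct")
ft = base.model_copy(update={"adapter": Path("./adapter")})  # frozen model, so copy

class Echo:
    """Trivial generator; satisfies Model structurally, no inheritance needed."""
    id = "echo"
    def generate(self, prompt, *, max_new_tokens, temperature=0.0, top_p=1.0, seed=0):
        return prompt[:max_new_tokens]
    def close(self):
        return None

print(isinstance(Echo(), Model))  # True, thanks to @runtime_checkable
```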
sway/src/dlm_sway/core/result.pydeleted@@ -1,139 +0,0 @@ | ||
| 1 | -"""Probe and suite result types. | |
| 2 | - | |
| 3 | -Every numeric probe ultimately returns a :class:`ProbeResult`. The suite | |
| 4 | -runner collects them into a :class:`SuiteResult` and the scorer folds | |
| 5 | -that into a single :class:`SwayScore` with transparent per-component | |
| 6 | -weights. | |
| 7 | - | |
| 8 | -These dataclasses are deliberately plain — no pydantic — because they | |
| 9 | -cross probe/backend boundaries hundreds of times per run, and running | |
| 10 | -``model_validate`` on every construction would dominate the runtime of | |
| 11 | -cheap probes. | |
| 12 | -""" | |
| 13 | - | |
| 14 | -from __future__ import annotations | |
| 15 | - | |
| 16 | -from dataclasses import dataclass, field | |
| 17 | -from datetime import UTC, datetime | |
| 18 | -from enum import StrEnum | |
| 19 | -from typing import Any | |
| 20 | - | |
| 21 | - | |
| 22 | -class Verdict(StrEnum): | |
| 23 | - """Outcome of a single probe against its assertion.""" | |
| 24 | - | |
| 25 | - PASS = "pass" | |
| 26 | - FAIL = "fail" | |
| 27 | - WARN = "warn" | |
| 28 | - SKIP = "skip" | |
| 29 | - ERROR = "error" | |
| 30 | - | |
| 31 | - | |
| 32 | -@dataclass(frozen=True, slots=True) | |
| 33 | -class ProbeResult: | |
| 34 | - """The result of running one probe. | |
| 35 | - | |
| 36 | - Attributes | |
| 37 | - ---------- | |
| 38 | - name: | |
| 39 | - User-facing name from the spec (unique within a suite). | |
| 40 | - kind: | |
| 41 | - Probe discriminator (``delta_kl``, ``section_internalization`` …). | |
| 42 | - verdict: | |
| 43 | - Pass / fail / warn / skip / error. | |
| 44 | - score: | |
| 45 | - Normalized [0, 1] score. ``sigmoid(z_vs_null / 3)`` for numeric | |
| 46 | - probes; 1.0 / 0.0 for binary ones. ``None`` for :attr:`Verdict.SKIP`. | |
| 47 | - raw: | |
| 48 | - The raw metric value (e.g. KL=0.083). Probe-specific units. | |
| 49 | - z_score: | |
| 50 | - Standard deviations above the null-adapter baseline. ``None`` | |
| 51 | - when no null calibration was run. | |
| 52 | - base_value: | |
| 53 | - The metric evaluated on the base model, when meaningful. | |
| 54 | - ft_value: | |
| 55 | - The metric evaluated on the fine-tuned model, when meaningful. | |
| 56 | - evidence: | |
| 57 | - Small structured payload for the report — prompts, example | |
| 58 | - completions, per-section breakdowns. Kept bounded (<10 KB) so | |
| 59 | - suite JSON stays under a megabyte. | |
| 60 | - message: | |
| 61 | - One-line diagnostic. Surfaces in the terminal report. | |
| 62 | - duration_s: | |
| 63 | - Wall time to execute. | |
| 64 | - """ | |
| 65 | - | |
| 66 | - name: str | |
| 67 | - kind: str | |
| 68 | - verdict: Verdict | |
| 69 | - score: float | None | |
| 70 | - raw: float | None = None | |
| 71 | - z_score: float | None = None | |
| 72 | - base_value: float | None = None | |
| 73 | - ft_value: float | None = None | |
| 74 | - evidence: dict[str, Any] = field(default_factory=dict) | |
| 75 | - message: str = "" | |
| 76 | - duration_s: float = 0.0 | |
| 77 | - | |
| 78 | - | |
| 79 | -@dataclass(frozen=True, slots=True) | |
| 80 | -class SuiteResult: | |
| 81 | - """A full run of a sway.yaml suite.""" | |
| 82 | - | |
| 83 | - spec_path: str | |
| 84 | - started_at: datetime | |
| 85 | - finished_at: datetime | |
| 86 | - base_model_id: str | |
| 87 | - adapter_id: str | |
| 88 | - sway_version: str | |
| 89 | - probes: tuple[ProbeResult, ...] = () | |
| 90 | - null_stats: dict[str, dict[str, float]] = field(default_factory=dict) | |
| 91 | - """Per-primitive null-adapter baseline stats (mean, std, runs). Used | |
| 92 | - to turn raw metrics into z-scores when rendering the report.""" | |
| 93 | - | |
| 94 | - @property | |
| 95 | - def wall_seconds(self) -> float: | |
| 96 | - return (self.finished_at - self.started_at).total_seconds() | |
| 97 | - | |
| 98 | - | |
| 99 | -# Component weights for the composite score. Overridable in sway.yaml. | |
| 100 | -DEFAULT_COMPONENT_WEIGHTS: dict[str, float] = { | |
| 101 | - "adherence": 0.30, | |
| 102 | - "attribution": 0.35, | |
| 103 | - "calibration": 0.20, | |
| 104 | - "ablation": 0.15, | |
| 105 | -} | |
| 106 | - | |
| 107 | - | |
| 108 | -@dataclass(frozen=True, slots=True) | |
| 109 | -class SwayScore: | |
| 110 | - """Composite score with a transparent per-component breakdown.""" | |
| 111 | - | |
| 112 | - overall: float | |
| 113 | - components: dict[str, float] | |
| 114 | - weights: dict[str, float] = field(default_factory=lambda: dict(DEFAULT_COMPONENT_WEIGHTS)) | |
| 115 | - band: str = "" | |
| 116 | - findings: tuple[str, ...] = () | |
| 117 | - | |
| 118 | - @staticmethod | |
| 119 | - def band_for(overall: float) -> str: | |
| 120 | - """Map a score to a human-readable band. | |
| 121 | - | |
| 122 | - Bands (from the plan): | |
| 123 | - - <0.3 : indistinguishable from noise | |
| 124 | - - 0.3–0.6 : partial fit | |
| 125 | - - 0.6–0.85: healthy | |
| 126 | - - >0.85 : suspiciously good (possible overfit / memorization) | |
| 127 | - """ | |
| 128 | - if overall < 0.3: | |
| 129 | - return "noise" | |
| 130 | - if overall < 0.6: | |
| 131 | - return "partial" | |
| 132 | - if overall <= 0.85: | |
| 133 | - return "healthy" | |
| 134 | - return "suspicious" | |
| 135 | - | |
| 136 | - | |
| 137 | -def utcnow() -> datetime: | |
| 138 | - """Timezone-aware UTC timestamp (used by the runner).""" | |
| 139 | - return datetime.now(UTC) | |
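As a back-of-envelope check of the weights and bands, a hand-rolled weighted sum (component values here are invented; the real folding lives in `dlm_sway.suite.score`):

```python
from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS, SwayScore

components = {"adherence": 0.7, "attribution": 0.6,
              "calibration": 0.8, "ablation": 0.5}
overall = sum(DEFAULT_COMPONENT_WEIGHTS[k] * v for k, v in components.items())
print(round(overall, 3), SwayScore.band_for(overall))  # 0.655 healthy
```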
sway/src/dlm_sway/core/scoring.pydeleted@@ -1,203 +0,0 @@ | ||
| 1 | -"""Scoring protocols: logprobs, next-token distributions, differential toggling. | |
| 2 | - | |
| 3 | -Scoring is **separate** from generation because not every backend can | |
| 4 | -provide logits. Every numeric sway probe depends on at least one of | |
| 5 | -three operations: | |
| 6 | - | |
| 7 | -1. ``logprob_of(prompt, completion)`` — score a completion against a | |
| 8 | - prompt (A1, B2, B3, C2, …). | |
| 9 | -2. ``rolling_logprob(text)`` — perplexity over a piece of text (B1, | |
| 10 | - C2). | |
| 11 | -3. ``next_token_dist(prompt, top_k)`` — the raw next-token distribution | |
| 12 | - at a single position (A1, N2). | |
| 13 | - | |
| 14 | -The :class:`DifferentialBackend` is the key performance primitive: | |
| 15 | -both base and fine-tuned views share the same loaded weights and KV | |
| 16 | -cache layout, toggled via PEFT's :meth:`set_adapter` / | |
| 17 | -:meth:`disable_adapter`. A naive "load twice" implementation would | |
| 18 | -double memory and halve throughput. | |
| 19 | -""" | |
| 20 | - | |
| 21 | -from __future__ import annotations | |
| 22 | - | |
| 23 | -from contextlib import AbstractContextManager | |
| 24 | -from dataclasses import dataclass, field | |
| 25 | -from typing import Protocol, runtime_checkable | |
| 26 | - | |
| 27 | -import numpy as np | |
| 28 | -from numpy.typing import NDArray | |
| 29 | - | |
| 30 | -from dlm_sway.core.model import Model | |
| 31 | - | |
| 32 | - | |
| 33 | -@dataclass(frozen=True, slots=True) | |
| 34 | -class RollingLogprob: | |
| 35 | - """Per-token logprobs over a piece of text, plus summary stats. | |
| 36 | - | |
| 37 | - Attributes | |
| 38 | - ---------- | |
| 39 | - token_ids: | |
| 40 | - The tokenizer output for ``text``. Length ``N``. | |
| 41 | - logprobs: | |
| 42 | - ``log p(token_i | token_<i)`` for each position i ≥ 1. Length | |
| 43 | - ``N-1``. | |
| 44 | - num_tokens: | |
| 45 | - ``N`` — included for convenience; ``len(token_ids)``. | |
| 46 | - total_logprob: | |
| 47 | - Sum of :attr:`logprobs`. | |
| 48 | - """ | |
| 49 | - | |
| 50 | - token_ids: NDArray[np.int64] | |
| 51 | - logprobs: NDArray[np.float32] | |
| 52 | - num_tokens: int | |
| 53 | - total_logprob: float | |
| 54 | - | |
| 55 | - @property | |
| 56 | - def mean_logprob(self) -> float: | |
| 57 | - n = self.logprobs.size | |
| 58 | - return float(self.total_logprob / n) if n else 0.0 | |
| 59 | - | |
| 60 | - @property | |
| 61 | - def perplexity(self) -> float: | |
| 62 | - """``exp(-mean_logprob)``. Base-e, natural perplexity.""" | |
| 63 | - return float(np.exp(-self.mean_logprob)) | |
| 64 | - | |
| 65 | - | |
| 66 | -@dataclass(frozen=True, slots=True) | |
| 67 | -class TokenDist: | |
| 68 | - """A (possibly top-k truncated) next-token probability distribution. | |
| 69 | - | |
| 70 | - For KL / JS divergence probes sway needs matched distributions | |
| 71 | - across base and fine-tuned views. The runner is responsible for | |
| 72 | - aligning ``top_k`` token slices between two ``TokenDist`` objects | |
| 73 | - before handing them to divergence math. | |
| 74 | - """ | |
| 75 | - | |
| 76 | - token_ids: NDArray[np.int64] | |
| 77 | - """Token ids, descending by probability. Length ``k``.""" | |
| 78 | - logprobs: NDArray[np.float32] | |
| 79 | - """Log-probabilities for :attr:`token_ids`. Length ``k``.""" | |
| 80 | - vocab_size: int | |
| 81 | - """Full vocab size — needed to renormalize top-k truncated slices.""" | |
| 82 | - tail_logprob: float = field(default=0.0) | |
| 83 | - """log of (1 - sum of exp(logprobs[:k])); 0 if top_k covers the full vocab.""" | |
| 84 | - | |
| 85 | - | |
| 86 | -@runtime_checkable | |
| 87 | -class ScoringBackend(Protocol): | |
| 88 | - """Logit-level access to a loaded model.""" | |
| 89 | - | |
| 90 | - def logprob_of(self, prompt: str, completion: str) -> float: | |
| 91 | - """Sum of log-probabilities of ``completion`` tokens given ``prompt``. | |
| 92 | - | |
| 93 | - The prompt is *not* scored; only the completion contributes. The | |
| 94 | - value is in nats (natural log). Longer completions are | |
| 95 | - monotonically more negative — callers normalize by length if | |
| 96 | - they need a rate. | |
| 97 | - """ | |
| 98 | - ... | |
| 99 | - | |
| 100 | - def rolling_logprob(self, text: str) -> RollingLogprob: | |
| 101 | - """Compute per-token logprobs for the whole of ``text``. | |
| 102 | - | |
| 103 | - Equivalent to lm-eval's ``loglikelihood_rolling``. Used for | |
| 104 | - perplexity comparison on held-out content (B1 SIS, C2). | |
| 105 | - """ | |
| 106 | - ... | |
| 107 | - | |
| 108 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | |
| 109 | - """Next-token distribution at the position after ``prompt``. | |
| 110 | - | |
| 111 | - Truncated to ``top_k`` for memory; callers doing divergence math | |
| 112 | - over the top-k slice accept the (typically negligible) error vs | |
| 113 | - full-vocab KL. | |
| 114 | - """ | |
| 115 | - ... | |
| 116 | - | |
| 117 | - | |
| 118 | -@runtime_checkable | |
| 119 | -class DifferentialBackend(Protocol): | |
| 120 | - """A backend that holds base + fine-tuned views on a single loaded model. | |
| 121 | - | |
| 122 | - The idiomatic usage is:: | |
| 123 | - | |
| 124 | - with backend.as_base() as base_view: | |
| 125 | - p_base = base_view.next_token_dist(prompt) | |
| 126 | - with backend.as_finetuned() as ft_view: | |
| 127 | - p_ft = ft_view.next_token_dist(prompt) | |
| 128 | - | |
| 129 | - Implementations toggle PEFT adapters via | |
| 130 | - :meth:`peft.PeftModel.set_adapter` / :meth:`disable_adapter`. | |
| 131 | - | |
| 132 | -    Invariant: the two views must **not** be simultaneously usable. A | |
| 133 | - caller holding a ``base_view`` after entering the ``as_finetuned`` | |
| 134 | - context is a programmer error and implementations MUST detect and | |
| 135 | - raise. | |
| 136 | - """ | |
| 137 | - | |
| 138 | - def as_base(self) -> AbstractContextManager[_ScoringModel]: ... | |
| 139 | - | |
| 140 | - def as_finetuned(self) -> AbstractContextManager[_ScoringModel]: ... | |
| 141 | - | |
| 142 | - | |
| 143 | -@runtime_checkable | |
| 144 | -class ScalableDifferentialBackend(DifferentialBackend, Protocol): | |
| 145 | - """A differential backend that can also scale the LoRA additive term. | |
| 146 | - | |
| 147 | - LoRA applies ``W + (alpha/r) · B @ A`` to a base weight matrix. This | |
| 148 | - protocol exposes a context manager that temporarily multiplies that | |
| 149 | - additive term by ``lam`` for everything inside the ``with`` block. | |
| 150 | - | |
| 151 | - ``lam = 0.0`` is equivalent to :meth:`as_base`. | |
| 152 | - ``lam = 1.0`` is equivalent to :meth:`as_finetuned`. | |
| 153 | - ``lam = 1.25`` overshoots — useful for N2 AdapterAblation's | |
| 154 | - response-curve measurement. | |
| 155 | - | |
| 156 | - Only the HF backend ships an implementation in v0.1. Probes that | |
| 157 | - need scaling check via ``isinstance(backend, ScalableDifferentialBackend)`` | |
| 158 | - at runtime and SKIP gracefully when unavailable. | |
| 159 | - """ | |
| 160 | - | |
| 161 | - def as_scaled_adapter(self, lam: float) -> AbstractContextManager[_ScoringModel]: ... | |
| 162 | - | |
| 163 | - | |
| 164 | -@runtime_checkable | |
| 165 | -class NullCalibratedBackend(DifferentialBackend, Protocol): | |
| 166 | - """A differential backend that can produce a "null adapter" view. | |
| 167 | - | |
| 168 | - A null adapter has the *same structure* (rank, alpha, target modules) | |
| 169 | - as the real adapter but with weights drawn from a zero-mean Gaussian. | |
| 170 | - Running probes against this view yields the baseline "how much | |
| 171 | - signal does random noise produce" distribution — the denominator in | |
| 172 | - every numeric probe's z-score. | |
| 173 | - | |
| 174 | - The context manager takes a ``seed`` so calibration runs can be | |
| 175 | - reproduced and multiple independent null samples can be drawn to | |
| 176 | - estimate ``std``. | |
| 177 | - | |
| 178 | - Implementations MUST restore the real adapter on exit, including | |
| 179 | - on exceptions, so a caller can freely interleave null and real | |
| 180 | - calibrations within the same backend lifetime. | |
| 181 | - """ | |
| 182 | - | |
| 183 | - def as_null_adapter( | |
| 184 | - self, seed: int, *, init_scale: float = 0.02 | |
| 185 | - ) -> AbstractContextManager[_ScoringModel]: ... | |
| 186 | - | |
| 187 | - | |
| 188 | -# Helper Protocol for type-checking the yielded context object: it | |
| 189 | -# must satisfy both Model and ScoringBackend. mypy doesn't support | |
| 190 | -# intersection types, so we spell it out explicitly. | |
| 191 | -@runtime_checkable | |
| 192 | -class _ScoringModel(Model, ScoringBackend, Protocol): | |
| 193 | - """A Model that also exposes ScoringBackend.""" | |
| 194 | - | |
| 195 | - ... | |
| 196 | - | |
| 197 | - | |
| 198 | -ScoringModel = _ScoringModel | |
| 199 | -"""Public alias for the intersection ``Model & ScoringBackend``. | |
| 200 | - | |
| 201 | -Exported for backend and probe implementations that need to annotate | |
| 202 | -variables of this combined type. | |
| 203 | -""" | |
sway/src/dlm_sway/core/sections.pydeleted@@ -1,76 +0,0 @@ | ||
| 1 | -"""Minimal section contract for attribution probes. | |
| 2 | - | |
| 3 | -The flagship B1 ``section_internalization`` probe needs *structured* | |
| 4 | -input — a section has an id, a kind, content text, and possibly some | |
| 5 | -Q/A pairs or chosen/rejected triples. sway defines this shape here so | |
| 6 | -the probes stay oblivious to the upstream (``.dlm`` parser, custom | |
| 7 | -loaders, synthetic test fixtures). | |
| 8 | - | |
| 9 | -Field names are aligned with :mod:`dlm.doc.sections` but this module | |
| 10 | -does not import ``dlm`` — the bridge at | |
| 11 | -:mod:`dlm_sway.integrations.dlm` does the adaptation. | |
| 12 | -""" | |
| 13 | - | |
| 14 | -from __future__ import annotations | |
| 15 | - | |
| 16 | -from dataclasses import dataclass, field | |
| 17 | -from typing import Literal | |
| 18 | - | |
| 19 | -SectionKind = Literal["prose", "instruction", "preference"] | |
| 20 | - | |
| 21 | - | |
| 22 | -@dataclass(frozen=True, slots=True) | |
| 23 | -class SectionProbe: | |
| 24 | - """A ``(prompt, gold)`` pair lifted from an INSTRUCTION section.""" | |
| 25 | - | |
| 26 | - prompt: str | |
| 27 | - gold: str | |
| 28 | - | |
| 29 | - | |
| 30 | -@dataclass(frozen=True, slots=True) | |
| 31 | -class SectionPreference: | |
| 32 | - """A ``(prompt, chosen, rejected)`` triple from a PREFERENCE section.""" | |
| 33 | - | |
| 34 | - prompt: str | |
| 35 | - chosen: str | |
| 36 | - rejected: str | |
| 37 | - | |
| 38 | - | |
| 39 | -@dataclass(frozen=True, slots=True) | |
| 40 | -class Section: | |
| 41 | - """One typed chunk of a training document. | |
| 42 | - | |
| 43 | - Attributes | |
| 44 | - ---------- | |
| 45 | - id: | |
| 46 | - Content-addressed identifier. ``.dlm`` uses a 16-hex-char | |
| 47 | - sha256 prefix; sway doesn't enforce a format. | |
| 48 | - kind: | |
| 49 | - Discriminator for which of :attr:`probes` / | |
| 50 | - :attr:`preferences` / :attr:`content` is the primary signal. | |
| 51 | - content: | |
| 52 | - Raw section text. Always populated; used by the rolling-PPL | |
| 53 | - path for PROSE sections. | |
| 54 | - probes: | |
| 55 | - For INSTRUCTION: parsed Q/A pairs. Empty tuple for others. | |
| 56 | - preferences: | |
| 57 | - For PREFERENCE: parsed chosen/rejected triples. Empty otherwise. | |
| 58 | - tag: | |
| 59 | - Optional free-form label for the section (e.g., "intro", | |
| 60 | - "api-reference"). Surfaces in per-section reports. | |
| 61 | - """ | |
| 62 | - | |
| 63 | - id: str | |
| 64 | - kind: SectionKind | |
| 65 | - content: str | |
| 66 | - probes: tuple[SectionProbe, ...] = field(default_factory=tuple) | |
| 67 | - preferences: tuple[SectionPreference, ...] = field(default_factory=tuple) | |
| 68 | - tag: str | None = None | |
| 69 | - | |
| 70 | - | |
| 71 | -def filter_kinds( | |
| 72 | - sections: tuple[Section, ...], kinds: tuple[SectionKind, ...] | |
| 73 | -) -> tuple[Section, ...]: | |
| 74 | - """Return only sections whose ``kind`` matches one of ``kinds``.""" | |
| 75 | - allow = set(kinds) | |
| 76 | - return tuple(s for s in sections if s.kind in allow) | |
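A synthetic `Section` (id and content invented) showing how the shapes compose with `filter_kinds`:

```python
from dlm_sway.core.sections import Section, SectionProbe, filter_kinds

s = Section(
    id="deadbeefdeadbeef",  # any string; .dlm would use a sha256 prefix
    kind="instruction",
    content="Q: What does sway test? A: Fine-tuned adapters.",
    probes=(SectionProbe(prompt="What does sway test?", gold="Fine-tuned adapters."),),
)
print(len(filter_kinds((s,), ("prose",))))        # 0, the section is filtered out
print(len(filter_kinds((s,), ("instruction",))))  # 1
```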
sway/src/dlm_sway/integrations/__init__.pydeleted@@ -1,1 +0,0 @@ | ||
| 1 | -"""Optional integrations with upstream fine-tuning tools.""" | |
sway/src/dlm_sway/integrations/dlm/__init__.pydeleted@@ -1,1 +0,0 @@ | ||
| 1 | -"""DLM project integration. Imports the ``dlm`` package; requires ``dlm-sway[dlm]``.""" | |
sway/src/dlm_sway/integrations/dlm/autogen.pydeleted@@ -1,191 +0,0 @@ | ||
| 1 | -"""Auto-generate a ``sway.yaml`` from a ``.dlm`` document. | |
| 2 | - | |
| 3 | -Walks the parsed sections and emits one entry per primitive sway ships: | |
| 4 | -the full 11-primitive battery wired up against the document's own | |
| 5 | -content. The result is a YAML artifact the user commits alongside their | |
| 6 | -``.dlm`` and diffs in PRs. | |
| 7 | - | |
| 8 | -The generated spec includes a ``dlm_source`` field that the suite loader | |
| 9 | -uses to pick up :class:`~dlm_sway.core.sections.Section` data at run | |
| 10 | -time — probes that need sections (B1, B3, C3) then work against the | |
| 11 | -typed structure instead of re-parsing text. | |
| 12 | -""" | |
| 13 | - | |
| 14 | -from __future__ import annotations | |
| 15 | - | |
| 16 | -from pathlib import Path | |
| 17 | -from typing import Any | |
| 18 | - | |
| 19 | -import yaml | |
| 20 | - | |
| 21 | -from dlm_sway.core.errors import SwayError | |
| 22 | -from dlm_sway.core.sections import Section | |
| 23 | -from dlm_sway.integrations.dlm.resolver import DlmHandle, resolve_dlm | |
| 24 | - | |
| 25 | - | |
| 26 | -def write_sway_yaml(dlm_path: Path, out: Path) -> None: | |
| 27 | - """Resolve the .dlm, build a spec dict, write it as YAML to ``out``.""" | |
| 28 | - handle = resolve_dlm(dlm_path) | |
| 29 | - if handle.adapter_path is None: | |
| 30 | - raise SwayError( | |
| 31 | - f"{dlm_path}: no trained adapter found at ~/.dlm/store/{handle.dlm_id}/adapter; " | |
| 32 | - "train the document with `dlm train` before generating a sway suite." | |
| 33 | - ) | |
| 34 | - spec = build_spec_dict(handle, dlm_source=str(dlm_path.resolve())) | |
| 35 | - out.write_text(yaml.safe_dump(spec, sort_keys=False), encoding="utf-8") | |
| 36 | - | |
| 37 | - | |
| 38 | -def build_spec_dict(handle: DlmHandle, *, dlm_source: str | None = None) -> dict[str, Any]: | |
| 39 | - """Build a sway.yaml-shaped dict from a :class:`DlmHandle`.""" | |
| 40 | - base_spec = {"kind": "hf", "base": handle.base_model} | |
| 41 | - ft_spec = { | |
| 42 | - "kind": "hf", | |
| 43 | - "base": handle.base_model, | |
| 44 | - "adapter": str(handle.adapter_path) if handle.adapter_path else None, | |
| 45 | - } | |
| 46 | - spec: dict[str, Any] = { | |
| 47 | - "version": 1, | |
| 48 | - "models": {"base": base_spec, "ft": ft_spec}, | |
| 49 | - "defaults": {"seed": 0, "differential": True}, | |
| 50 | - "suite": _build_suite(handle.sections), | |
| 51 | - } | |
| 52 | - if dlm_source is not None: | |
| 53 | - spec["dlm_source"] = dlm_source | |
| 54 | - return spec | |
| 55 | - | |
| 56 | - | |
| 57 | -def _build_suite(sections: tuple[Section, ...]) -> list[dict[str, Any]]: | |
| 58 | - """Assemble the full probe battery for the given sections. | |
| 59 | - | |
| 60 | - The ordering matters: ``null_adapter`` first so every downstream | |
| 61 | - probe's z-score threshold has stats to consult. | |
| 62 | - """ | |
| 63 | - instruction_probes: list[tuple[str, str]] = [ | |
| 64 | - (p.prompt, p.gold) for s in sections if s.kind == "instruction" for p in s.probes | |
| 65 | - ] | |
| 66 | - prose_prompts: list[str] = [] | |
| 67 | - for s in sections: | |
| 68 | - if s.kind == "prose" and s.content.strip(): | |
| 69 | - # Use the section's leading sentence as a natural completion prompt. | |
| 70 | - first_sentence = s.content.split(".")[0].strip() | |
| 71 | - if first_sentence: | |
| 72 | - prose_prompts.append(first_sentence + ".") | |
| 73 | - | |
| 74 | - kl_prompts = [q for q, _ in instruction_probes][:16] or prose_prompts[:16] | |
| 75 | - style_prompts = prose_prompts[:8] or [q for q, _ in instruction_probes][:8] | |
| 76 | - | |
| 77 | - suite: list[dict[str, Any]] = [] | |
| 78 | - | |
| 79 | - # Baseline calibration — always first. | |
| 80 | - suite.append({"name": "null_baseline", "kind": "null_adapter", "runs": 3}) | |
| 81 | - | |
| 82 | - # Adherence. | |
| 83 | - if kl_prompts: | |
| 84 | - suite.append( | |
| 85 | - { | |
| 86 | - "name": "delta_kl_doc", | |
| 87 | - "kind": "delta_kl", | |
| 88 | - "prompts": kl_prompts, | |
| 89 | - "assert_mean_gte": 0.02, | |
| 90 | - } | |
| 91 | - ) | |
| 92 | - if instruction_probes: | |
| 93 | - suite.append( | |
| 94 | - { | |
| 95 | - "name": "revert_check", | |
| 96 | - "kind": "adapter_revert", | |
| 97 | - "cases": [ | |
| 98 | - {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)} | |
| 99 | - for q, a in instruction_probes[:8] | |
| 100 | - ], | |
| 101 | - "assert_revert_rate_lt": 0.3, | |
| 102 | - } | |
| 103 | - ) | |
| 104 | - if kl_prompts: | |
| 105 | - suite.append( | |
| 106 | - { | |
| 107 | - "name": "prompt_collapse", | |
| 108 | - "kind": "prompt_collapse", | |
| 109 | - "prompts": kl_prompts[:4], | |
| 110 | - "context_lengths": [0, 256, 512, 1024], | |
| 111 | - "assert_half_life_tokens": 300, | |
| 112 | - } | |
| 113 | - ) | |
| 114 | - | |
| 115 | - # Attribution. | |
| 116 | - if len(sections) >= 2: | |
| 117 | - suite.append( | |
| 118 | - { | |
| 119 | - "name": "section_attribution", | |
| 120 | - "kind": "section_internalization", | |
| 121 | - "per_section_threshold": 0.05, | |
| 122 | - } | |
| 123 | - ) | |
| 124 | - if instruction_probes: | |
| 125 | - suite.append( | |
| 126 | - { | |
| 127 | - "name": "paraphrase_invariance", | |
| 128 | - "kind": "paraphrase_invariance", | |
| 129 | - "cases": [ | |
| 130 | - {"prompt": q, "gold": a, "paraphrases": _auto_paraphrases(q)} | |
| 131 | - for q, a in instruction_probes[:6] | |
| 132 | - ], | |
| 133 | - } | |
| 134 | - ) | |
| 135 | - has_preferences = any(s.kind == "preference" and s.preferences for s in sections) | |
| 136 | - if has_preferences: | |
| 137 | - suite.append( | |
| 138 | - { | |
| 139 | - "name": "preference_flip", | |
| 140 | - "kind": "preference_flip", | |
| 141 | - "assert_flip_rate_gte": 0.7, | |
| 142 | - } | |
| 143 | - ) | |
| 144 | - | |
| 145 | - # Calibration. | |
| 146 | - if style_prompts: | |
| 147 | - suite.append( | |
| 148 | - { | |
| 149 | - "name": "style_shift", | |
| 150 | - "kind": "style_fingerprint", | |
| 151 | - "prompts": style_prompts, | |
| 152 | - } | |
| 153 | - ) | |
| 154 | - suite.append({"name": "general_knowledge", "kind": "calibration_drift"}) | |
| 155 | - if any(s.kind == "prose" for s in sections): | |
| 156 | - suite.append( | |
| 157 | - { | |
| 158 | - "name": "verbatim_leak", | |
| 159 | - "kind": "leakage", | |
| 160 | - "prefix_chars": 128, | |
| 161 | - "continuation_chars": 256, | |
| 162 | - } | |
| 163 | - ) | |
| 164 | - | |
| 165 | - # Signature ablation — goes last because it's the most expensive. | |
| 166 | - if kl_prompts: | |
| 167 | - suite.append( | |
| 168 | - { | |
| 169 | - "name": "adapter_ablation", | |
| 170 | - "kind": "adapter_ablation", | |
| 171 | - "prompts": kl_prompts[:6], | |
| 172 | - "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25], | |
| 173 | - } | |
| 174 | - ) | |
| 175 | - | |
| 176 | - return suite | |
| 177 | - | |
| 178 | - | |
| 179 | -def _auto_paraphrases(prompt: str) -> list[str]: | |
| 180 | - """Small, deterministic paraphrase set used when authors don't supply one. | |
| 181 | - | |
| 182 | - Purely heuristic — good enough to detect "did the model memorize the | |
| 183 | - exact wording". Real paraphrase generation lives behind the | |
| 184 | - ``semsim`` extra. | |
| 185 | - """ | |
| 186 | - variants: list[str] = [] | |
| 187 | - stripped = prompt.rstrip("?. ") | |
| 188 | - variants.append(f"Could you explain: {stripped}?") | |
| 189 | - variants.append(f"I'd like to know — {stripped}.") | |
| 190 | - variants.append(f"Please describe: {stripped}.") | |
| 191 | - return variants[:3] | |
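End to end, the module above is driven like this; the `.dlm` path is illustrative, and the document must already be trained (`dlm train`) with the `dlm-sway[dlm]` extra installed.

    from pathlib import Path

    from dlm_sway.integrations.dlm.autogen import write_sway_yaml

    # Emits the full battery into sway.yaml: null_baseline first,
    # adapter_ablation last, per the ordering notes in _build_suite.
    write_sway_yaml(Path("docs/guide.dlm"), out=Path("sway.yaml"))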
sway/src/dlm_sway/integrations/dlm/resolver.pydeleted@@ -1,243 +0,0 @@ | ||
| 1 | -"""Resolve a ``.dlm`` file to the artifacts sway needs. | |
| 2 | - | |
| 3 | -Imports ``dlm.*`` — requires the ``dlm-sway[dlm]`` extra. Everything | |
| 4 | -outside this package is oblivious to dlm's internal shape; the bridge | |
| 5 | -is the only place that knows, e.g., that a dlm section carries a | |
| 6 | -``kind`` field named ``type`` or that adapters live at | |
| 7 | -``adapter/versions/vNNNN/``. | |
| 8 | -""" | |
| 9 | - | |
| 10 | -from __future__ import annotations | |
| 11 | - | |
| 12 | -import hashlib | |
| 13 | -from dataclasses import dataclass | |
| 14 | -from pathlib import Path | |
| 15 | - | |
| 16 | -from dlm_sway.core.errors import SwayError | |
| 17 | -from dlm_sway.core.sections import ( | |
| 18 | - Section, | |
| 19 | - SectionKind, | |
| 20 | - SectionPreference, | |
| 21 | - SectionProbe, | |
| 22 | -) | |
| 23 | - | |
| 24 | - | |
| 25 | -@dataclass(frozen=True, slots=True) | |
| 26 | -class DlmHandle: | |
| 27 | - """Everything the sway bridge pulls out of a ``.dlm`` file. | |
| 28 | - | |
| 29 | - Attributes | |
| 30 | - ---------- | |
| 31 | - dlm_id: | |
| 32 | - Stable identifier from the frontmatter. | |
| 33 | - base_model: | |
| 34 | - HuggingFace repo id for sway's backends, resolved from the | |
| 35 | - frontmatter's registry key (or its ``hf:org/name`` escape hatch). | |
| 36 | - adapter_path: | |
| 37 | - Directory containing the current trained PEFT adapter (resolved | |
| 38 | - via dlm's own ``StorePath.for_dlm``). ``None`` if the document | |
| 39 | - hasn't been trained yet. | |
| 40 | - sections: | |
| 41 | - Typed sections ready for sway's probes. | |
| 42 | - doc_text: | |
| 43 | - Concatenated raw content of all sections. Used by probes that | |
| 44 | - need a whole-document stylistic reference (C1). | |
| 45 | - """ | |
| 46 | - | |
| 47 | - dlm_id: str | |
| 48 | - base_model: str | |
| 49 | - adapter_path: Path | None | |
| 50 | - sections: tuple[Section, ...] | |
| 51 | - doc_text: str | |
| 52 | - | |
| 53 | - | |
| 54 | -def resolve_dlm(dlm_path: Path) -> DlmHandle: | |
| 55 | - """Parse ``dlm_path`` and return a :class:`DlmHandle`. | |
| 56 | - | |
| 57 | - Raises :class:`~dlm_sway.core.errors.SwayError` with a clear message | |
| 58 | - when the file is malformed or when the resolved adapter path doesn't | |
| 59 | - exist on disk. | |
| 60 | - """ | |
| 61 | - try: | |
| 62 | - from dlm.doc.parser import parse_file as dlm_parse_file | |
| 63 | - except ImportError as exc: | |
| 64 | - raise SwayError("dlm package not installed — run: pip install 'dlm-sway[dlm]'") from exc | |
| 65 | - | |
| 66 | - parsed = dlm_parse_file(dlm_path) | |
| 67 | - fm = parsed.frontmatter | |
| 68 | - sections = tuple(_translate_section(s) for s in parsed.sections) | |
| 69 | - doc_text = "\n\n".join(s.content for s in sections) | |
| 70 | - | |
| 71 | - adapter_path = _resolve_adapter_path(fm.dlm_id) | |
| 72 | - base_hf_id = _resolve_base_model_to_hf_id(fm.base_model) | |
| 73 | - | |
| 74 | - return DlmHandle( | |
| 75 | - dlm_id=fm.dlm_id, | |
| 76 | - base_model=base_hf_id, | |
| 77 | - adapter_path=adapter_path, | |
| 78 | - sections=sections, | |
| 79 | - doc_text=doc_text, | |
| 80 | - ) | |
| 81 | - | |
| 82 | - | |
| 83 | -def _resolve_base_model_to_hf_id(base_model: str) -> str: | |
| 84 | - """Translate dlm's base-model *key* to a HuggingFace repo id. | |
| 85 | - | |
| 86 | - dlm's frontmatter stores registry keys like ``smollm2-135m`` which | |
| 87 | - resolve to ``HuggingFaceTB/SmolLM2-135M-Instruct``. sway's backends | |
| 88 | - call ``AutoModelForCausalLM.from_pretrained`` directly and need the | |
| 89 | - HF id. The ``hf:org/name`` escape hatch passes through unchanged. | |
| 90 | - """ | |
| 91 | - if base_model.startswith("hf:"): | |
| 92 | - return base_model[len("hf:") :] | |
| 93 | - try: | |
| 94 | - from dlm.base_models import resolve as resolve_base | |
| 95 | - except ImportError: | |
| 96 | - return base_model | |
| 97 | - try: | |
| 98 | - spec = resolve_base(base_model) | |
| 99 | - except Exception: # noqa: BLE001 — unknown dlm errors | |
| 100 | - return base_model | |
| 101 | - hf_id = getattr(spec, "hf_id", None) | |
| 102 | - return str(hf_id) if hf_id else base_model | |
| 103 | - | |
| 104 | - | |
| 105 | -def _resolve_adapter_path(dlm_id: str) -> Path | None: | |
| 106 | - """Locate the current adapter directory for ``dlm_id``. | |
| 107 | - | |
| 108 | - Uses dlm's module-level ``for_dlm`` helper if available, else falls | |
| 109 | - back to the canonical ``~/.dlm/store/<dlm_id>/adapter/current.txt`` | |
| 110 | - pointer. Returns ``None`` if no adapter has been trained yet. | |
| 111 | - """ | |
| 112 | - # Primary path: use dlm's own store-path helpers. | |
| 113 | - try: | |
| 114 | - from dlm.store.paths import for_dlm as _for_dlm | |
| 115 | - except ImportError: | |
| 116 | - _for_dlm = None | |
| 117 | - | |
| 118 | - if _for_dlm is not None: | |
| 119 | - try: | |
| 120 | - store = _for_dlm(dlm_id) | |
| 121 | - except Exception: # noqa: BLE001 — unknown dlm exception shapes | |
| 122 | - store = None | |
| 123 | - if store is not None: | |
| 124 | - try: | |
| 125 | - resolved = store.resolve_current_adapter() | |
| 126 | - except (AttributeError, FileNotFoundError): | |
| 127 | - resolved = None | |
| 128 | - if resolved is not None and Path(resolved).exists(): | |
| 129 | - return Path(resolved) | |
| 130 | - | |
| 131 | - # Manual fallback. The ``current.txt`` pointer is relative to the | |
| 132 | - # **store root**, not to current.txt's parent dir — so go up one level. | |
| 133 | - import os | |
| 134 | - | |
| 135 | - home = Path(os.environ.get("DLM_HOME", "~/.dlm")).expanduser() | |
| 136 | - store_root = home / "store" / dlm_id | |
| 137 | - current_file = store_root / "adapter" / "current.txt" | |
| 138 | - if current_file.exists(): | |
| 139 | - pointer = current_file.read_text(encoding="utf-8").strip() | |
| 140 | - candidate = (store_root / pointer).resolve() | |
| 141 | - if candidate.exists(): | |
| 142 | - return candidate | |
| 143 | - return None | |
| 144 | - | |
| 145 | - | |
| 146 | -def _translate_section(dlm_section: object) -> Section: | |
| 147 | - """Adapt a ``dlm.doc.sections.Section`` to sway's section type. | |
| 148 | - | |
| 149 | - dlm's Section dataclass uses the attribute name ``type`` (not | |
| 150 | - ``kind``) and stores instruction/preference content as raw markdown | |
| 151 | - — dlm ships dedicated parsers (``parse_instruction_body``, | |
| 152 | - ``parse_preference_body``) that we reuse here so any future dlm | |
| 153 | - syntax additions land in sway for free. | |
| 154 | - """ | |
| 155 | - # dlm's current attribute is ``type``; older revisions used ``kind``. | |
| 156 | - kind_raw = getattr(dlm_section, "type", getattr(dlm_section, "kind", None)) | |
| 157 | - kind = _normalize_kind(kind_raw) | |
| 158 | - content = str(getattr(dlm_section, "content", "")) | |
| 159 | - section_id = str( | |
| 160 | - getattr(dlm_section, "section_id", None) | |
| 161 | - or getattr(dlm_section, "id", None) | |
| 162 | - or _content_hash(content) | |
| 163 | - ) | |
| 164 | - tag = getattr(dlm_section, "tag", None) | |
| 165 | - | |
| 166 | - probes: tuple[SectionProbe, ...] = () | |
| 167 | - preferences: tuple[SectionPreference, ...] = () | |
| 168 | - if kind == "instruction": | |
| 169 | - probes = tuple(_parse_instruction(content, section_id=section_id)) | |
| 170 | - elif kind == "preference": | |
| 171 | - preferences = tuple(_parse_preference(content, section_id=section_id)) | |
| 172 | - | |
| 173 | - return Section( | |
| 174 | - id=section_id, | |
| 175 | - kind=kind, | |
| 176 | - content=content, | |
| 177 | - probes=probes, | |
| 178 | - preferences=preferences, | |
| 179 | - tag=tag if isinstance(tag, str) else None, | |
| 180 | - ) | |
| 181 | - | |
| 182 | - | |
| 183 | -def _normalize_kind(raw: object) -> SectionKind: | |
| 184 | - """Map dlm's SectionType/str to sway's lowercase kind.""" | |
| 185 | - if raw is None: | |
| 186 | - return "prose" | |
| 187 | - value = str(raw).lower() | |
| 188 | - # dlm uses uppercase StrEnum values like "PROSE"; normalize. | |
| 189 | - if value.endswith("prose") or "prose" in value: | |
| 190 | - return "prose" | |
| 191 | - if "instruction" in value: | |
| 192 | - return "instruction" | |
| 193 | - if "preference" in value: | |
| 194 | - return "preference" | |
| 195 | - return "prose" | |
| 196 | - | |
| 197 | - | |
| 198 | -def _parse_instruction(content: str, *, section_id: str) -> list[SectionProbe]: | |
| 199 | - """Pull (Q, A) pairs out of a dlm INSTRUCTION section body. | |
| 200 | - | |
| 201 | - Delegates to dlm's own ``parse_instruction_body`` so syntax additions | |
| 202 | - land in sway without code changes here. Falls back to an empty list | |
| 203 | - on parse errors — the probe will fail gracefully. | |
| 204 | - """ | |
| 205 | - try: | |
| 206 | - from dlm.data.instruction_parser import parse_instruction_body | |
| 207 | - except ImportError: | |
| 208 | - return [] | |
| 209 | - try: | |
| 210 | - pairs = parse_instruction_body(content, section_id=section_id) | |
| 211 | - except Exception: # noqa: BLE001 — dlm raises InstructionParseError | |
| 212 | - return [] | |
| 213 | - out: list[SectionProbe] = [] | |
| 214 | - for p in pairs: | |
| 215 | - q = getattr(p, "question", getattr(p, "prompt", "")) | |
| 216 | - a = getattr(p, "answer", getattr(p, "gold", "")) | |
| 217 | - if q and a: | |
| 218 | - out.append(SectionProbe(prompt=str(q), gold=str(a))) | |
| 219 | - return out | |
| 220 | - | |
| 221 | - | |
| 222 | -def _parse_preference(content: str, *, section_id: str) -> list[SectionPreference]: | |
| 223 | - """Pull (prompt, chosen, rejected) triples out of a PREFERENCE body.""" | |
| 224 | - try: | |
| 225 | - from dlm.data.preference_parser import parse_preference_body | |
| 226 | - except ImportError: | |
| 227 | - return [] | |
| 228 | - try: | |
| 229 | - triples = parse_preference_body(content, section_id=section_id) | |
| 230 | - except Exception: # noqa: BLE001 — dlm raises PreferenceParseError | |
| 231 | - return [] | |
| 232 | - out: list[SectionPreference] = [] | |
| 233 | - for t in triples: | |
| 234 | - p = str(getattr(t, "prompt", "")) | |
| 235 | - c = str(getattr(t, "chosen", "")) | |
| 236 | - rej = str(getattr(t, "rejected", "")) | |
| 237 | - if p and c and rej: | |
| 238 | - out.append(SectionPreference(prompt=p, chosen=c, rejected=rej)) | |
| 239 | - return out | |
| 240 | - | |
| 241 | - | |
| 242 | -def _content_hash(content: str) -> str: | |
| 243 | - return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16] | |
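For reference, the store layout that the manual fallback in `_resolve_adapter_path` walks, reconstructed from that function and the module docstring; the version directory name and `my-doc` id are illustrative, not taken from the source.

    from pathlib import Path

    # ~/.dlm/store/<dlm_id>/
    # └── adapter/
    #     ├── current.txt        # e.g. "adapter/versions/v0003", relative to the store root
    #     └── versions/v0003/    # the PEFT adapter dir handed back to sway
    store_root = Path("~/.dlm/store/my-doc").expanduser()
    pointer = (store_root / "adapter" / "current.txt").read_text(encoding="utf-8").strip()
    adapter_dir = (store_root / pointer).resolve()  # joined to the root, not to adapter/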
sway/src/dlm_sway/probes/__init__.pydeleted@@ -1,27 +0,0 @@ | ||
| 1 | -"""Probe primitives. Each module in this package implements one primitive. | |
| 2 | - | |
| 3 | -Importing this package eagerly imports every probe module so their | |
| 4 | -``__init_subclass__`` hooks populate the registry. If you're hitting | |
| 5 | -"unknown probe kind" from :func:`dlm_sway.probes.base.build_probe`, the | |
| 6 | -fix is to ``import dlm_sway.probes`` before building the probe — which | |
| 7 | -this ``__init__`` does for you. | |
| 8 | -""" | |
| 9 | - | |
| 10 | -from __future__ import annotations | |
| 11 | - | |
| 12 | -# Register every shipped probe with the central registry by importing | |
| 13 | -# its module. Order is not load-bearing for registration; the list is | |
| 14 | -# kept alphabetical so additions are easy to spot in review. | |
| 15 | -from dlm_sway.probes import ( # noqa: F401 — imports register the probes | |
| 16 | - adapter_ablation, | |
| 17 | - adapter_revert, | |
| 18 | - calibration_drift, | |
| 19 | - delta_kl, | |
| 20 | - leakage, | |
| 21 | - null_adapter, | |
| 22 | - paraphrase_invariance, | |
| 23 | - preference_flip, | |
| 24 | - prompt_collapse, | |
| 25 | - section_internalization, | |
| 26 | - style_fingerprint, | |
| 27 | -) | |
sway/src/dlm_sway/probes/_calibration_pack.pydeleted@@ -1,63 +0,0 @@ | ||
| 1 | -"""A small, built-in general-knowledge probe pack for C2. | |
| 2 | - | |
| 3 | -Each item is a ``(prompt, gold)`` pair where ``gold`` is the next few | |
| 4 | -tokens a competent base model should assign high probability to. The | |
| 5 | -items are deliberately *factually trivial* — the point isn't "does the | |
| 6 | -model know this?" but "did the fine-tune forget this?" — so the pack | |
| 7 | -skews toward grade-school geography, chemistry, arithmetic, and | |
| 8 | -high-frequency idiom. | |
| 9 | - | |
| 10 | -A real v1.0 will ship a 200-item pack sliced from TriviaQA + SQuAD + | |
| 11 | -OpenBookQA. This 30-item seed lets the probe ship today and catches the | |
| 12 | -most egregious over-fit cases. | |
| 13 | -""" | |
| 14 | - | |
| 15 | -from __future__ import annotations | |
| 16 | - | |
| 17 | -from typing import Final | |
| 18 | - | |
| 19 | -CalibrationItem = tuple[str, str] | |
| 20 | - | |
| 21 | -BUILT_IN_PACK: Final[tuple[CalibrationItem, ...]] = ( | |
| 22 | - # Geography | |
| 23 | - ("The capital of France is", " Paris"), | |
| 24 | - ("The capital of Japan is", " Tokyo"), | |
| 25 | - ("The largest ocean on Earth is the", " Pacific"), | |
| 26 | - ("Mount Everest is located on the border of Nepal and", " China"), | |
| 27 | - ("The longest river in South America is the", " Amazon"), | |
| 28 | - # Natural sciences | |
| 29 | - ("Water freezes at zero degrees", " Celsius"), | |
| 30 | - ("The chemical symbol for gold is", " Au"), | |
| 31 | - ("Light travels faster than", " sound"), | |
| 32 | - ("Plants convert sunlight into energy through", " photosynthesis"), | |
| 33 | - ("The Earth orbits around the", " Sun"), | |
| 34 | - # Arithmetic | |
| 35 | - ("Two plus two equals", " four"), | |
| 36 | - ("Ten times ten equals", " one hundred"), | |
| 37 | - ("Half of one hundred is", " fifty"), | |
| 38 | - ("A dozen means", " twelve"), | |
| 39 | - # Language and idiom | |
| 40 | - ("A rose by any other name would smell as", " sweet"), | |
| 41 | - ("To be or not to be, that is the", " question"), | |
| 42 | - ("The early bird catches the", " worm"), | |
| 43 | - ("Actions speak louder than", " words"), | |
| 44 | - ("A picture is worth a thousand", " words"), | |
| 45 | - # History | |
| 46 | - ("World War II ended in the year", " 1945"), | |
| 47 | - ("The first president of the United States was", " George Washington"), | |
| 48 | - ("The Berlin Wall fell in", " 1989"), | |
| 49 | - # Biology | |
| 50 | - ("Humans have twenty", " fingers and toes"), | |
| 51 | - ("The human body has two", " lungs"), | |
| 52 | - ("Blood is pumped through the body by the", " heart"), | |
| 53 | - # Technology | |
| 54 | - ("HTML stands for HyperText", " Markup Language"), | |
| 55 | - ("The World Wide Web was invented by Tim", " Berners-Lee"), | |
| 56 | - # Miscellaneous | |
| 57 | - ("One year has", " 365 days"), | |
| 58 | - ("A week has seven", " days"), | |
| 59 | - ("There are seven colors in a", " rainbow"), | |
| 60 | -) | |
| 61 | -"""30 items covering geography, science, arithmetic, language, history, | |
| 62 | -biology, and technology. Pulled from public-domain grade-school facts so | |
| 63 | -there's no licensing concern about shipping with the wheel.""" | |
sway/src/dlm_sway/probes/_divergence.pydeleted@@ -1,102 +0,0 @@ | ||
| 1 | -"""Shared math for divergence-based probes. | |
| 2 | - | |
| 3 | -Extracted so :mod:`delta_kl`, :mod:`adapter_ablation`, and any future | |
| 4 | -probe operating on next-token distributions reuse the same aligned- | |
| 5 | -top-k KL / JS computation. Having one implementation keeps the numerical | |
| 6 | -treatment consistent across the report. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from __future__ import annotations | |
| 10 | - | |
| 11 | -import math | |
| 12 | -from typing import Literal | |
| 13 | - | |
| 14 | -import numpy as np | |
| 15 | -from numpy.typing import NDArray | |
| 16 | - | |
| 17 | -from dlm_sway.core.scoring import TokenDist | |
| 18 | - | |
| 19 | -Divergence = Literal["kl", "js"] | |
| 20 | - | |
| 21 | - | |
| 22 | -def aligned_probs( | |
| 23 | - base: TokenDist, ft: TokenDist | |
| 24 | -) -> tuple[NDArray[np.float64], NDArray[np.float64]]: | |
| 25 | - """Return aligned probability vectors over the union of top-k tokens. | |
| 26 | - | |
| 27 | - Two ``TokenDist`` objects may surface different top-k indices if | |
| 28 | - the two models disagree about the hot tokens. We build a shared | |
| 29 | - support — ``union(base.token_ids, ft.token_ids)`` — and slot the | |
| 30 | - known probabilities in. Unknown entries fall back to the | |
| 31 | - per-distribution tail mass divided across the missing tokens, | |
| 32 | - which is the maximum-entropy completion under the truncation. | |
| 33 | - """ | |
| 34 | - union_ids = np.union1d(base.token_ids, ft.token_ids) | |
| 35 | - k = int(union_ids.size) | |
| 36 | - | |
| 37 | - base_probs = _to_support(base, union_ids, k) | |
| 38 | - ft_probs = _to_support(ft, union_ids, k) | |
| 39 | - | |
| 40 | - # Normalize in case of floating noise from the fill-in. | |
| 41 | - base_probs /= base_probs.sum() | |
| 42 | - ft_probs /= ft_probs.sum() | |
| 43 | - return base_probs, ft_probs | |
| 44 | - | |
| 45 | - | |
| 46 | -def _to_support(dist: TokenDist, support: NDArray[np.int64], k: int) -> NDArray[np.float64]: | |
| 47 | - probs = np.exp(dist.logprobs.astype(np.float64)) | |
| 48 | - out = np.zeros(k, dtype=np.float64) | |
| 49 | - known_mass = float(probs.sum()) | |
| 50 | - tail_mass = max(0.0, 1.0 - known_mass) | |
| 51 | - | |
| 52 | - id_to_idx = {int(tok): idx for idx, tok in enumerate(support.tolist())} | |
| 53 | - missing = 0 | |
| 54 | - for tok, p in zip(dist.token_ids.tolist(), probs.tolist(), strict=True): | |
| 55 | - i = id_to_idx.get(int(tok)) | |
| 56 | - if i is None: | |
| 57 | - # Shouldn't happen given union construction. | |
| 58 | - missing += 1 | |
| 59 | - continue | |
| 60 | - out[i] = float(p) | |
| 61 | - | |
| 62 | - # Spread the tail mass over the support entries this dist doesn't provide; | |
| 63 | - # those are exactly the zero slots (missing ids never occupy ``out``). | |
| 64 | - n_unknown = int((out == 0.0).sum()) | |
| 65 | - if n_unknown > 0 and tail_mass > 0.0: | |
| 66 | - per = tail_mass / n_unknown | |
| 67 | - out[out == 0.0] = per | |
| 68 | - | |
| 69 | - return out | |
| 70 | - | |
| 71 | - | |
| 72 | -def kl(p: NDArray[np.float64], q: NDArray[np.float64]) -> float: | |
| 73 | - """KL(p || q) in nats. Robust to zeros in p (treated as 0·log0 = 0).""" | |
| 74 | - mask = p > 0.0 | |
| 75 | - safe_q = np.where(q > 0.0, q, 1e-12) | |
| 76 | - return float(np.sum(p[mask] * (np.log(p[mask]) - np.log(safe_q[mask])))) | |
| 77 | - | |
| 78 | - | |
| 79 | -def js(p: NDArray[np.float64], q: NDArray[np.float64]) -> float: | |
| 80 | - """Jensen-Shannon divergence. Symmetric, bounded in [0, ln 2] (nats). | |
| 81 | - | |
| 82 | - The upper bound makes JS a nicer default for thresholding than raw | |
| 83 | - KL — a user doesn't need to know their specific model's KL scale to | |
| 84 | - pick a threshold. | |
| 85 | - """ | |
| 86 | - m = 0.5 * (p + q) | |
| 87 | - return 0.5 * kl(p, m) + 0.5 * kl(q, m) | |
| 88 | - | |
| 89 | - | |
| 90 | -def divergence(base: TokenDist, ft: TokenDist, kind: Divergence = "js") -> float: | |
| 91 | - """Compute KL or JS between two ``TokenDist`` on a shared support.""" | |
| 92 | - p, q = aligned_probs(base, ft) | |
| 93 | - if kind == "js": | |
| 94 | - return js(p, q) | |
| 95 | - if kind == "kl": | |
| 96 | - return kl(q, p) # KL(ft || base) — "how much does ft diverge from base" | |
| 97 | - raise ValueError(f"unknown divergence kind: {kind!r}") | |
| 98 | - | |
| 99 | - | |
| 100 | -def js_ln2() -> float: | |
| 101 | - """Upper bound on JS in nats. Useful for normalization.""" | |
| 102 | - return math.log(2.0) | |
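A numeric sketch of the aligned-support fill described in `aligned_probs`. `SimpleNamespace` stands in for `TokenDist`, whose only attributes read in this module are `token_ids` and `logprobs`; the token ids and probabilities are made up.

    import numpy as np
    from types import SimpleNamespace

    from dlm_sway.probes._divergence import aligned_probs, js

    base = SimpleNamespace(token_ids=np.array([3, 7]), logprobs=np.log([0.6, 0.3]))
    ft = SimpleNamespace(token_ids=np.array([3, 9]), logprobs=np.log([0.5, 0.4]))

    p, q = aligned_probs(base, ft)  # shared support: token ids {3, 7, 9}
    # base fills [0.6, 0.3, 0.0]; its 0.1 tail mass lands on the unseen id 9.
    # ft fills [0.5, 0.0, 0.4]; its 0.1 tail mass lands on the unseen id 7.
    print(js(p, q))  # small positive value, bounded above by ln 2 ≈ 0.693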
sway/src/dlm_sway/probes/adapter_ablation.pydeleted@@ -1,193 +0,0 @@ | ||
| 1 | -"""N2 AdapterAblation — the sway signature primitive. | |
| 2 | - | |
| 3 | -Scales the LoRA additive term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} | |
| 4 | -and measures the mean divergence from the base distribution at each | |
| 5 | -step. Fits a monotonic response curve; reports three shape metrics: | |
| 6 | - | |
| 7 | -- **linearity**: R² of a linear fit on ``(λ, mean_div)``. High means | |
| 8 | - the adapter's effect scales predictably; low means it's "all or | |
| 9 | - nothing" (degenerate). | |
| 10 | -- **saturation_lambda**: the smallest λ at which divergence reaches | |
| 11 | - 90% of the λ=1 value. Too low (<0.3) means the adapter fires at | |
| 12 | - partial strength — fragile. Too high (>1.0) means the adapter is | |
| 13 | - under-trained. | |
| 14 | -- **overshoot**: divergence at λ=1.25 divided by λ=1.0. Comfortably above | |
| 15 | - 1.0 (the default gate is 1.02) is the healthy "pushing past 1 still | |
| 16 | - moves the model" signal. An overshoot below 1.0 suggests collapse. | |
| 17 | - | |
| 18 | -This is the single novel primitive that no generic eval harness | |
| 19 | -provides — sway's position next to the adapter math makes it possible. | |
| 20 | - | |
| 21 | -Requires the backend to implement | |
| 22 | -:class:`~dlm_sway.core.scoring.ScalableDifferentialBackend`. Probes | |
| 23 | -SKIP gracefully on backends that don't. | |
| 24 | -""" | |
| 25 | - | |
| 26 | -from __future__ import annotations | |
| 27 | - | |
| 28 | -from typing import Literal | |
| 29 | - | |
| 30 | -import numpy as np | |
| 31 | -from pydantic import Field | |
| 32 | - | |
| 33 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 34 | -from dlm_sway.core.scoring import ScalableDifferentialBackend | |
| 35 | -from dlm_sway.probes._divergence import Divergence, divergence | |
| 36 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 37 | - | |
| 38 | - | |
| 39 | -class AdapterAblationSpec(ProbeSpec): | |
| 40 | - kind: Literal["adapter_ablation"] = "adapter_ablation" | |
| 41 | - prompts: list[str] = Field(default_factory=list) | |
| 42 | - lambdas: list[float] = Field( | |
| 43 | - default_factory=lambda: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25], | |
| 44 | - min_length=3, | |
| 45 | - ) | |
| 46 | - divergence: Divergence = "js" | |
| 47 | - top_k: int | None = None | |
| 48 | - assert_linearity_gte: float = 0.85 | |
| 49 | - assert_saturation_between: tuple[float, float] = (0.3, 1.05) | |
| 50 | - assert_overshoot_gte: float = 1.02 | |
| 51 | - | |
| 52 | - | |
| 53 | -class AdapterAblationProbe(Probe): | |
| 54 | - kind = "adapter_ablation" | |
| 55 | - spec_cls = AdapterAblationSpec | |
| 56 | - category = "ablation" | |
| 57 | - | |
| 58 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 59 | - assert isinstance(spec, AdapterAblationSpec) | |
| 60 | - if not spec.prompts: | |
| 61 | - return ProbeResult( | |
| 62 | - name=spec.name, | |
| 63 | - kind=spec.kind, | |
| 64 | - verdict=Verdict.ERROR, | |
| 65 | - score=None, | |
| 66 | - message="no prompts provided", | |
| 67 | - ) | |
| 68 | - if not isinstance(ctx.backend, ScalableDifferentialBackend): | |
| 69 | - return ProbeResult( | |
| 70 | - name=spec.name, | |
| 71 | - kind=spec.kind, | |
| 72 | - verdict=Verdict.SKIP, | |
| 73 | - score=None, | |
| 74 | - message=( | |
| 75 | - "backend does not implement ScalableDifferentialBackend — " | |
| 76 | - "adapter ablation requires LoRA-scale access" | |
| 77 | - ), | |
| 78 | - ) | |
| 79 | - | |
| 80 | - top_k = spec.top_k if spec.top_k is not None else ctx.top_k | |
| 81 | - | |
| 82 | - # Reference distribution at the smallest λ (0 in the default grid → base). | |
| 83 | - lam_zero = min(spec.lambdas) | |
| 84 | - per_lambda: list[float] = [] | |
| 85 | - for lam in spec.lambdas: | |
| 86 | - divs_for_lam: list[float] = [] | |
| 87 | - for prompt in spec.prompts: | |
| 88 | - with ctx.backend.as_scaled_adapter(lam_zero) as ref: | |
| 89 | - ref_dist = ref.next_token_dist(prompt, top_k=top_k) | |
| 90 | - with ctx.backend.as_scaled_adapter(lam) as scaled: | |
| 91 | - scaled_dist = scaled.next_token_dist(prompt, top_k=top_k) | |
| 92 | - divs_for_lam.append(divergence(ref_dist, scaled_dist, kind=spec.divergence)) | |
| 93 | - per_lambda.append(float(np.mean(divs_for_lam))) | |
| 94 | - | |
| 95 | - lambdas_arr = np.asarray(spec.lambdas, dtype=np.float64) | |
| 96 | - divs_arr = np.asarray(per_lambda, dtype=np.float64) | |
| 97 | - | |
| 98 | - linearity = _r_squared(lambdas_arr, divs_arr) | |
| 99 | - saturation_lambda = _saturation_lambda(lambdas_arr, divs_arr) | |
| 100 | - overshoot = _overshoot(lambdas_arr, divs_arr) | |
| 101 | - | |
| 102 | - # Pass when all three shape metrics land in their healthy bands. | |
| 103 | - sat_lo, sat_hi = spec.assert_saturation_between | |
| 104 | - ok_lin = linearity >= spec.assert_linearity_gte | |
| 105 | - ok_sat = saturation_lambda is not None and sat_lo <= saturation_lambda <= sat_hi | |
| 106 | - ok_over = overshoot >= spec.assert_overshoot_gte | |
| 107 | - verdict = Verdict.PASS if (ok_lin and ok_sat and ok_over) else Verdict.FAIL | |
| 108 | - | |
| 109 | - lin_score = max(0.0, min(1.0, linearity / max(spec.assert_linearity_gte, 1e-6))) | |
| 110 | - over_score = max(0.0, min(1.0, (overshoot - 1.0) / 0.2)) | |
| 111 | - sat_score = 1.0 if ok_sat else 0.3 | |
| 112 | - score = 0.4 * lin_score + 0.3 * sat_score + 0.3 * over_score | |
| 113 | - | |
| 114 | - return ProbeResult( | |
| 115 | - name=spec.name, | |
| 116 | - kind=spec.kind, | |
| 117 | - verdict=verdict, | |
| 118 | - score=score, | |
| 119 | - raw=linearity, | |
| 120 | - evidence={ | |
| 121 | - "lambdas": spec.lambdas, | |
| 122 | - "mean_divergence_per_lambda": per_lambda, | |
| 123 | - "linearity": linearity, | |
| 124 | - "saturation_lambda": saturation_lambda, | |
| 125 | - "overshoot": overshoot, | |
| 126 | - "passed_linearity": ok_lin, | |
| 127 | - "passed_saturation": ok_sat, | |
| 128 | - "passed_overshoot": ok_over, | |
| 129 | - "weight": spec.weight, | |
| 130 | - }, | |
| 131 | - message=( | |
| 132 | - f"R²={linearity:.2f}, sat_λ={saturation_lambda:.2f} " | |
| 133 | - f"({'in' if ok_sat else 'out of'} band), overshoot={overshoot:.2f}" | |
| 134 | - if saturation_lambda is not None | |
| 135 | - else f"R²={linearity:.2f}, saturation undetected, overshoot={overshoot:.2f}" | |
| 136 | - ), | |
| 137 | - ) | |
| 138 | - | |
| 139 | - | |
| 140 | -def _r_squared(x: np.ndarray, y: np.ndarray) -> float: | |
| 141 | - """Coefficient of determination for a linear fit of ``y`` on ``x``.""" | |
| 142 | - if x.size < 2: | |
| 143 | - return 0.0 | |
| 144 | - xm = float(x.mean()) | |
| 145 | - ym = float(y.mean()) | |
| 146 | - denom = float(((x - xm) ** 2).sum()) | |
| 147 | - if denom == 0.0: | |
| 148 | - return 0.0 | |
| 149 | - slope = float(((x - xm) * (y - ym)).sum()) / denom | |
| 150 | - intercept = ym - slope * xm | |
| 151 | - y_pred = slope * x + intercept | |
| 152 | - ss_res = float(((y - y_pred) ** 2).sum()) | |
| 153 | - ss_tot = float(((y - ym) ** 2).sum()) | |
| 154 | - if ss_tot == 0.0: | |
| 155 | - return 1.0 | |
| 156 | - return max(0.0, 1.0 - ss_res / ss_tot) | |
| 157 | - | |
| 158 | - | |
| 159 | -def _saturation_lambda(lambdas: np.ndarray, divs: np.ndarray) -> float | None: | |
| 160 | - """Smallest λ ≤ 1.0 at which divergence reaches 90% of div(λ=1).""" | |
| 161 | - # Locate the index of λ=1.0 (or the closest entry ≤ 1.0). | |
| 162 | - candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0] | |
| 163 | - if candidates.size == 0: | |
| 164 | - # Fall back to the largest λ ≤ 1.0. | |
| 165 | - mask = lambdas <= 1.0 | |
| 166 | - if not mask.any(): | |
| 167 | - return None | |
| 168 | - idx1 = int(np.argmax(lambdas * mask)) | |
| 169 | - else: | |
| 170 | - idx1 = int(candidates[0]) | |
| 171 | - target = 0.9 * float(divs[idx1]) | |
| 172 | - if target <= 0: | |
| 173 | - return None | |
| 174 | - for lam, d in zip(lambdas[: idx1 + 1], divs[: idx1 + 1], strict=False): | |
| 175 | - if d >= target: | |
| 176 | - return float(lam) | |
| 177 | - return None | |
| 178 | - | |
| 179 | - | |
| 180 | -def _overshoot(lambdas: np.ndarray, divs: np.ndarray) -> float: | |
| 181 | - """``div(λ_max) / div(λ=1)``. Returns 1.0 if λ_max ≤ 1.0.""" | |
| 182 | - idx_max = int(np.argmax(lambdas)) | |
| 183 | - candidates = np.where(np.isclose(lambdas, 1.0, atol=1e-6))[0] | |
| 184 | - if candidates.size == 0: | |
| 185 | - return 1.0 | |
| 186 | - idx1 = int(candidates[0]) | |
| 187 | - if idx_max == idx1: | |
| 188 | - return 1.0 | |
| 189 | - d1 = float(divs[idx1]) | |
| 190 | - dmax = float(divs[idx_max]) | |
| 191 | - if d1 <= 0: | |
| 192 | - return 1.0 | |
| 193 | - return dmax / d1 | |
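To make the three shape metrics concrete, a synthetic check that calls the module-private helpers directly on an ideal linear response; illustrative numbers, not a supported API.

    import numpy as np

    from dlm_sway.probes.adapter_ablation import (
        _overshoot,
        _r_squared,
        _saturation_lambda,
    )

    lams = np.array([0.0, 0.25, 0.5, 0.75, 1.0, 1.25])
    divs = 0.2 * lams  # a perfectly linear adapter response

    print(_r_squared(lams, divs))          # ≈ 1.0 (fully linear)
    print(_saturation_lambda(lams, divs))  # 1.0: 90% of div(λ=1) first hit at λ=1
    print(_overshoot(lams, divs))          # ≈ 1.25: div(1.25) / div(1.0)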
sway/src/dlm_sway/probes/adapter_revert.pydeleted@@ -1,178 +0,0 @@ | ||
| 1 | -"""A2 AdapterRevert — does the fine-tuned model drift back to base under pressure? | |
| 2 | - | |
| 3 | -For each test case the user provides a prompt, a "gold" answer (the | |
| 4 | -adapter's intended response), and one or more adversarial paraphrases of | |
| 5 | -the prompt. We generate base-model and ft-model completions on every | |
| 6 | -paraphrase and ask: does the ft output cluster semantically with the | |
| 7 | -base's output (revert) or with the gold (adhere)? | |
| 8 | - | |
| 9 | -Signal: ``revert_rate`` = fraction of (case, paraphrase) pairs where | |
| 10 | -``cos(ft, base) > cos(ft, gold)``. A healthy fine-tune holds below 25%. | |
| 11 | - | |
| 12 | -Needs sentence embeddings. Without the ``semsim`` extra installed the | |
| 13 | -probe returns :attr:`Verdict.SKIP` with a pip hint — deterministic | |
| 14 | -n-gram fallbacks don't carry semantic equivalence reliably enough to | |
| 15 | -drive a revert decision, and we'd rather be honest than lossy. | |
| 16 | -""" | |
| 17 | - | |
| 18 | -from __future__ import annotations | |
| 19 | - | |
| 20 | -from typing import Any, Literal | |
| 21 | - | |
| 22 | -from pydantic import BaseModel, ConfigDict, Field | |
| 23 | - | |
| 24 | -from dlm_sway.core.errors import BackendNotAvailableError | |
| 25 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 26 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 27 | - | |
| 28 | - | |
| 29 | -class AdapterRevertCase(BaseModel): | |
| 30 | - """One revert test case.""" | |
| 31 | - | |
| 32 | - model_config = ConfigDict(extra="forbid", frozen=True) | |
| 33 | - | |
| 34 | - prompt: str | |
| 35 | - gold: str | |
| 36 | - """What the adapter is supposed to produce.""" | |
| 37 | - paraphrases: list[str] = Field(default_factory=list, min_length=1) | |
| 38 | - """At least one paraphrase is required — revert is observed under | |
| 39 | - reframing, not on the original prompt.""" | |
| 40 | - | |
| 41 | - | |
| 42 | -class AdapterRevertSpec(ProbeSpec): | |
| 43 | - kind: Literal["adapter_revert"] = "adapter_revert" | |
| 44 | - cases: list[AdapterRevertCase] = Field(default_factory=list) | |
| 45 | - max_new_tokens: int = 64 | |
| 46 | - embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2" | |
| 47 | - """HF id of the embedder. Default is ~80 MB, CPU-friendly.""" | |
| 48 | - base_gold_similarity_cap: float = 0.75 | |
| 49 | - """Skip pairs where base and gold are trivially similar — those | |
| 50 | - can't distinguish revert from adherence, and including them would | |
| 51 | - inflate the revert rate with noise.""" | |
| 52 | - assert_revert_rate_lt: float = 0.25 | |
| 53 | - | |
| 54 | - | |
| 55 | -class AdapterRevertProbe(Probe): | |
| 56 | - kind = "adapter_revert" | |
| 57 | - spec_cls = AdapterRevertSpec | |
| 58 | - category = "adherence" | |
| 59 | - | |
| 60 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 61 | - assert isinstance(spec, AdapterRevertSpec) | |
| 62 | - if not spec.cases: | |
| 63 | - return ProbeResult( | |
| 64 | - name=spec.name, | |
| 65 | - kind=spec.kind, | |
| 66 | - verdict=Verdict.ERROR, | |
| 67 | - score=None, | |
| 68 | - message="no cases provided", | |
| 69 | - ) | |
| 70 | - | |
| 71 | - try: | |
| 72 | - embed = _load_embedder(spec.embedding_model) | |
| 73 | - except BackendNotAvailableError as exc: | |
| 74 | - return ProbeResult( | |
| 75 | - name=spec.name, | |
| 76 | - kind=spec.kind, | |
| 77 | - verdict=Verdict.SKIP, | |
| 78 | - score=None, | |
| 79 | - message=str(exc), | |
| 80 | - ) | |
| 81 | - | |
| 82 | - import numpy as np | |
| 83 | - | |
| 84 | - total = 0 | |
| 85 | - reverts = 0 | |
| 86 | - dropped_trivial = 0 | |
| 87 | - per_case: list[dict[str, Any]] = [] | |
| 88 | - for case in spec.cases: | |
| 89 | - gold_vec = embed([case.gold])[0] | |
| 90 | - for pp in case.paraphrases: | |
| 91 | - with ctx.backend.as_base() as bv: | |
| 92 | - base_gen = bv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | |
| 93 | - with ctx.backend.as_finetuned() as fv: | |
| 94 | - ft_gen = fv.generate(pp, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | |
| 95 | - vecs = embed([base_gen, ft_gen]) | |
| 96 | - base_vec, ft_vec = vecs[0], vecs[1] | |
| 97 | - base_gold = _cosine(base_vec, gold_vec) | |
| 98 | - if base_gold > spec.base_gold_similarity_cap: | |
| 99 | - dropped_trivial += 1 | |
| 100 | - continue | |
| 101 | - cos_ft_base = _cosine(ft_vec, base_vec) | |
| 102 | - cos_ft_gold = _cosine(ft_vec, gold_vec) | |
| 103 | - total += 1 | |
| 104 | - if cos_ft_base > cos_ft_gold: | |
| 105 | - reverts += 1 | |
| 106 | - per_case.append( | |
| 107 | - { | |
| 108 | - "prompt": pp[:80], | |
| 109 | - "cos_ft_base": cos_ft_base, | |
| 110 | - "cos_ft_gold": cos_ft_gold, | |
| 111 | - "reverted": cos_ft_base > cos_ft_gold, | |
| 112 | - } | |
| 113 | - ) | |
| 114 | - | |
| 115 | - if total == 0: | |
| 116 | - return ProbeResult( | |
| 117 | - name=spec.name, | |
| 118 | - kind=spec.kind, | |
| 119 | - verdict=Verdict.WARN, | |
| 120 | - score=0.5, | |
| 121 | - message=( | |
| 122 | - f"all {dropped_trivial} cases had base≈gold (> " | |
| 123 | - f"{spec.base_gold_similarity_cap}) — no separable signal" | |
| 124 | - ), | |
| 125 | - evidence={"dropped_trivial": dropped_trivial, "weight": spec.weight}, | |
| 126 | - ) | |
| 127 | - | |
| 128 | - rate = reverts / total | |
| 129 | - verdict = Verdict.PASS if rate < spec.assert_revert_rate_lt else Verdict.FAIL | |
| 130 | - score = max(0.0, 1.0 - rate / max(spec.assert_revert_rate_lt, 1e-6)) | |
| 131 | - score = float(np.clip(score, 0.0, 1.0)) | |
| 132 | - | |
| 133 | - return ProbeResult( | |
| 134 | - name=spec.name, | |
| 135 | - kind=spec.kind, | |
| 136 | - verdict=verdict, | |
| 137 | - score=score, | |
| 138 | - raw=rate, | |
| 139 | - evidence={ | |
| 140 | - "revert_rate": rate, | |
| 141 | - "reverts": reverts, | |
| 142 | - "total": total, | |
| 143 | - "dropped_trivial": dropped_trivial, | |
| 144 | - "per_case": per_case[:8], # cap to keep JSON bounded | |
| 145 | - "weight": spec.weight, | |
| 146 | - }, | |
| 147 | - message=f"revert_rate={rate:.2%} (reverts={reverts}/{total}, dropped_trivial={dropped_trivial})", | |
| 148 | - ) | |
| 149 | - | |
| 150 | - | |
| 151 | -def _load_embedder(model_id: str): # type: ignore[no-untyped-def] | |
| 152 | - """Return a callable ``list[str] -> np.ndarray`` over encoded vectors.""" | |
| 153 | - try: | |
| 154 | - from sentence_transformers import SentenceTransformer | |
| 155 | - except ImportError as exc: | |
| 156 | - raise BackendNotAvailableError( | |
| 157 | - "adapter_revert", | |
| 158 | - extra="semsim", | |
| 159 | - hint="adapter_revert relies on sentence embeddings.", | |
| 160 | - ) from exc | |
| 161 | - st = SentenceTransformer(model_id) | |
| 162 | - | |
| 163 | - def _embed(texts: list[str]): # type: ignore[no-untyped-def] | |
| 164 | - return st.encode(texts, convert_to_numpy=True, normalize_embeddings=True) | |
| 165 | - | |
| 166 | - return _embed | |
| 167 | - | |
| 168 | - | |
| 169 | -def _cosine(a: Any, b: Any) -> float: | |
| 170 | - import numpy as np | |
| 171 | - | |
| 172 | - av = np.asarray(a, dtype=np.float64) | |
| 173 | - bv = np.asarray(b, dtype=np.float64) | |
| 174 | - na = float(np.linalg.norm(av)) | |
| 175 | - nb = float(np.linalg.norm(bv)) | |
| 176 | - if na == 0.0 or nb == 0.0: | |
| 177 | - return 0.0 | |
| 178 | - return float(np.dot(av, bv) / (na * nb)) | |
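The revert decision in isolation, on made-up three-dimensional embeddings: an ft completion that hugs the base direction counts as a revert. `_cosine` is module-private; this is illustration only.

    import numpy as np

    from dlm_sway.probes.adapter_revert import _cosine

    base_vec = np.array([1.0, 0.0, 0.0])
    gold_vec = np.array([0.0, 1.0, 0.0])
    ft_vec = np.array([0.9, 0.1, 0.0])  # nearly parallel to the base output

    reverted = _cosine(ft_vec, base_vec) > _cosine(ft_vec, gold_vec)
    print(reverted)  # True: this (case, paraphrase) pair counts toward revert_rate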
sway/src/dlm_sway/probes/base.pydeleted@@ -1,131 +0,0 @@ | ||
| 1 | -"""Probe abstract base + per-kind registry. | |
| 2 | - | |
| 3 | -The registry is the extension point. Adding a new probe means: | |
| 4 | - | |
| 5 | -1. Subclass :class:`ProbeSpec` with a unique ``kind`` field (Literal). | |
| 6 | -2. Subclass :class:`Probe` setting ``kind`` and ``spec_cls``. | |
| 7 | -3. Importing the probe module at least once (its subclass hook registers | |
| 8 | - itself). | |
| 9 | - | |
| 10 | -The runner uses :func:`build_probe` to map each raw spec dict to a | |
| 11 | -``(Probe, ProbeSpec)`` pair. Validation errors are turned into | |
| 12 | -:class:`~dlm_sway.core.errors.SpecValidationError` with the probe name | |
| 13 | -as the source so error messages localize to the offending entry. | |
| 14 | -""" | |
| 15 | - | |
| 16 | -from __future__ import annotations | |
| 17 | - | |
| 18 | -from abc import ABC, abstractmethod | |
| 19 | -from dataclasses import dataclass, field | |
| 20 | -from typing import Any, ClassVar | |
| 21 | - | |
| 22 | -from pydantic import BaseModel, ConfigDict, ValidationError | |
| 23 | - | |
| 24 | -from dlm_sway.core.errors import SpecValidationError | |
| 25 | -from dlm_sway.core.result import ProbeResult | |
| 26 | -from dlm_sway.core.scoring import DifferentialBackend | |
| 27 | -from dlm_sway.core.sections import Section | |
| 28 | - | |
| 29 | - | |
| 30 | -class ProbeSpec(BaseModel): | |
| 31 | - """Common fields for every probe's spec entry in ``sway.yaml``.""" | |
| 32 | - | |
| 33 | - model_config = ConfigDict(extra="forbid", frozen=True) | |
| 34 | - | |
| 35 | - name: str | |
| 36 | - """Unique within a suite; surfaces in the report.""" | |
| 37 | - kind: str | |
| 38 | - """Discriminator — must match a registered :class:`Probe` subclass.""" | |
| 39 | - enabled: bool = True | |
| 40 | - """If ``False`` the runner records a :class:`~dlm_sway.core.result.Verdict.SKIP`.""" | |
| 41 | - weight: float = 1.0 | |
| 42 | - """Weight inside the probe's component (adherence / attribution / …).""" | |
| 43 | - | |
| 44 | - | |
| 45 | -@dataclass(frozen=True, slots=True) | |
| 46 | -class RunContext: | |
| 47 | - """What a probe can read beyond its own spec. | |
| 48 | - | |
| 49 | - Probes should receive exactly what they need and nothing more; fat | |
| 50 | - contexts encourage coupling between unrelated probes. | |
| 51 | - | |
| 52 | - Attributes | |
| 53 | - ---------- | |
| 54 | - backend: | |
| 55 | - The differential backend holding base + fine-tuned views. | |
| 56 | - seed: | |
| 57 | - Seed for deterministic probe RNGs (paraphrase sampling, etc). | |
| 58 | - top_k: | |
| 59 | - Default truncation for next-token distributions. | |
| 60 | - sections: | |
| 61 | - Optional list of typed sections (populated by the .dlm bridge; | |
| 62 | - ``None`` when sway is invoked against bare HF+PEFT). | |
| 63 | - doc_text: | |
| 64 | - Raw document text, if available. | |
| 65 | - null_stats: | |
| 66 | - Null-adapter baseline stats for z-score calibration, keyed by | |
| 67 | - probe *kind*. Populated by the runner after it's executed the | |
| 68 | - ``null_adapter`` probe (if configured). | |
| 69 | - """ | |
| 70 | - | |
| 71 | - backend: DifferentialBackend | |
| 72 | - seed: int = 0 | |
| 73 | - top_k: int = 256 | |
| 74 | - sections: tuple[Section, ...] | None = None | |
| 75 | - doc_text: str | None = None | |
| 76 | - null_stats: dict[str, dict[str, float]] = field(default_factory=dict) | |
| 77 | - | |
| 78 | - | |
| 79 | -_REGISTRY: dict[str, type[Probe]] = {} | |
| 80 | - | |
| 81 | - | |
| 82 | -class Probe(ABC): | |
| 83 | - """Concrete probe. One instance per probe spec in the suite.""" | |
| 84 | - | |
| 85 | - kind: ClassVar[str] | |
| 86 | - """The string used in ``sway.yaml``'s ``kind`` field.""" | |
| 87 | - spec_cls: ClassVar[type[ProbeSpec]] | |
| 88 | - """The pydantic model class that validates this probe's spec.""" | |
| 89 | - category: ClassVar[str] = "adherence" | |
| 90 | - """One of: ``adherence``, ``attribution``, ``calibration``, | |
| 91 | - ``ablation``, ``baseline``. Drives composite scoring.""" | |
| 92 | - | |
| 93 | - def __init_subclass__(cls, **kwargs: Any) -> None: | |
| 94 | - super().__init_subclass__(**kwargs) | |
| 95 | - # The abstract class itself has no `kind`; skip registration. | |
| 96 | - if "kind" not in cls.__dict__: | |
| 97 | - return | |
| 98 | - kind = cls.kind | |
| 99 | - if kind in _REGISTRY: | |
| 100 | - raise ValueError(f"duplicate probe kind {kind!r}: {_REGISTRY[kind]!r} vs {cls!r}") | |
| 101 | - _REGISTRY[kind] = cls | |
| 102 | - | |
| 103 | - @abstractmethod | |
| 104 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: ... | |
| 105 | - | |
| 106 | - | |
| 107 | -def registry() -> dict[str, type[Probe]]: | |
| 108 | - """Read-only view of registered probes.""" | |
| 109 | - return dict(_REGISTRY) | |
| 110 | - | |
| 111 | - | |
| 112 | -def build_probe(raw: dict[str, Any]) -> tuple[Probe, ProbeSpec]: | |
| 113 | - """Validate a raw YAML probe entry and return (Probe instance, spec).""" | |
| 114 | - kind = raw.get("kind") | |
| 115 | - if not isinstance(kind, str): | |
| 116 | - raise SpecValidationError( | |
| 117 | - "probe entry missing string 'kind' field", | |
| 118 | - source=str(raw.get("name", "<unknown>")), | |
| 119 | - ) | |
| 120 | - if kind not in _REGISTRY: | |
| 121 | - known = ", ".join(sorted(_REGISTRY)) | |
| 122 | - raise SpecValidationError( | |
| 123 | - f"unknown probe kind {kind!r} (registered: {known})", | |
| 124 | - source=str(raw.get("name", "<unknown>")), | |
| 125 | - ) | |
| 126 | - probe_cls = _REGISTRY[kind] | |
| 127 | - try: | |
| 128 | - spec = probe_cls.spec_cls.model_validate(raw) | |
| 129 | - except ValidationError as exc: | |
| 130 | - raise SpecValidationError(str(exc), source=str(raw.get("name", "<unknown>"))) from exc | |
| 131 | - return probe_cls(), spec | |
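The three-step extension recipe in the module docstring, as a minimal sketch; every name below ("noop", NoopSpec, NoopProbe) is invented for illustration.

    from typing import Literal

    from dlm_sway.core.result import ProbeResult, Verdict
    from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, build_probe


    class NoopSpec(ProbeSpec):
        kind: Literal["noop"] = "noop"


    class NoopProbe(Probe):  # __init_subclass__ registers kind "noop"
        kind = "noop"
        spec_cls = NoopSpec

        def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
            return ProbeResult(
                name=spec.name, kind=spec.kind, verdict=Verdict.PASS,
                score=1.0, message="noop",
            )


    probe, spec = build_probe({"name": "smoke", "kind": "noop"})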
sway/src/dlm_sway/probes/calibration_drift.pydeleted@@ -1,135 +0,0 @@ | ||
| 1 | -"""C2 CalibrationDrift — did we break general knowledge while fitting the doc? | |
| 2 | - | |
| 3 | -The classic small-doc fine-tune failure mode: the adapter learned the | |
| 4 | -document so well that it forgot the world. C2 catches this by scoring | |
| 5 | -base and ft on a packaged set of general-knowledge completions (the | |
| 6 | -``BUILT_IN_PACK`` — a 30-item seed of public-domain grade-school facts) | |
| 7 | -and flagging items whose per-token logprob regressed significantly. | |
| 8 | - | |
| 9 | -A healthy fine-tune: some items drift slightly (mild confidence shift, | |
| 10 | -normal), but essentially none regress by more than a nat. An over-fit | |
| 11 | -fine-tune: 20%+ of items regress; the adapter has torched its ability | |
| 12 | -to answer anything outside the document. | |
| 13 | - | |
| 14 | -Pass when ``fraction_regressed < assert_fraction_regressed_lt`` AND | |
| 15 | -``mean_delta_nats >= assert_mean_delta_gte``. Both thresholds default | |
| 16 | -to values that trigger on genuine damage but tolerate normal drift. | |
| 17 | -""" | |
| 18 | - | |
| 19 | -from __future__ import annotations | |
| 20 | - | |
| 21 | -import statistics | |
| 22 | -from typing import Literal | |
| 23 | - | |
| 24 | -from pydantic import Field | |
| 25 | - | |
| 26 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 27 | -from dlm_sway.probes._calibration_pack import BUILT_IN_PACK | |
| 28 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 29 | - | |
| 30 | - | |
| 31 | -class CalibrationItemSpec(ProbeSpec): | |
| 32 | - """Not used directly — documents the shape of an item override.""" | |
| 33 | - | |
| 34 | - kind: Literal["__calibration_item"] = "__calibration_item" | |
| 35 | - prompt: str = "" | |
| 36 | - gold: str = "" | |
| 37 | - | |
| 38 | - | |
| 39 | -class CalibrationDriftSpec(ProbeSpec): | |
| 40 | - kind: Literal["calibration_drift"] = "calibration_drift" | |
| 41 | - pack: Literal["builtin"] = "builtin" | |
| 42 | - """Source of items. ``"builtin"`` uses :data:`BUILT_IN_PACK`. Custom | |
| 43 | - packs will ship via a file reference in a later milestone.""" | |
| 44 | - items_limit: int | None = None | |
| 45 | - """If set, truncate the pack to this many items (for fast runs).""" | |
| 46 | - assert_fraction_regressed_lt: float = 0.15 | |
| 47 | - assert_mean_delta_gte: float = -0.5 | |
| 48 | - """Mean per-token logprob delta (ft − base) across the pack. Slightly | |
| 49 | - negative is tolerable; deeply negative is not.""" | |
| 50 | - regression_nats: float = 1.0 | |
| 51 | - """How many nats worse an item must get to count as regressed.""" | |
| 52 | - items: list[tuple[str, str]] = Field(default_factory=list) | |
| 53 | - """Optional inline override of the packaged items.""" | |
| 54 | - | |
| 55 | - | |
| 56 | -class CalibrationDriftProbe(Probe): | |
| 57 | - kind = "calibration_drift" | |
| 58 | - spec_cls = CalibrationDriftSpec | |
| 59 | - category = "calibration" | |
| 60 | - | |
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 62 | - assert isinstance(spec, CalibrationDriftSpec) | |
| 63 | - items = list(spec.items) if spec.items else list(BUILT_IN_PACK) | |
| 64 | - if spec.items_limit is not None: | |
| 65 | - items = items[: spec.items_limit] | |
| 66 | - if not items: | |
| 67 | - return ProbeResult( | |
| 68 | - name=spec.name, | |
| 69 | - kind=spec.kind, | |
| 70 | - verdict=Verdict.ERROR, | |
| 71 | - score=None, | |
| 72 | - message="no calibration items", | |
| 73 | - ) | |
| 74 | - | |
| 75 | - deltas: list[float] = [] | |
| 76 | - regressed = 0 | |
| 77 | - worst: list[dict[str, float | str]] = [] | |
| 78 | - | |
| 79 | - for prompt, gold in items: | |
| 80 | - tokens = max(_token_estimate(gold), 1) | |
| 81 | - with ctx.backend.as_base() as b: | |
| 82 | - lp_base = b.logprob_of(prompt, gold) / tokens | |
| 83 | - with ctx.backend.as_finetuned() as f: | |
| 84 | - lp_ft = f.logprob_of(prompt, gold) / tokens | |
| 85 | - delta = lp_ft - lp_base | |
| 86 | - deltas.append(delta) | |
| 87 | - if delta < -spec.regression_nats: | |
| 88 | - regressed += 1 | |
| 89 | - worst.append({"prompt": prompt, "gold": gold, "delta": delta}) | |
| 90 | - | |
| 91 | - # Surface the worst offenders — up to 5. | |
| 92 | - worst.sort(key=lambda d: float(d["delta"])) | |
| 93 | - worst = worst[:5] | |
| 94 | - | |
| 95 | - frac_regressed = regressed / len(items) | |
| 96 | - mean_delta = statistics.fmean(deltas) | |
| 97 | - | |
| 98 | - passed = ( | |
| 99 | - frac_regressed < spec.assert_fraction_regressed_lt | |
| 100 | - and mean_delta >= spec.assert_mean_delta_gte | |
| 101 | - ) | |
| 102 | - verdict = Verdict.PASS if passed else Verdict.FAIL | |
| 103 | - # Score: 1.0 at zero regression + zero drift, declining with either. | |
| 104 | - regress_component = max( | |
| 105 | - 0.0, 1.0 - frac_regressed / max(spec.assert_fraction_regressed_lt, 1e-6) | |
| 106 | - ) | |
| 107 | - drift_component = max(0.0, min(1.0, (mean_delta + 1.0) / 1.5)) | |
| 108 | - score = 0.6 * regress_component + 0.4 * drift_component | |
| 109 | - | |
| 110 | - return ProbeResult( | |
| 111 | - name=spec.name, | |
| 112 | - kind=spec.kind, | |
| 113 | - verdict=verdict, | |
| 114 | - score=score, | |
| 115 | - raw=frac_regressed, | |
| 116 | - base_value=None, | |
| 117 | - ft_value=mean_delta, | |
| 118 | - evidence={ | |
| 119 | - "fraction_regressed": frac_regressed, | |
| 120 | - "mean_delta_nats": mean_delta, | |
| 121 | - "regressed_count": regressed, | |
| 122 | - "total_items": len(items), | |
| 123 | - "worst_offenders": worst, | |
| 124 | - "regression_nats_threshold": spec.regression_nats, | |
| 125 | - "weight": spec.weight, | |
| 126 | - }, | |
| 127 | - message=( | |
| 128 | - f"{regressed}/{len(items)} items regressed >{spec.regression_nats:.1f} nats " | |
| 129 | - f"(frac={frac_regressed:.1%}), mean_delta={mean_delta:+.3f} nats/tok" | |
| 130 | - ), | |
| 131 | - ) | |
| 132 | - | |
| 133 | - | |
| 134 | -def _token_estimate(s: str) -> int: | |
| 135 | - return max(1, len(s) // 4) | |
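The gate arithmetic on illustrative numbers: 3 of 30 items regress by more than a nat and the pack-wide mean drifts by -0.2 nats/token, which passes under the default thresholds above.

    frac_regressed = 3 / 30  # 0.10, under assert_fraction_regressed_lt = 0.15
    mean_delta = -0.2        # nats/token, above assert_mean_delta_gte = -0.5

    passed = frac_regressed < 0.15 and mean_delta >= -0.5
    regress_component = max(0.0, 1.0 - frac_regressed / 0.15)       # ≈ 0.333
    drift_component = max(0.0, min(1.0, (mean_delta + 1.0) / 1.5))  # ≈ 0.533
    score = 0.6 * regress_component + 0.4 * drift_component         # ≈ 0.413
    print(passed, round(score, 3))  # True 0.413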
sway/src/dlm_sway/probes/delta_kl.pydeleted@@ -1,121 +0,0 @@ | ||
| 1 | -"""A1 DeltaKL — the simplest adherence probe. | |
| 2 | - | |
| 3 | -For each prompt, compute the JS (default) or KL divergence between the | |
| 4 | -base and fine-tuned model's next-token distributions at the position | |
| 5 | -after the prompt. Aggregate across prompts with a mean. | |
| 6 | - | |
| 7 | -*What it tells you:* whether the adapter is distinguishable from the base | |
| 8 | -on things the document cares about. A zero-divergence result is a red | |
| 9 | -flag — the adapter is ignored. | |
| 10 | - | |
| 11 | -*What it can't tell you:* whether the change is semantically *correct*. | |
| 12 | -Direction and correctness are what :mod:`dir`, :mod:`adapter_revert`, | |
| 13 | -and the attribution probes cover. | |
| 14 | -""" | |
| 15 | - | |
| 16 | -from __future__ import annotations | |
| 17 | - | |
| 18 | -import statistics | |
| 19 | -from typing import Literal | |
| 20 | - | |
| 21 | -from pydantic import Field | |
| 22 | - | |
| 23 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 24 | -from dlm_sway.probes._divergence import Divergence, divergence, js_ln2 | |
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 26 | -from dlm_sway.probes.null_adapter import get_null_stats | |
| 27 | - | |
| 28 | - | |
| 29 | -class DeltaKLSpec(ProbeSpec): | |
| 30 | - """Spec for ``kind: delta_kl``.""" | |
| 31 | - | |
| 32 | - kind: Literal["delta_kl"] = "delta_kl" | |
| 33 | - prompts: list[str] = Field(default_factory=list) | |
| 34 | - """Inline prompts. Must be non-empty at run time; the .dlm bridge | |
| 35 | - fills them from the document's sections via | |
| 36 | - :mod:`dlm_sway.integrations.dlm.autogen`.""" | |
| 37 | - divergence: Divergence = "js" | |
| 38 | - top_k: int | None = None | |
| 39 | - """Override the suite-wide ``top_k``. ``None`` → use ``ctx.top_k``.""" | |
| 40 | - assert_mean_gte: float = 0.02 | |
| 41 | - """Fixed-threshold pass criterion when no null stats are available.""" | |
| 42 | - assert_z_gte: float = 3.0 | |
| 43 | - """Z-score pass criterion against the null-adapter baseline, when it | |
| 44 | - exists. The more principled metric — prefer this over the raw | |
| 45 | - threshold.""" | |
| 46 | - | |
| 47 | - | |
| 48 | -class DeltaKLProbe(Probe): | |
| 49 | - """The canonical "is the adapter changing anything?" probe.""" | |
| 50 | - | |
| 51 | - kind = "delta_kl" | |
| 52 | - spec_cls = DeltaKLSpec | |
| 53 | - category = "adherence" | |
| 54 | - | |
| 55 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 56 | - assert isinstance(spec, DeltaKLSpec) | |
| 57 | - if not spec.prompts: | |
| 58 | - return ProbeResult( | |
| 59 | - name=spec.name, | |
| 60 | - kind=spec.kind, | |
| 61 | - verdict=Verdict.ERROR, | |
| 62 | - score=None, | |
| 63 | - message="no prompts provided (inline 'prompts' was empty)", | |
| 64 | - ) | |
| 65 | - | |
| 66 | - top_k = spec.top_k if spec.top_k is not None else ctx.top_k | |
| 67 | - divergences: list[float] = [] | |
| 68 | - for prompt in spec.prompts: | |
| 69 | - with ctx.backend.as_base() as base_view: | |
| 70 | - base_dist = base_view.next_token_dist(prompt, top_k=top_k) | |
| 71 | - with ctx.backend.as_finetuned() as ft_view: | |
| 72 | - ft_dist = ft_view.next_token_dist(prompt, top_k=top_k) | |
| 73 | - divergences.append(divergence(base_dist, ft_dist, kind=spec.divergence)) | |
| 74 | - | |
| 75 | - raw_mean = statistics.fmean(divergences) | |
| 76 | - raw_max = max(divergences) | |
| 77 | - | |
| 78 | - # Null-adapter calibration wins when available. | |
| 79 | - null = get_null_stats(ctx, spec.kind) | |
| 80 | - z = None | |
| 81 | - if null is not None and null.get("std", 0.0) > 0.0: | |
| 82 | - z = (raw_mean - null["mean"]) / null["std"] | |
| 83 | - verdict = Verdict.PASS if z >= spec.assert_z_gte else Verdict.FAIL | |
| 84 | - message = f"mean {spec.divergence}={raw_mean:.4f}, z={z:+.2f}σ vs null" | |
| 85 | - else: | |
| 86 | - verdict = Verdict.PASS if raw_mean >= spec.assert_mean_gte else Verdict.FAIL | |
| 87 | - message = ( | |
| 88 | - f"mean {spec.divergence}={raw_mean:.4f} " | |
| 89 | - f"({'≥' if verdict == Verdict.PASS else '<'} {spec.assert_mean_gte})" | |
| 90 | - ) | |
| 91 | - | |
| 92 | -        # Normalized score for the composite: a sigmoid of z when null stats | |
| 93 | -        # exist, else raw divergence over its bound (ln 2 for JS); both land in [0, 1]. | |
| 94 | - if z is not None: | |
| 95 | - score = _sigmoid(z / 3.0) | |
| 96 | - else: | |
| 97 | - bound = js_ln2() if spec.divergence == "js" else 1.0 | |
| 98 | - score = min(1.0, raw_mean / bound) if bound > 0.0 else 0.0 | |
| 99 | - | |
| 100 | - return ProbeResult( | |
| 101 | - name=spec.name, | |
| 102 | - kind=spec.kind, | |
| 103 | - verdict=verdict, | |
| 104 | - score=score, | |
| 105 | - raw=raw_mean, | |
| 106 | - z_score=z, | |
| 107 | - evidence={ | |
| 108 | - "divergence_kind": spec.divergence, | |
| 109 | - "per_prompt": divergences, | |
| 110 | - "max": raw_max, | |
| 111 | - "num_prompts": len(spec.prompts), | |
| 112 | - "weight": spec.weight, | |
| 113 | - }, | |
| 114 | - message=message, | |
| 115 | - ) | |
| 116 | - | |
| 117 | - | |
| 118 | -def _sigmoid(x: float) -> float: | |
| 119 | - import math | |
| 120 | - | |
| 121 | - return 1.0 / (1.0 + math.exp(-x)) | |
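The `divergence` helper imported from `dlm_sway.probes._divergence` is not shown in this diff. As a reference for what the probe aggregates, here is a minimal sketch of JS divergence over two top-k token distributions; the dict-of-token-to-prob representation is an assumption, not the shipped module:

```python
# Minimal sketch, not the real _divergence module: JS divergence in nats
# over two top-k next-token distributions, assumed to be dicts of token -> prob.
import math

def js_divergence(p: dict[str, float], q: dict[str, float]) -> float:
    keys = set(p) | set(q)
    m = {k: 0.5 * (p.get(k, 0.0) + q.get(k, 0.0)) for k in keys}

    def kl(a: dict[str, float], b: dict[str, float]) -> float:
        # KL(a || b); b = m covers a's support, so no division by zero.
        return sum(a[k] * math.log(a[k] / b[k]) for k in a if a[k] > 0.0)

    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

assert js_divergence({"a": 1.0}, {"a": 1.0}) == 0.0
# Disjoint supports hit the ln(2) bound the probe's score normalizes by.
assert abs(js_divergence({"a": 1.0}, {"b": 1.0}) - math.log(2.0)) < 1e-12
```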
sway/src/dlm_sway/probes/leakage.py deleted @@ -1,194 +0,0 @@ | ||
| 1 | -"""C3 LeakageSusceptibility — can the fine-tuned model recite training text verbatim? | |
| 2 | - | |
| 3 | -For each PROSE section, take the first ``prefix_chars`` as a trigger and | |
| 4 | -greedy-generate a continuation. Measure how much of the actual section | |
| 5 | -continuation the model recovers (via LCS ratio). Also re-run under | |
| 6 | -small prefix perturbations (typo, case flip, punctuation change) and | |
| 7 | -report the **fragility** — a genuinely generalized model degrades | |
| 8 | -smoothly under perturbation; a memorizer drops off a cliff. | |
| 9 | - | |
| 10 | -Default pass: ``greedy_recall < 0.5``. That default is tuned for the | |
| 11 | -common "don't leak my document" use case. Sections tagged ``intent: | |
| 12 | -memorize`` invert the interpretation — the .dlm bridge handles that | |
| 13 | -flip at spec-generation time. | |
| 14 | -""" | |
| 15 | - | |
| 16 | -from __future__ import annotations | |
| 17 | - | |
| 18 | -import difflib | |
| 19 | -import statistics | |
| 20 | -from typing import Literal | |
| 21 | - | |
| 22 | -from pydantic import Field | |
| 23 | - | |
| 24 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 26 | - | |
| 27 | -PerturbationKind = Literal["typo", "case_flip", "drop_punct"] | |
| 28 | - | |
| 29 | - | |
| 30 | -def _default_perturbations() -> list[PerturbationKind]: | |
| 31 | - return ["typo", "case_flip", "drop_punct"] | |
| 32 | - | |
| 33 | - | |
| 34 | -class LeakageSusceptibilitySpec(ProbeSpec): | |
| 35 | - kind: Literal["leakage"] = "leakage" | |
| 36 | - prefix_chars: int = 128 | |
| 37 | - continuation_chars: int = 256 | |
| 38 | - max_new_tokens: int = 96 | |
| 39 | - perturbations: list[PerturbationKind] = Field(default_factory=_default_perturbations) | |
| 40 | - assert_recall_lt: float = 0.5 | |
| 41 | - """Default anti-leak gate: pass when verbatim recall is modest. Invert | |
| 42 | - by bumping this to ``>1.0`` when intentional memorization is desired.""" | |
| 43 | - min_fragility: float = 0.3 | |
| 44 | - """Fragility = (clean - perturbed) / max(clean, eps). A low value | |
| 45 | - with high recall indicates true memorization; a high value suggests | |
| 46 | - the model generalized and recall was incidental.""" | |
| 47 | - | |
| 48 | - | |
| 49 | -class LeakageSusceptibilityProbe(Probe): | |
| 50 | - kind = "leakage" | |
| 51 | - spec_cls = LeakageSusceptibilitySpec | |
| 52 | - category = "calibration" | |
| 53 | - | |
| 54 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 55 | - assert isinstance(spec, LeakageSusceptibilitySpec) | |
| 56 | - if ctx.sections is None: | |
| 57 | - return ProbeResult( | |
| 58 | - name=spec.name, | |
| 59 | - kind=spec.kind, | |
| 60 | - verdict=Verdict.SKIP, | |
| 61 | - score=None, | |
| 62 | - message="no sections in context — provide via the .dlm bridge", | |
| 63 | - ) | |
| 64 | - prose = [s for s in ctx.sections if s.kind == "prose" and s.content.strip()] | |
| 65 | - if not prose: | |
| 66 | - return ProbeResult( | |
| 67 | - name=spec.name, | |
| 68 | - kind=spec.kind, | |
| 69 | - verdict=Verdict.SKIP, | |
| 70 | - score=None, | |
| 71 | - message="no PROSE sections to test for leakage", | |
| 72 | - ) | |
| 73 | - | |
| 74 | - clean_recalls: list[float] = [] | |
| 75 | - perturbed_recalls: list[float] = [] | |
| 76 | - per_section: list[dict[str, float | str]] = [] | |
| 77 | - | |
| 78 | - with ctx.backend.as_finetuned() as ft: | |
| 79 | - for s in prose: | |
| 80 | - prefix = s.content[: spec.prefix_chars] | |
| 81 | - target = s.content[spec.prefix_chars : spec.prefix_chars + spec.continuation_chars] | |
| 82 | - if not target.strip(): | |
| 83 | - continue | |
| 84 | - clean_gen = ft.generate(prefix, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | |
| 85 | - clean = _lcs_ratio(clean_gen, target) | |
| 86 | - clean_recalls.append(clean) | |
| 87 | - | |
| 88 | - per_sec_perturbed: list[float] = [] | |
| 89 | - for perturbation in spec.perturbations: | |
| 90 | - perturbed_prefix = _perturb(prefix, perturbation) | |
| 91 | - perturbed_gen = ft.generate( | |
| 92 | - perturbed_prefix, | |
| 93 | - max_new_tokens=spec.max_new_tokens, | |
| 94 | - seed=ctx.seed, | |
| 95 | - ) | |
| 96 | - per_sec_perturbed.append(_lcs_ratio(perturbed_gen, target)) | |
| 97 | - mean_pert = statistics.fmean(per_sec_perturbed) if per_sec_perturbed else clean | |
| 98 | - perturbed_recalls.append(mean_pert) | |
| 99 | - | |
| 100 | - per_section.append( | |
| 101 | - { | |
| 102 | - "section_id": s.id, | |
| 103 | - "clean_recall": clean, | |
| 104 | - "perturbed_recall": mean_pert, | |
| 105 | - "fragility": _fragility(clean, mean_pert), | |
| 106 | - } | |
| 107 | - ) | |
| 108 | - | |
| 109 | - if not clean_recalls: | |
| 110 | - return ProbeResult( | |
| 111 | - name=spec.name, | |
| 112 | - kind=spec.kind, | |
| 113 | - verdict=Verdict.SKIP, | |
| 114 | - score=None, | |
| 115 | - message="no PROSE sections had scorable continuations", | |
| 116 | - ) | |
| 117 | - | |
| 118 | - mean_clean = statistics.fmean(clean_recalls) | |
| 119 | - mean_pert = statistics.fmean(perturbed_recalls) | |
| 120 | - mean_fragility = _fragility(mean_clean, mean_pert) | |
| 121 | - | |
| 122 | - verdict = ( | |
| 123 | - Verdict.PASS | |
| 124 | - if mean_clean < spec.assert_recall_lt or mean_fragility >= spec.min_fragility | |
| 125 | - else Verdict.FAIL | |
| 126 | - ) | |
| 127 | - # Score: 1.0 at zero recall, declining as recall approaches threshold. | |
| 128 | - recall_score = max(0.0, min(1.0, 1.0 - mean_clean / max(spec.assert_recall_lt, 1e-6))) | |
| 129 | - # Bonus: high fragility is good (genuine generalization). | |
| 130 | - fragility_bonus = min(1.0, max(0.0, mean_fragility / max(spec.min_fragility, 1e-6))) | |
| 131 | - score = 0.7 * recall_score + 0.3 * fragility_bonus | |
| 132 | - | |
| 133 | - return ProbeResult( | |
| 134 | - name=spec.name, | |
| 135 | - kind=spec.kind, | |
| 136 | - verdict=verdict, | |
| 137 | - score=score, | |
| 138 | - raw=mean_clean, | |
| 139 | - base_value=None, | |
| 140 | - ft_value=mean_fragility, | |
| 141 | - evidence={ | |
| 142 | - "mean_clean_recall": mean_clean, | |
| 143 | - "mean_perturbed_recall": mean_pert, | |
| 144 | - "mean_fragility": mean_fragility, | |
| 145 | - "per_section": per_section[:10], | |
| 146 | - "weight": spec.weight, | |
| 147 | - }, | |
| 148 | - message=( | |
| 149 | - f"greedy_recall={mean_clean:.2f} " | |
| 150 | - f"(perturbed={mean_pert:.2f}, fragility={mean_fragility:.2f})" | |
| 151 | - ), | |
| 152 | - ) | |
| 153 | - | |
| 154 | - | |
| 155 | -# -- helpers ----------------------------------------------------------- | |
| 156 | - | |
| 157 | - | |
| 158 | -def _lcs_ratio(generated: str, target: str) -> float: | |
| 159 | - """Longest common subsequence ratio via difflib. | |
| 160 | - | |
| 161 | - Returns 0 for empty inputs, 1.0 for identical strings. difflib's | |
| 162 | - ``ratio`` is a gestalt similarity; close enough to a true LCS for | |
| 163 | - our purposes and has no external deps. | |
| 164 | - """ | |
| 165 | - if not generated or not target: | |
| 166 | - return 0.0 | |
| 167 | - return difflib.SequenceMatcher(None, generated, target).ratio() | |
| 168 | - | |
| 169 | - | |
| 170 | -def _perturb(text: str, kind: str) -> str: | |
| 171 | - """Apply a deterministic textual perturbation.""" | |
| 172 | - if not text: | |
| 173 | - return text | |
| 174 | - if kind == "typo": | |
| 175 | - # Swap the first two characters; trivial typo the model must reconstruct. | |
| 176 | - if len(text) < 2: | |
| 177 | - return text | |
| 178 | - return text[1] + text[0] + text[2:] | |
| 179 | - if kind == "case_flip": | |
| 180 | - # Flip case of the first alpha char. | |
| 181 | - for i, ch in enumerate(text): | |
| 182 | - if ch.isalpha(): | |
| 183 | - flipped = ch.lower() if ch.isupper() else ch.upper() | |
| 184 | - return text[:i] + flipped + text[i + 1 :] | |
| 185 | - return text | |
| 186 | - if kind == "drop_punct": | |
| 187 | - return "".join(ch for ch in text if ch not in ".,;:!?-—") | |
| 188 | - raise ValueError(f"unknown perturbation: {kind!r}") | |
| 189 | - | |
| 190 | - | |
| 191 | -def _fragility(clean: float, perturbed: float) -> float: | |
| 192 | - if clean <= 0.0: | |
| 193 | - return 0.0 | |
| 194 | - return max(0.0, (clean - perturbed) / clean) | |
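To make the fragility arithmetic concrete, a toy walk-through assuming the module helpers above are importable; the recall numbers are illustrative, not real model output:

```python
# Deterministic perturbations on a toy prefix.
prefix = "The quick brown fox jumps over the lazy dog."
assert _perturb(prefix, "typo") == "hTe quick brown fox jumps over the lazy dog."
assert _perturb(prefix, "case_flip").startswith("t")  # first alpha char flipped
assert "." not in _perturb(prefix, "drop_punct")

# A memorizer: high clean recall that collapses under perturbation.
clean, perturbed = 0.92, 0.31
assert abs(_fragility(clean, perturbed) - (0.92 - 0.31) / 0.92) < 1e-12
# fragility ~0.66 >= min_fragility=0.3, so even high recall can still PASS.
```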
sway/src/dlm_sway/probes/null_adapter.py deleted @@ -1,144 +0,0 @@ | ||
| 1 | -"""Null-adapter baseline probe. | |
| 2 | - | |
| 3 | -Every numeric primitive reports its raw metric *and* a z-score against a | |
| 4 | -null-adapter distribution. This probe is the runtime engine that | |
| 5 | -establishes that distribution — it builds random-init "null" adapters | |
| 6 | -(structurally identical to the real adapter but with weights drawn from | |
| 7 | -a Gaussian) and measures how much signal they produce. | |
| 8 | - | |
| 9 | -The resulting ``(mean, std, n)`` per kind is attached to this probe's | |
| 10 | -``evidence["null_stats"]``. The runner picks it up and threads it into | |
| 11 | -:attr:`RunContext.null_stats`, where every downstream probe can read it | |
| 12 | -and turn a raw metric into a z-score. | |
| 13 | - | |
| 14 | -Backends that don't implement :class:`~dlm_sway.core.scoring.NullCalibratedBackend` | |
| 15 | -cause this probe to :attr:`Verdict.SKIP` — downstream probes fall back | |
| 16 | -to their fixed thresholds in that case. | |
| 17 | -""" | |
| 18 | - | |
| 19 | -from __future__ import annotations | |
| 20 | - | |
| 21 | -import statistics | |
| 22 | -from typing import Literal | |
| 23 | - | |
| 24 | -from pydantic import Field | |
| 25 | - | |
| 26 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 27 | -from dlm_sway.core.scoring import NullCalibratedBackend | |
| 28 | -from dlm_sway.probes._divergence import divergence | |
| 29 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 30 | - | |
| 31 | - | |
| 32 | -class NullAdapterSpec(ProbeSpec): | |
| 33 | - """Spec for ``kind: null_adapter``. | |
| 34 | - | |
| 35 | - Authors place this probe **first** in the suite so its output | |
| 36 | - populates :attr:`RunContext.null_stats` before subsequent probes | |
| 37 | - consult it. | |
| 38 | - """ | |
| 39 | - | |
| 40 | - kind: Literal["null_adapter"] = "null_adapter" | |
| 41 | - runs: int = Field(default=3, ge=1, le=10) | |
| 42 | - """Number of independent null adapters to evaluate. Three is the | |
| 43 | - smallest that yields a usable std; more is better but quickly | |
| 44 | - dominates suite runtime.""" | |
| 45 | - prompts: list[str] = Field(default_factory=list) | |
| 46 | - """Prompt set for null calibration. Keep small — calibration runs | |
|   | 47 |   | -    ``runs × len(prompts)`` pairs of forward passes. 4–8 prompts is typical. |  |
| 48 | - If empty, a minimal built-in prompt set is used so the probe | |
| 49 | - always produces stats.""" | |
| 50 | - init_scale: float = 0.02 | |
| 51 | - """Stddev of the zero-mean Gaussian used to fill lora_A/lora_B.""" | |
| 52 | - seed_base: int = 1000 | |
| 53 | - """First seed; successive runs use ``seed_base + run_idx``.""" | |
| 54 | - | |
| 55 | - | |
| 56 | -_DEFAULT_PROMPTS: tuple[str, ...] = ( | |
| 57 | - "The quick brown fox", | |
| 58 | - "Once upon a time", | |
| 59 | - "In this document we explain", | |
| 60 | - "The key takeaway is", | |
| 61 | - "An important point to remember", | |
| 62 | -) | |
| 63 | - | |
| 64 | - | |
| 65 | -class NullAdapterProbe(Probe): | |
| 66 | - """Populate ``ctx.null_stats``; report a :attr:`Verdict.PASS` verdict itself. | |
| 67 | - | |
| 68 | - The probe never fails on its own terms — its *job* is calibration. | |
| 69 | - Downstream probes pick up :attr:`RunContext.null_stats` keyed by | |
| 70 | - probe kind (``delta_kl``, ``adapter_ablation`` …) and use the | |
| 71 | - populated mean/std to z-score their own raw metrics. | |
| 72 | - """ | |
| 73 | - | |
| 74 | - kind = "null_adapter" | |
| 75 | - spec_cls = NullAdapterSpec | |
| 76 | - category = "baseline" | |
| 77 | - | |
| 78 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 79 | - assert isinstance(spec, NullAdapterSpec) | |
| 80 | - if not isinstance(ctx.backend, NullCalibratedBackend): | |
| 81 | - return ProbeResult( | |
| 82 | - name=spec.name, | |
| 83 | - kind=spec.kind, | |
| 84 | - verdict=Verdict.SKIP, | |
| 85 | - score=None, | |
| 86 | - message=( | |
| 87 | - "backend does not implement NullCalibratedBackend — " | |
| 88 | - "numeric probes will fall back to fixed thresholds" | |
| 89 | - ), | |
| 90 | - ) | |
| 91 | - prompts = list(spec.prompts) or list(_DEFAULT_PROMPTS) | |
| 92 | - | |
| 93 | - per_seed_means: list[float] = [] | |
| 94 | - for run_idx in range(spec.runs): | |
| 95 | - seed = spec.seed_base + run_idx | |
| 96 | - per_prompt: list[float] = [] | |
| 97 | - for prompt in prompts: | |
| 98 | - with ctx.backend.as_base() as base_view: | |
| 99 | - base_dist = base_view.next_token_dist(prompt, top_k=ctx.top_k) | |
| 100 | - with ctx.backend.as_null_adapter(seed, init_scale=spec.init_scale) as null_view: | |
| 101 | - null_dist = null_view.next_token_dist(prompt, top_k=ctx.top_k) | |
| 102 | - per_prompt.append(divergence(base_dist, null_dist, kind="js")) | |
| 103 | - per_seed_means.append(statistics.fmean(per_prompt) if per_prompt else 0.0) | |
| 104 | - | |
| 105 | - mean = statistics.fmean(per_seed_means) | |
| 106 | - std = statistics.pstdev(per_seed_means) if len(per_seed_means) > 1 else 0.0 | |
| 107 | - | |
| 108 | - # Publish per-kind stats. delta_kl is the primary kind; other | |
| 109 | - # divergence-based probes (adapter_ablation) share this scale. | |
| 110 | - null_stats = { | |
| 111 | - "delta_kl": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)}, | |
| 112 | - "adapter_ablation": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)}, | |
| 113 | - } | |
| 114 | - | |
| 115 | - return ProbeResult( | |
| 116 | - name=spec.name, | |
| 117 | - kind=spec.kind, | |
| 118 | - verdict=Verdict.PASS, | |
| 119 | - score=1.0, | |
| 120 | - raw=mean, | |
| 121 | - evidence={ | |
| 122 | - "null_stats": null_stats, | |
| 123 | - "per_seed_mean_js": per_seed_means, | |
| 124 | - "init_scale": spec.init_scale, | |
| 125 | - "runs": spec.runs, | |
| 126 | - "num_prompts": len(prompts), | |
| 127 | - "weight": spec.weight, | |
| 128 | - }, | |
| 129 | - message=( | |
| 130 | - f"null JS divergence μ={mean:.4f} ± {std:.4f} " | |
| 131 | - f"(over {spec.runs} seeds × {len(prompts)} prompts) — " | |
| 132 | - f"downstream probes will z-score against this baseline" | |
| 133 | - ), | |
| 134 | - ) | |
| 135 | - | |
| 136 | - | |
| 137 | -def get_null_stats(ctx: RunContext, probe_kind: str) -> dict[str, float] | None: | |
| 138 | - """Look up null-adapter stats for ``probe_kind``. | |
| 139 | - | |
| 140 | - Returns ``{"mean": …, "std": …, "n": …}`` when calibration ran for | |
| 141 | - this kind, else ``None``. Probes treat ``None`` as "fall back to the | |
| 142 | - fixed threshold from your spec." | |
| 143 | - """ | |
| 144 | - return ctx.null_stats.get(probe_kind) | |
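Downstream consumption is a one-liner. How a probe like `delta_kl` turns these stats into a z-score, with illustrative numbers:

```python
# What get_null_stats hands back, and how a probe z-scores against it.
null = {"mean": 0.004, "std": 0.001, "n": 3.0}  # illustrative calibration stats
raw_mean = 0.031                                # the probe's own raw metric

z = (raw_mean - null["mean"]) / null["std"]
assert abs(z - 27.0) < 1e-6  # far above the default assert_z_gte=3.0 -> PASS
```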
sway/src/dlm_sway/probes/paraphrase_invariance.py deleted @@ -1,148 +0,0 @@ | ||
| 1 | -"""B2 ParaphraseInvariance — memorization vs generalization, per case. | |
| 2 | - | |
| 3 | -For each ``(prompt, gold, paraphrases)`` test case: | |
| 4 | - | |
| 5 | -- ``verbatim_lift``: Δ-per-token = logprob_ft(prompt, gold) - logprob_base(prompt, gold) | |
| 6 | -- ``paraphrase_lift``: mean Δ-per-token over the paraphrased prompts | |
| 7 | - | |
| 8 | -A model that memorized the exact prompt has high ``verbatim_lift`` but | |
| 9 | -near-zero ``paraphrase_lift``. A model that learned the underlying | |
| 10 | -*pattern* has both values positive and close to each other. | |
| 11 | - | |
| 12 | -We report: | |
| 13 | - | |
| 14 | -- ``generalization_ratio = paraphrase_lift / max(verbatim_lift, eps)`` | |
| 15 | -- ``verbatim_score``: whether the adapter significantly moved the | |
| 16 | - verbatim-prompt logprob (sanity check) | |
| 17 | - | |
| 18 | -The pass criterion depends on the stated intent: by default we require | |
| 19 | -both high verbatim lift and high generalization ratio. If the spec's | |
| 20 | -``intent`` is ``"memorize"``, the ratio requirement inverts — we *want* | |
| 21 | -verbatim >> paraphrase. | |
| 22 | -""" | |
| 23 | - | |
| 24 | -from __future__ import annotations | |
| 25 | - | |
| 26 | -import statistics | |
| 27 | -from typing import Literal | |
| 28 | - | |
| 29 | -from pydantic import BaseModel, ConfigDict, Field | |
| 30 | - | |
| 31 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 32 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 33 | - | |
| 34 | -Intent = Literal["generalize", "memorize", "both"] | |
| 35 | - | |
| 36 | - | |
| 37 | -class ParaphraseCase(BaseModel): | |
| 38 | - """One paraphrase-invariance case.""" | |
| 39 | - | |
| 40 | - model_config = ConfigDict(extra="forbid", frozen=True) | |
| 41 | - | |
| 42 | - prompt: str | |
| 43 | - gold: str | |
| 44 | - paraphrases: list[str] = Field(default_factory=list, min_length=1) | |
| 45 | - | |
| 46 | - | |
| 47 | -class ParaphraseInvarianceSpec(ProbeSpec): | |
| 48 | - kind: Literal["paraphrase_invariance"] = "paraphrase_invariance" | |
| 49 | - cases: list[ParaphraseCase] = Field(default_factory=list) | |
| 50 | - intent: Intent = "generalize" | |
| 51 | - min_verbatim_lift: float = 0.2 | |
| 52 | - min_generalization_ratio: float = 0.5 | |
| 53 | - max_generalization_ratio_if_memorize: float = 0.5 | |
| 54 | - | |
| 55 | - | |
| 56 | -class ParaphraseInvarianceProbe(Probe): | |
| 57 | - kind = "paraphrase_invariance" | |
| 58 | - spec_cls = ParaphraseInvarianceSpec | |
| 59 | - category = "attribution" | |
| 60 | - | |
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 62 | - assert isinstance(spec, ParaphraseInvarianceSpec) | |
| 63 | - if not spec.cases: | |
| 64 | - return ProbeResult( | |
| 65 | - name=spec.name, | |
| 66 | - kind=spec.kind, | |
| 67 | - verdict=Verdict.ERROR, | |
| 68 | - score=None, | |
| 69 | - message="no cases provided", | |
| 70 | - ) | |
| 71 | - | |
| 72 | - verbatim_lifts: list[float] = [] | |
| 73 | - paraphrase_lifts: list[float] = [] | |
| 74 | - per_case: list[dict[str, float | str]] = [] | |
| 75 | - | |
| 76 | - for case in spec.cases: | |
| 77 | - tokens = max(_token_estimate(case.gold), 1) | |
| 78 | - with ctx.backend.as_base() as b: | |
| 79 | - lp_base_verb = b.logprob_of(case.prompt, case.gold) / tokens | |
| 80 | - lp_base_par = [b.logprob_of(p, case.gold) / tokens for p in case.paraphrases] | |
| 81 | - with ctx.backend.as_finetuned() as f: | |
| 82 | - lp_ft_verb = f.logprob_of(case.prompt, case.gold) / tokens | |
| 83 | - lp_ft_par = [f.logprob_of(p, case.gold) / tokens for p in case.paraphrases] | |
| 84 | - | |
| 85 | - verb_lift = lp_ft_verb - lp_base_verb | |
| 86 | - par_lift = statistics.fmean( | |
| 87 | - (ft - base) for base, ft in zip(lp_base_par, lp_ft_par, strict=True) | |
| 88 | - ) | |
| 89 | - verbatim_lifts.append(verb_lift) | |
| 90 | - paraphrase_lifts.append(par_lift) | |
| 91 | - per_case.append( | |
| 92 | - { | |
| 93 | - "prompt": case.prompt[:80], | |
| 94 | - "verbatim_lift": verb_lift, | |
| 95 | - "paraphrase_lift": par_lift, | |
| 96 | - } | |
| 97 | - ) | |
| 98 | - | |
| 99 | - mean_verb = statistics.fmean(verbatim_lifts) | |
| 100 | - mean_par = statistics.fmean(paraphrase_lifts) | |
| 101 | - ratio = mean_par / mean_verb if abs(mean_verb) > 1e-9 else 0.0 | |
| 102 | - | |
| 103 | - verdict, score, msg = _decide(spec, mean_verb, mean_par, ratio) | |
| 104 | - | |
| 105 | - return ProbeResult( | |
| 106 | - name=spec.name, | |
| 107 | - kind=spec.kind, | |
| 108 | - verdict=verdict, | |
| 109 | - score=score, | |
| 110 | - raw=ratio, | |
| 111 | - base_value=mean_verb, | |
| 112 | - ft_value=mean_par, | |
| 113 | - evidence={ | |
| 114 | - "verbatim_lift_mean": mean_verb, | |
| 115 | - "paraphrase_lift_mean": mean_par, | |
| 116 | - "generalization_ratio": ratio, | |
| 117 | - "intent": spec.intent, | |
| 118 | - "per_case": per_case[:8], | |
| 119 | - "weight": spec.weight, | |
| 120 | - }, | |
| 121 | - message=msg, | |
| 122 | - ) | |
| 123 | - | |
| 124 | - | |
| 125 | -def _decide( | |
| 126 | - spec: ParaphraseInvarianceSpec, verb: float, par: float, ratio: float | |
| 127 | -) -> tuple[Verdict, float, str]: | |
| 128 | - """Apply the intent-aware pass rule and return (verdict, score, message).""" | |
| 129 | - base_msg = f"verb={verb:+.3f}, para={par:+.3f}, ratio={ratio:.2f}" | |
| 130 | - if spec.intent == "memorize": | |
| 131 | - verd = ( | |
| 132 | - Verdict.PASS | |
| 133 | - if verb >= spec.min_verbatim_lift and ratio <= spec.max_generalization_ratio_if_memorize | |
| 134 | - else Verdict.FAIL | |
| 135 | - ) | |
| 136 | - score = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6))) | |
| 137 | - return verd, score, f"{base_msg} — intent=memorize" | |
| 138 | - # Default: generalize (or "both") | |
| 139 | - passed = verb >= spec.min_verbatim_lift and ratio >= spec.min_generalization_ratio | |
| 140 | - verd = Verdict.PASS if passed else Verdict.FAIL | |
| 141 | - gen_component = min(1.0, max(0.0, ratio / max(spec.min_generalization_ratio, 1e-6))) | |
| 142 | - verb_component = min(1.0, max(0.0, verb / max(spec.min_verbatim_lift, 1e-6))) | |
| 143 | - score = 0.5 * gen_component + 0.5 * verb_component | |
| 144 | - return verd, score, f"{base_msg} — intent={spec.intent}" | |
| 145 | - | |
| 146 | - | |
| 147 | -def _token_estimate(s: str) -> int: | |
| 148 | - return max(1, len(s) // 4) | |
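Two illustrative regimes pushed through the generalization ratio above; the per-token logprob lifts are made up, not measured:

```python
# Memorizer vs. generalizer through the generalization ratio.
memorizer = {"verbatim_lift": 0.80, "paraphrase_lift": 0.05}
generalizer = {"verbatim_lift": 0.40, "paraphrase_lift": 0.32}

for label, m in (("memorizer", memorizer), ("generalizer", generalizer)):
    ratio = m["paraphrase_lift"] / max(m["verbatim_lift"], 1e-9)
    print(f"{label}: ratio={ratio:.2f}")
# memorizer: ratio=0.06  -> FAIL under intent="generalize" (ratio < 0.5)
# generalizer: ratio=0.80 -> PASS (verbatim_lift >= 0.2 and ratio >= 0.5)
```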
sway/src/dlm_sway/probes/preference_flip.py deleted @@ -1,140 +0,0 @@ | ||
| 1 | -"""B3 PreferenceFlip — did DPO/ORPO actually flip the chosen/rejected ranking? | |
| 2 | - | |
| 3 | -For each ``(prompt, chosen, rejected)`` triple, compute the margin | |
| 4 | - | |
| 5 | -.. math:: | |
| 6 | - m = \\log p(\\text{chosen} \\mid \\text{prompt}) - \\log p(\\text{rejected} \\mid \\text{prompt}) | |
| 7 | - | |
| 8 | -under both base and fine-tuned views. Interesting triples are the ones | |
| 9 | -where base got the sign *wrong* (``m_base < 0``); we fail if the | |
| 10 | -fine-tune doesn't flip a large enough fraction of them. | |
| 11 | - | |
| 12 | -Triples come from either an inline ``triples:`` block in the spec or | |
| 13 | -from PREFERENCE sections in :attr:`RunContext.sections`. The probe | |
| 14 | -returns :attr:`Verdict.SKIP` when no triples are present — this is the | |
| 15 | -"no PREFERENCE sections in your document" case, graceful by design. | |
| 16 | -""" | |
| 17 | - | |
| 18 | -from __future__ import annotations | |
| 19 | - | |
| 20 | -import statistics | |
| 21 | -from typing import Literal | |
| 22 | - | |
| 23 | -from pydantic import BaseModel, ConfigDict, Field | |
| 24 | - | |
| 25 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 26 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 27 | - | |
| 28 | - | |
| 29 | -class PreferenceTriple(BaseModel): | |
| 30 | - model_config = ConfigDict(extra="forbid", frozen=True) | |
| 31 | - | |
| 32 | - prompt: str | |
| 33 | - chosen: str | |
| 34 | - rejected: str | |
| 35 | - | |
| 36 | - | |
| 37 | -class PreferenceFlipSpec(ProbeSpec): | |
| 38 | - kind: Literal["preference_flip"] = "preference_flip" | |
| 39 | - triples: list[PreferenceTriple] = Field(default_factory=list) | |
| 40 | - """Inline triples. If empty, the probe pulls from PREFERENCE | |
| 41 | - sections in ctx.sections; if neither is available the probe SKIPs.""" | |
| 42 | - assert_flip_rate_gte: float = 0.7 | |
| 43 | - """Fraction of *base-wrong* triples that must flip under ft.""" | |
| 44 | - min_triples_for_decision: int = 3 | |
| 45 | - | |
| 46 | - | |
| 47 | -class PreferenceFlipProbe(Probe): | |
| 48 | - kind = "preference_flip" | |
| 49 | - spec_cls = PreferenceFlipSpec | |
| 50 | - category = "attribution" | |
| 51 | - | |
| 52 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 53 | - assert isinstance(spec, PreferenceFlipSpec) | |
| 54 | - triples = list(spec.triples) or _triples_from_sections(ctx) | |
| 55 | - if not triples: | |
| 56 | - return ProbeResult( | |
| 57 | - name=spec.name, | |
| 58 | - kind=spec.kind, | |
| 59 | - verdict=Verdict.SKIP, | |
| 60 | - score=None, | |
| 61 | - message="no preference triples (inline or from sections)", | |
| 62 | - ) | |
| 63 | - | |
| 64 | - base_margins: list[float] = [] | |
| 65 | - ft_margins: list[float] = [] | |
| 66 | - for t in triples: | |
| 67 | - with ctx.backend.as_base() as b: | |
| 68 | - base_margins.append( | |
| 69 | - b.logprob_of(t.prompt, t.chosen) - b.logprob_of(t.prompt, t.rejected) | |
| 70 | - ) | |
| 71 | - with ctx.backend.as_finetuned() as f: | |
| 72 | - ft_margins.append( | |
| 73 | - f.logprob_of(t.prompt, t.chosen) - f.logprob_of(t.prompt, t.rejected) | |
| 74 | - ) | |
| 75 | - | |
| 76 | - # Interesting denominator: base got it wrong. | |
| 77 | - base_wrong_idx = [i for i, m in enumerate(base_margins) if m < 0] | |
| 78 | - flipped_idx = [i for i in base_wrong_idx if ft_margins[i] > 0] | |
| 79 | - | |
| 80 | - if len(base_wrong_idx) < spec.min_triples_for_decision: | |
| 81 | - # Not enough base-wrong triples to decide. Fall back to mean margin delta. | |
| 82 | - mean_delta = statistics.fmean( | |
| 83 | - (ft - base) for base, ft in zip(base_margins, ft_margins, strict=True) | |
| 84 | - ) | |
| 85 | - verdict = Verdict.WARN | |
| 86 | - return ProbeResult( | |
| 87 | - name=spec.name, | |
| 88 | - kind=spec.kind, | |
| 89 | - verdict=verdict, | |
| 90 | - score=max(0.0, min(1.0, 0.5 + mean_delta / 4.0)), | |
| 91 | - raw=mean_delta, | |
| 92 | - base_value=statistics.fmean(base_margins), | |
| 93 | - ft_value=statistics.fmean(ft_margins), | |
| 94 | - evidence={ | |
| 95 | - "base_wrong": len(base_wrong_idx), | |
| 96 | - "total": len(triples), | |
| 97 | - "mean_margin_delta": mean_delta, | |
| 98 | - "weight": spec.weight, | |
| 99 | - }, | |
| 100 | - message=( | |
| 101 | - f"only {len(base_wrong_idx)} base-wrong triples < " | |
| 102 | - f"{spec.min_triples_for_decision} required; reporting mean-margin-delta={mean_delta:+.3f}" | |
| 103 | - ), | |
| 104 | - ) | |
| 105 | - | |
| 106 | - flip_rate = len(flipped_idx) / len(base_wrong_idx) | |
| 107 | - verdict = Verdict.PASS if flip_rate >= spec.assert_flip_rate_gte else Verdict.FAIL | |
| 108 | - score = min(1.0, flip_rate / max(spec.assert_flip_rate_gte, 1e-6)) | |
| 109 | - return ProbeResult( | |
| 110 | - name=spec.name, | |
| 111 | - kind=spec.kind, | |
| 112 | - verdict=verdict, | |
| 113 | - score=score, | |
| 114 | - raw=flip_rate, | |
| 115 | - base_value=statistics.fmean(base_margins), | |
| 116 | - ft_value=statistics.fmean(ft_margins), | |
| 117 | - evidence={ | |
| 118 | - "flip_rate": flip_rate, | |
| 119 | - "flipped": len(flipped_idx), | |
| 120 | - "base_wrong": len(base_wrong_idx), | |
| 121 | - "total": len(triples), | |
| 122 | - "weight": spec.weight, | |
| 123 | - }, | |
| 124 | - message=( | |
| 125 | - f"flip_rate={flip_rate:.2%} ({len(flipped_idx)}/{len(base_wrong_idx)} " | |
| 126 | - f"base-wrong triples flipped by ft)" | |
| 127 | - ), | |
| 128 | - ) | |
| 129 | - | |
| 130 | - | |
| 131 | -def _triples_from_sections(ctx: RunContext) -> list[PreferenceTriple]: | |
| 132 | - if ctx.sections is None: | |
| 133 | - return [] | |
| 134 | - out: list[PreferenceTriple] = [] | |
| 135 | - for s in ctx.sections: | |
| 136 | - if s.kind != "preference": | |
| 137 | - continue | |
| 138 | - for p in s.preferences: | |
| 139 | - out.append(PreferenceTriple(prompt=p.prompt, chosen=p.chosen, rejected=p.rejected)) | |
| 140 | - return out | |
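The flip-rate denominator counts only the base-wrong triples. A worked example with illustrative margins:

```python
# Flip-rate arithmetic mirroring the probe above (margins are illustrative).
base_margins = [-1.2, -0.4, 0.9, -0.1, 0.3]  # base ranks 3 of 5 triples wrong
ft_margins = [0.6, -0.2, 1.1, 0.5, 0.4]

base_wrong = [i for i, m in enumerate(base_margins) if m < 0]
flipped = [i for i in base_wrong if ft_margins[i] > 0]
flip_rate = len(flipped) / len(base_wrong)
assert flip_rate == 2 / 3  # below the default assert_flip_rate_gte=0.7 -> FAIL
```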
sway/src/dlm_sway/probes/prompt_collapse.py deleted @@ -1,159 +0,0 @@ | ||
| 1 | -"""A3 PromptCollapse — does adapter influence decay with context length? | |
| 2 | - | |
| 3 | -For each test prompt we prepend irrelevant "stuffing" of varying length | |
| 4 | -and measure ``divergence(base, ft)`` at the final position. A healthy | |
| 5 | -adapter shows a modest, slow decay; a degenerate one collapses quickly | |
| 6 | -— its signal evaporates once the base has a lot of context to lean on. | |
| 7 | - | |
|   | 8 |   | -We fit an exponential decay ``KL(L) = KL0 * 2**(-L / half_life)`` via |  |
|   | 9 |   | -log-space linear regression and report the half-life in tokens. Pass if the half-life is at |  |
| 10 | -least :attr:`PromptCollapseSpec.assert_half_life_tokens` — which | |
| 11 | -defaults to half the default sequence length. | |
| 12 | - | |
| 13 | -All math is numpy-only to avoid a scipy dependency on the install path. | |
| 14 | -""" | |
| 15 | - | |
| 16 | -from __future__ import annotations | |
| 17 | - | |
| 18 | -from typing import Literal | |
| 19 | - | |
| 20 | -import numpy as np | |
| 21 | -from pydantic import Field | |
| 22 | - | |
| 23 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 24 | -from dlm_sway.probes._divergence import Divergence, divergence | |
| 25 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 26 | - | |
| 27 | -# A neutral, token-dense piece of text we prepend to stress the base | |
| 28 | -# model's long-context handling. Deliberately low-information so the | |
| 29 | -# "answer" at the end is the only thing driving next-token predictions. | |
| 30 | -_STUFFING = ( | |
| 31 | - "The following log lines are archived for historical record and have no " | |
| 32 | - "bearing on the question that follows. They are retained for audit purposes " | |
| 33 | - "only and should be ignored when forming an answer. " | |
| 34 | -) | |
| 35 | - | |
| 36 | - | |
| 37 | -class PromptCollapseSpec(ProbeSpec): | |
| 38 | - kind: Literal["prompt_collapse"] = "prompt_collapse" | |
| 39 | - prompts: list[str] = Field(default_factory=list, min_length=0) | |
| 40 | - context_lengths: list[int] = Field( | |
| 41 | - default_factory=lambda: [0, 256, 512, 1024], | |
| 42 | - min_length=2, | |
| 43 | - ) | |
| 44 | - """Approximate token counts of stuffing to prepend. ≥2 required | |
| 45 | - because the exponential fit is undefined for a single point.""" | |
| 46 | - divergence: Divergence = "js" | |
| 47 | - top_k: int | None = None | |
| 48 | - assert_half_life_tokens: int = 512 | |
| 49 | - """Minimum half-life to pass. Default is deliberately permissive — | |
| 50 | - tune upward for high-stakes deployments.""" | |
| 51 | - | |
| 52 | - | |
| 53 | -class PromptCollapseProbe(Probe): | |
| 54 | - kind = "prompt_collapse" | |
| 55 | - spec_cls = PromptCollapseSpec | |
| 56 | - category = "adherence" | |
| 57 | - | |
| 58 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 59 | - assert isinstance(spec, PromptCollapseSpec) | |
| 60 | - if not spec.prompts: | |
| 61 | - return ProbeResult( | |
| 62 | - name=spec.name, | |
| 63 | - kind=spec.kind, | |
| 64 | - verdict=Verdict.ERROR, | |
| 65 | - score=None, | |
| 66 | - message="no prompts provided", | |
| 67 | - ) | |
| 68 | - | |
| 69 | - top_k = spec.top_k if spec.top_k is not None else ctx.top_k | |
| 70 | - # Mean divergence at each context length. | |
| 71 | - mean_divs: list[float] = [] | |
| 72 | - for ctx_len in spec.context_lengths: | |
| 73 | - prefix = _stuffing(ctx_len) | |
| 74 | - divs: list[float] = [] | |
| 75 | - for prompt in spec.prompts: | |
| 76 | - full_prompt = prefix + prompt | |
| 77 | - with ctx.backend.as_base() as bv: | |
| 78 | - base_dist = bv.next_token_dist(full_prompt, top_k=top_k) | |
| 79 | - with ctx.backend.as_finetuned() as fv: | |
| 80 | - ft_dist = fv.next_token_dist(full_prompt, top_k=top_k) | |
| 81 | - divs.append(divergence(base_dist, ft_dist, kind=spec.divergence)) | |
| 82 | - mean_divs.append(float(np.mean(divs))) | |
| 83 | - | |
| 84 | - half_life = _fit_half_life( | |
| 85 | - np.asarray(spec.context_lengths, dtype=np.float64), | |
| 86 | - np.asarray(mean_divs, dtype=np.float64), | |
| 87 | - ) | |
| 88 | - | |
| 89 | - verdict = ( | |
| 90 | - Verdict.PASS | |
| 91 | - if half_life is not None and half_life >= spec.assert_half_life_tokens | |
| 92 | - else Verdict.FAIL | |
| 93 | - ) | |
| 94 | - score = _score(half_life, spec.assert_half_life_tokens) | |
| 95 | - | |
| 96 | - msg = ( | |
| 97 | - f"half-life={half_life:.0f} tokens" | |
| 98 | - if half_life is not None | |
| 99 | - else "could not fit exponential decay (too flat or non-monotonic)" | |
| 100 | - ) | |
| 101 | - return ProbeResult( | |
| 102 | - name=spec.name, | |
| 103 | - kind=spec.kind, | |
| 104 | - verdict=verdict, | |
| 105 | - score=score, | |
| 106 | - raw=half_life, | |
| 107 | - evidence={ | |
| 108 | - "context_lengths": spec.context_lengths, | |
| 109 | - "mean_divergence_per_length": mean_divs, | |
| 110 | - "divergence_kind": spec.divergence, | |
| 111 | - "weight": spec.weight, | |
| 112 | - }, | |
| 113 | - message=msg, | |
| 114 | - ) | |
| 115 | - | |
| 116 | - | |
| 117 | -def _stuffing(target_tokens: int) -> str: | |
| 118 | - """Approximate target-length stuffing. 4 chars ≈ 1 token is fine | |
| 119 | - for SentencePiece-style tokenizers at the order-of-magnitude level.""" | |
| 120 | - if target_tokens <= 0: | |
| 121 | - return "" | |
| 122 | - # Repeat enough copies to hit the target length in characters. | |
| 123 | - target_chars = target_tokens * 4 | |
| 124 | - reps = (target_chars // len(_STUFFING)) + 1 | |
| 125 | - return (_STUFFING * reps)[:target_chars] + "\n\n" | |
| 126 | - | |
| 127 | - | |
| 128 | -def _fit_half_life(lengths: np.ndarray, divergences: np.ndarray) -> float | None: | |
| 129 | - """Fit ``y = a * exp(-x / h)`` via log-space linear regression. | |
| 130 | - | |
| 131 | - Returns ``None`` if the divergences aren't strictly positive or the | |
| 132 | - fit is non-decreasing (i.e. the fine-tune got *more* distinct with | |
| 133 | - context, which invalidates the half-life concept). | |
| 134 | - """ | |
| 135 | - if (divergences <= 0.0).any(): | |
| 136 | - # Can't take a log; treat near-zero as too-flat-to-fit. | |
| 137 | - return None | |
| 138 | - log_y = np.log(divergences) | |
| 139 | - # Standard linear regression slope. | |
| 140 | - x_mean = float(lengths.mean()) | |
| 141 | - y_mean = float(log_y.mean()) | |
| 142 | - denom = float(((lengths - x_mean) ** 2).sum()) | |
| 143 | - if denom == 0.0: | |
| 144 | - return None | |
| 145 | - slope = float(((lengths - x_mean) * (log_y - y_mean)).sum()) / denom | |
| 146 | - if slope >= 0.0: | |
| 147 | - # Signal grew with context — can't express as half-life. | |
| 148 | - return None | |
| 149 | - # Slope = -1/h → h = -1/slope → half_life = ln(2) * h. | |
| 150 | - import math | |
| 151 | - | |
| 152 | - return float(math.log(2.0) * (-1.0 / slope)) | |
| 153 | - | |
| 154 | - | |
| 155 | -def _score(half_life: float | None, target: int) -> float: | |
| 156 | - if half_life is None: | |
| 157 | - return 0.0 | |
|   | 158 |   | -    # Linear ramp: score reaches 1.0 when the half-life hits the target, scaling toward 0 below it. |  |
| 159 | - return float(min(1.0, half_life / max(target, 1))) | |
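A quick self-check of the fit: synthesize a clean decay with a known half-life and confirm `_fit_half_life` recovers it, assuming the helper above is importable:

```python
import numpy as np

# Exact decay with a 400-token half-life; the log-space regression should
# recover it almost perfectly since the synthetic data has no noise.
lengths = np.array([0.0, 256.0, 512.0, 1024.0])
divergences = 0.05 * 0.5 ** (lengths / 400.0)

h = _fit_half_life(lengths, divergences)
assert h is not None and abs(h - 400.0) < 1e-6
```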
sway/src/dlm_sway/probes/section_internalization.py deleted @@ -1,189 +0,0 @@ | ||
| 1 | -"""B1 SectionInternalizationScore — the flagship attribution primitive. | |
| 2 | - | |
| 3 | -For each typed section of the training document, measure *how much the | |
| 4 | -fine-tune moved the needle on that section's own content* — and subtract | |
| 5 | -the same metric measured on *other* sections' content. The difference is | |
| 6 | -the "effective SIS": signal attributable to *this* section, not to a | |
| 7 | -broader lift across the whole document. | |
| 8 | - | |
| 9 | -Output is a per-section bar chart. In practice users see that sections | |
| 10 | -2 and 7 actually moved the model, sections 3 and 5 did nothing, and | |
| 11 | -section 11 moved it but also leaked into unrelated content — actionable | |
| 12 | -signal for document authoring that no other eval tool provides. | |
| 13 | - | |
| 14 | -Math per section ``s`` with measurement function ``m(probe_set)``: | |
| 15 | - | |
| 16 | -.. math:: | |
|   | 17 |   | -    sis_s^{own} &= (m_{base}(s) - m_{ft}(s)) / m_{base}(s) \\\\ |  |
|   | 18 |   | -    sis_s^{leak} &= (m_{base}(\\bar s) - m_{ft}(\\bar s)) / m_{base}(\\bar s) \\\\ |  |
|   | 19 |   | -    effective &= sis_s^{own} - sis_s^{leak} |  |
| 20 | - | |
| 21 | -For PROSE sections, ``m`` is the average NLL per token over the | |
| 22 | -section's content. For INSTRUCTION and PREFERENCE sections, ``m`` is the | |
| 23 | -average NLL per token over the answer/chosen spans given their prompts. | |
| 24 | -""" | |
| 25 | - | |
| 26 | -from __future__ import annotations | |
| 27 | - | |
| 28 | -import statistics | |
| 29 | -from typing import Literal | |
| 30 | - | |
| 31 | -from pydantic import Field | |
| 32 | - | |
| 33 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 34 | -from dlm_sway.core.scoring import ScoringBackend | |
| 35 | -from dlm_sway.core.sections import Section, SectionKind | |
| 36 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 37 | - | |
| 38 | - | |
| 39 | -def _default_include_kinds() -> list[SectionKind]: | |
| 40 | - return ["prose", "instruction", "preference"] | |
| 41 | - | |
| 42 | - | |
| 43 | -class SectionInternalizationSpec(ProbeSpec): | |
| 44 | - kind: Literal["section_internalization"] = "section_internalization" | |
| 45 | - include_kinds: list[SectionKind] = Field(default_factory=_default_include_kinds) | |
| 46 | - per_section_threshold: float = 0.05 | |
| 47 | - """Minimum ``effective_sis`` for a section to be marked PASS.""" | |
| 48 | - assert_passing_section_frac: float = 0.5 | |
| 49 | - """Probe-level pass criterion: fraction of sections that must clear | |
| 50 | - the per-section threshold.""" | |
| 51 | - max_prose_chars: int = 2000 | |
| 52 | - """Cap the length of PROSE content we score to keep runtime bounded. | |
| 53 | - Long sections are chunked; this is the per-chunk cap.""" | |
| 54 | - | |
| 55 | - | |
| 56 | -class SectionInternalizationProbe(Probe): | |
| 57 | - kind = "section_internalization" | |
| 58 | - spec_cls = SectionInternalizationSpec | |
| 59 | - category = "attribution" | |
| 60 | - | |
| 61 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 62 | - assert isinstance(spec, SectionInternalizationSpec) | |
| 63 | - if ctx.sections is None or len(ctx.sections) == 0: | |
| 64 | - return ProbeResult( | |
| 65 | - name=spec.name, | |
| 66 | - kind=spec.kind, | |
| 67 | - verdict=Verdict.SKIP, | |
| 68 | - score=None, | |
| 69 | - message="no sections in context — provide via the .dlm bridge", | |
| 70 | - ) | |
| 71 | - | |
| 72 | - kinds_allowed = set(spec.include_kinds) | |
| 73 | - eligible = [s for s in ctx.sections if s.kind in kinds_allowed] | |
| 74 | - if len(eligible) < 2: | |
| 75 | - return ProbeResult( | |
| 76 | - name=spec.name, | |
| 77 | - kind=spec.kind, | |
| 78 | - verdict=Verdict.SKIP, | |
| 79 | - score=None, | |
| 80 | - message=( | |
| 81 | - f"need ≥2 eligible sections for leak-check; got {len(eligible)} " | |
| 82 | - f"(kinds={spec.include_kinds})" | |
| 83 | - ), | |
| 84 | - ) | |
| 85 | - | |
| 86 | - # Pre-compute per-section base and ft NLL-per-token to avoid | |
| 87 | - # re-running the forward pass for leak-checks. | |
| 88 | - base_nll: dict[str, float] = {} | |
| 89 | - ft_nll: dict[str, float] = {} | |
| 90 | - with ctx.backend.as_base() as base_view: | |
| 91 | - for s in eligible: | |
| 92 | - base_nll[s.id] = _section_nll(s, base_view, spec.max_prose_chars) | |
| 93 | - with ctx.backend.as_finetuned() as ft_view: | |
| 94 | - for s in eligible: | |
| 95 | - ft_nll[s.id] = _section_nll(s, ft_view, spec.max_prose_chars) | |
| 96 | - | |
| 97 | - per_section: list[dict[str, float | str | bool]] = [] | |
| 98 | - passing = 0 | |
| 99 | - effective_scores: list[float] = [] | |
| 100 | - for s in eligible: | |
| 101 | - others = [o for o in eligible if o.id != s.id] | |
| 102 | - own_lift = _relative_lift(base_nll[s.id], ft_nll[s.id]) | |
| 103 | - leak_lift = statistics.fmean( | |
| 104 | - _relative_lift(base_nll[o.id], ft_nll[o.id]) for o in others | |
| 105 | - ) | |
| 106 | - effective = own_lift - leak_lift | |
| 107 | - effective_scores.append(effective) | |
| 108 | - did_pass = effective >= spec.per_section_threshold | |
| 109 | - passing += int(did_pass) | |
| 110 | - per_section.append( | |
| 111 | - { | |
| 112 | - "section_id": s.id, | |
| 113 | - "kind": s.kind, | |
| 114 | - "tag": s.tag or "", | |
| 115 | - "base_nll": base_nll[s.id], | |
| 116 | - "ft_nll": ft_nll[s.id], | |
| 117 | - "own_lift": own_lift, | |
| 118 | - "leak_lift": leak_lift, | |
| 119 | - "effective_sis": effective, | |
| 120 | - "passed": did_pass, | |
| 121 | - } | |
| 122 | - ) | |
| 123 | - | |
| 124 | - passing_frac = passing / len(eligible) | |
| 125 | - verdict = Verdict.PASS if passing_frac >= spec.assert_passing_section_frac else Verdict.FAIL | |
| 126 | - score = passing_frac | |
| 127 | - return ProbeResult( | |
| 128 | - name=spec.name, | |
| 129 | - kind=spec.kind, | |
| 130 | - verdict=verdict, | |
| 131 | - score=score, | |
| 132 | - raw=statistics.fmean(effective_scores), | |
| 133 | - evidence={ | |
| 134 | - "per_section": per_section, | |
| 135 | - "num_sections": len(eligible), | |
| 136 | - "passing_frac": passing_frac, | |
| 137 | - "per_section_threshold": spec.per_section_threshold, | |
| 138 | - "weight": spec.weight, | |
| 139 | - }, | |
| 140 | - message=( | |
| 141 | - f"{passing}/{len(eligible)} sections cleared " | |
| 142 | - f"effective_sis≥{spec.per_section_threshold:.2f} (mean={statistics.fmean(effective_scores):+.3f})" | |
| 143 | - ), | |
| 144 | - ) | |
| 145 | - | |
| 146 | - | |
| 147 | -def _section_nll(s: Section, view: ScoringBackend, max_prose_chars: int) -> float: | |
| 148 | - """Average NLL per token for the section's content under ``view``.""" | |
| 149 | - if s.kind == "prose": | |
| 150 | - return _prose_nll(s.content[:max_prose_chars], view) | |
| 151 | - if s.kind == "instruction": | |
| 152 | - if not s.probes: | |
| 153 | - return _prose_nll(s.content[:max_prose_chars], view) | |
| 154 | - return statistics.fmean( | |
| 155 | - -view.logprob_of(p.prompt, p.gold) / max(_token_estimate(p.gold), 1) for p in s.probes | |
| 156 | - ) | |
| 157 | - if s.kind == "preference": | |
| 158 | - if not s.preferences: | |
| 159 | - return _prose_nll(s.content[:max_prose_chars], view) | |
| 160 | - return statistics.fmean( | |
| 161 | - -view.logprob_of(p.prompt, p.chosen) / max(_token_estimate(p.chosen), 1) | |
| 162 | - for p in s.preferences | |
| 163 | - ) | |
| 164 | - raise ValueError(f"unknown section kind: {s.kind!r}") | |
| 165 | - | |
| 166 | - | |
| 167 | -def _prose_nll(text: str, view: ScoringBackend) -> float: | |
| 168 | - """Negative-mean-logprob over ``text``. Returns 0 for empty input.""" | |
| 169 | - if not text.strip(): | |
| 170 | - return 0.0 | |
| 171 | - r = view.rolling_logprob(text) | |
| 172 | - return -r.mean_logprob | |
| 173 | - | |
| 174 | - | |
| 175 | -def _relative_lift(base_nll: float, ft_nll: float) -> float: | |
| 176 | - """``(base - ft) / base``. Positive → ft is lower-PPL than base. | |
| 177 | - | |
| 178 | - Falls back to an absolute delta when ``base`` is pathological | |
| 179 | - (zero or negative), so the probe doesn't crash on degenerate | |
| 180 | - inputs. | |
| 181 | - """ | |
| 182 | - if base_nll <= 0.0: | |
| 183 | - return float(base_nll - ft_nll) | |
| 184 | - return float((base_nll - ft_nll) / base_nll) | |
| 185 | - | |
| 186 | - | |
| 187 | -def _token_estimate(s: str) -> int: | |
| 188 | - """Approximate tokens for normalization. Good enough for SentencePiece-ish vocabs.""" | |
| 189 | - return max(1, len(s) // 4) | |
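The leak-check subtraction in miniature, reusing `_relative_lift` from above with illustrative NLLs:

```python
# Section s1 improved a lot, its sibling barely moved: most of s1's lift is
# genuinely attributable to s1 rather than a document-wide shift.
base_nll = {"s1": 2.0, "s2": 2.0}
ft_nll = {"s1": 1.5, "s2": 1.9}

own = _relative_lift(base_nll["s1"], ft_nll["s1"])   # (2.0 - 1.5) / 2.0 = 0.25
leak = _relative_lift(base_nll["s2"], ft_nll["s2"])  # (2.0 - 1.9) / 2.0 = 0.05
effective = own - leak
assert abs(effective - 0.20) < 1e-9  # clears per_section_threshold=0.05
```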
sway/src/dlm_sway/probes/style_fingerprint.py deleted @@ -1,179 +0,0 @@ | ||
| 1 | -"""C1 StyleFingerprint — does ft prose *read* like the doc? | |
| 2 | - | |
| 3 | -Generates base and ft completions from a set of stylistic prompts, | |
| 4 | -extracts a 6-dimensional fingerprint from each, and measures how the ft | |
| 5 | -fingerprint has shifted **toward** the training document's own | |
| 6 | -fingerprint vs the base. | |
| 7 | - | |
| 8 | -We compute the fingerprint with numpy-only features so the probe works | |
| 9 | -out of the box without spaCy/textstat. The optional ``style`` extra | |
| 10 | -upgrades the fingerprint with passive-voice rate and POS-entropy in a | |
| 11 | -later milestone; the numeric contract — a non-negative vector per text | |
| 12 | -— is stable across that upgrade. | |
| 13 | - | |
| 14 | -Signal: ``style_shift = cos(ft_fp - base_fp, doc_fp - base_fp)`` in | |
| 15 | -fingerprint space. Positive values mean ft has moved *toward* the | |
| 16 | -doc's style; negative values mean it moved *away* (a bad sign); | |
| 17 | -near-zero means no stylistic shift detectable. | |
| 18 | -""" | |
| 19 | - | |
| 20 | -from __future__ import annotations | |
| 21 | - | |
| 22 | -import re | |
| 23 | -import statistics | |
| 24 | -from typing import Literal | |
| 25 | - | |
| 26 | -import numpy as np | |
| 27 | -from numpy.typing import NDArray | |
| 28 | -from pydantic import Field | |
| 29 | - | |
| 30 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 31 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 32 | - | |
| 33 | -_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+") | |
| 34 | -_PARAGRAPH_SPLIT = re.compile(r"\n\s*\n") | |
| 35 | -_WORD_RE = re.compile(r"\b[A-Za-z][A-Za-z'-]*\b") | |
| 36 | -_PUNCTS = set(".,:;!?-—()[]\"'/") | |
| 37 | - | |
| 38 | - | |
| 39 | -def fingerprint(text: str) -> NDArray[np.float64]: | |
| 40 | - """Return a 6-dim stylistic fingerprint for ``text``. | |
| 41 | - | |
| 42 | - Dimensions (all numeric, scaled to order-1): | |
| 43 | - 0. mean sentence length (words) / 30.0 | |
| 44 | - 1. std sentence length (words) / 30.0 | |
| 45 | - 2. type-token ratio (already in [0,1]) | |
| 46 | - 3. avg word length (chars) / 10.0 | |
| 47 | - 4. punctuation density per char * 10.0 | |
| 48 | - 5. paragraph density (1 / avg paragraph length in words) * 30.0 | |
| 49 | - """ | |
| 50 | - if not text.strip(): | |
| 51 | - return np.zeros(6, dtype=np.float64) | |
| 52 | - | |
| 53 | - sentences = [s for s in _SENTENCE_SPLIT.split(text) if s.strip()] | |
| 54 | - paragraphs = [p for p in _PARAGRAPH_SPLIT.split(text) if p.strip()] | |
| 55 | - words = _WORD_RE.findall(text) | |
| 56 | - if not words: | |
| 57 | - return np.zeros(6, dtype=np.float64) | |
| 58 | - | |
| 59 | - sentence_word_counts = [len(_WORD_RE.findall(s)) for s in sentences] | |
| 60 | - sentence_word_counts = [c for c in sentence_word_counts if c > 0] | |
| 61 | - if not sentence_word_counts: | |
| 62 | - sentence_word_counts = [len(words)] | |
| 63 | - | |
| 64 | - mean_sent = statistics.fmean(sentence_word_counts) | |
| 65 | - std_sent = statistics.pstdev(sentence_word_counts) if len(sentence_word_counts) > 1 else 0.0 | |
| 66 | - ttr = len({w.lower() for w in words}) / len(words) | |
| 67 | - avg_word_len = statistics.fmean(len(w) for w in words) | |
| 68 | - punct_count = sum(ch in _PUNCTS for ch in text) | |
| 69 | - punct_density = punct_count / max(len(text), 1) | |
| 70 | - avg_paragraph_len = ( | |
| 71 | - statistics.fmean(len(_WORD_RE.findall(p)) for p in paragraphs) if paragraphs else len(words) | |
| 72 | - ) | |
| 73 | - paragraph_density = 1.0 / max(avg_paragraph_len, 1.0) | |
| 74 | - | |
| 75 | - return np.asarray( | |
| 76 | - [ | |
| 77 | - mean_sent / 30.0, | |
| 78 | - std_sent / 30.0, | |
| 79 | - ttr, | |
| 80 | - avg_word_len / 10.0, | |
| 81 | - punct_density * 10.0, | |
| 82 | - paragraph_density * 30.0, | |
| 83 | - ], | |
| 84 | - dtype=np.float64, | |
| 85 | - ) | |
| 86 | - | |
| 87 | - | |
| 88 | -class StyleFingerprintSpec(ProbeSpec): | |
| 89 | - kind: Literal["style_fingerprint"] = "style_fingerprint" | |
| 90 | - prompts: list[str] = Field(default_factory=list) | |
| 91 | - """Prompts used to elicit a stylistic sample from each model.""" | |
| 92 | - doc_reference: str = "" | |
| 93 | - """Concatenated reference text representing the adapter's intended | |
| 94 | - style. Typically the document itself; the .dlm bridge supplies this | |
| 95 | - from ``ctx.doc_text`` when left empty.""" | |
| 96 | - max_new_tokens: int = 128 | |
| 97 | - assert_shift_gte: float = 0.25 | |
| 98 | - """Minimum cosine shift for PASS. ``0.25`` is a deliberately | |
| 99 | - permissive default — stylistic shift is a weaker signal than | |
| 100 | - perplexity lift.""" | |
| 101 | - | |
| 102 | - | |
| 103 | -class StyleFingerprintProbe(Probe): | |
| 104 | - kind = "style_fingerprint" | |
| 105 | - spec_cls = StyleFingerprintSpec | |
| 106 | - category = "calibration" | |
| 107 | - | |
| 108 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 109 | - assert isinstance(spec, StyleFingerprintSpec) | |
| 110 | - if not spec.prompts: | |
| 111 | - return ProbeResult( | |
| 112 | - name=spec.name, | |
| 113 | - kind=spec.kind, | |
| 114 | - verdict=Verdict.ERROR, | |
| 115 | - score=None, | |
| 116 | - message="no prompts provided", | |
| 117 | - ) | |
| 118 | - doc_text = spec.doc_reference or (ctx.doc_text or "") | |
| 119 | - if not doc_text.strip(): | |
| 120 | - return ProbeResult( | |
| 121 | - name=spec.name, | |
| 122 | - kind=spec.kind, | |
| 123 | - verdict=Verdict.SKIP, | |
| 124 | - score=None, | |
| 125 | - message="no doc_reference (inline or from ctx.doc_text)", | |
| 126 | - ) | |
| 127 | - | |
| 128 | - base_samples: list[str] = [] | |
| 129 | - ft_samples: list[str] = [] | |
| 130 | - for prompt in spec.prompts: | |
| 131 | - with ctx.backend.as_base() as b: | |
| 132 | - base_samples.append( | |
| 133 | - b.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | |
| 134 | - ) | |
| 135 | - with ctx.backend.as_finetuned() as f: | |
| 136 | - ft_samples.append( | |
| 137 | - f.generate(prompt, max_new_tokens=spec.max_new_tokens, seed=ctx.seed) | |
| 138 | - ) | |
| 139 | - | |
| 140 | - base_fp = fingerprint("\n".join(base_samples)) | |
| 141 | - ft_fp = fingerprint("\n".join(ft_samples)) | |
| 142 | - doc_fp = fingerprint(doc_text) | |
| 143 | - | |
| 144 | - shift = _cosine_shift(base_fp, ft_fp, doc_fp) | |
| 145 | - verdict = Verdict.PASS if shift >= spec.assert_shift_gte else Verdict.FAIL | |
| 146 | - score = float(np.clip((shift + 1.0) / 2.0, 0.0, 1.0)) | |
| 147 | - | |
| 148 | - return ProbeResult( | |
| 149 | - name=spec.name, | |
| 150 | - kind=spec.kind, | |
| 151 | - verdict=verdict, | |
| 152 | - score=score, | |
| 153 | - raw=shift, | |
| 154 | - evidence={ | |
| 155 | - "base_fp": base_fp.tolist(), | |
| 156 | - "ft_fp": ft_fp.tolist(), | |
| 157 | - "doc_fp": doc_fp.tolist(), | |
| 158 | - "style_shift": shift, | |
| 159 | - "weight": spec.weight, | |
| 160 | - }, | |
| 161 | - message=( | |
| 162 | - f"style_shift={shift:+.2f} " | |
| 163 | - f"({'toward' if shift > 0 else 'away from'} doc, " | |
| 164 | - f"threshold={spec.assert_shift_gte})" | |
| 165 | - ), | |
| 166 | - ) | |
| 167 | - | |
| 168 | - | |
| 169 | -def _cosine_shift( | |
| 170 | - base: NDArray[np.float64], ft: NDArray[np.float64], doc: NDArray[np.float64] | |
| 171 | -) -> float: | |
| 172 | - """Cosine between (ft - base) and (doc - base) in fingerprint space.""" | |
| 173 | - a = ft - base | |
| 174 | - b = doc - base | |
| 175 | - na = float(np.linalg.norm(a)) | |
| 176 | - nb = float(np.linalg.norm(b)) | |
| 177 | - if na == 0.0 or nb == 0.0: | |
| 178 | - return 0.0 | |
| 179 | - return float(np.dot(a, b) / (na * nb)) | |
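The geometry of the shift metric on toy texts, assuming `fingerprint` and `_cosine_shift` above are importable; the strings are stand-ins, not model generations:

```python
# ft moved from long meandering sentences toward the doc's clipped style,
# so (ft_fp - base_fp) points roughly the same way as (doc_fp - base_fp).
doc = "Short. Punchy. Sentences everywhere. Really short ones."
base = (
    "This is a rather long and meandering sentence that wanders on and on "
    "without stopping for breath at all."
)
ft = "Short lines. Tight prose. Like the doc."

shift = _cosine_shift(fingerprint(base), fingerprint(ft), fingerprint(doc))
print(f"style_shift={shift:+.2f}")  # expect a clearly positive value
```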
sway/src/dlm_sway/py.typed deleted | ||
sway/src/dlm_sway/suite/__init__.py deleted @@ -1,1 +0,0 @@ | ||
| 1 | -"""Suite plumbing: spec models, loader, runner, report, composite score.""" | |
sway/src/dlm_sway/suite/loader.py deleted @@ -1,48 +0,0 @@ | ||
| 1 | -"""Load + validate a ``sway.yaml`` into a :class:`SwaySpec`. | |
| 2 | - | |
| 3 | -Separated from :mod:`spec` so the data models stay trivially | |
| 4 | -importable (no YAML dependency at import time for callers that | |
| 5 | -construct specs programmatically). | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from __future__ import annotations | |
| 9 | - | |
| 10 | -from pathlib import Path | |
| 11 | -from typing import Any | |
| 12 | - | |
| 13 | -import yaml | |
| 14 | -from pydantic import ValidationError | |
| 15 | - | |
| 16 | -from dlm_sway.core.errors import SpecValidationError | |
| 17 | -from dlm_sway.suite.spec import SwaySpec | |
| 18 | - | |
| 19 | - | |
| 20 | -def load_spec(path: Path | str) -> SwaySpec: | |
| 21 | - """Parse ``path`` and return a validated :class:`SwaySpec`.""" | |
| 22 | - resolved = Path(path).expanduser().resolve() | |
| 23 | - try: | |
| 24 | - raw_text = resolved.read_text(encoding="utf-8") | |
| 25 | - except FileNotFoundError as exc: | |
| 26 | - raise SpecValidationError(f"spec file not found: {resolved}", source=str(path)) from exc | |
| 27 | - | |
| 28 | - try: | |
| 29 | - data = yaml.safe_load(raw_text) | |
| 30 | - except yaml.YAMLError as exc: | |
| 31 | - raise SpecValidationError(f"invalid YAML: {exc}", source=str(path)) from exc | |
| 32 | - | |
| 33 | - if not isinstance(data, dict): | |
| 34 | - raise SpecValidationError("top-level document must be a mapping", source=str(path)) | |
| 35 | - return from_dict(data, source=str(path)) | |
| 36 | - | |
| 37 | - | |
| 38 | -def from_dict(data: dict[str, Any], *, source: str | None = None) -> SwaySpec: | |
| 39 | - """Validate a dict (already parsed from YAML or JSON) as a SwaySpec.""" | |
| 40 | - try: | |
| 41 | - spec = SwaySpec.model_validate(data) | |
| 42 | - except ValidationError as exc: | |
| 43 | - raise SpecValidationError(str(exc), source=source) from exc | |
| 44 | - try: | |
| 45 | - spec.check_version() | |
| 46 | - except ValueError as exc: | |
| 47 | - raise SpecValidationError(str(exc), source=source) from exc | |
| 48 | - return spec | |
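How a caller sees validation failures, as a minimal sketch; whether this particular mapping is rejected depends on `SwaySpec`'s required fields, which this diff does not show, so the keys here are assumptions:

```python
from dlm_sway.core.errors import SpecValidationError
from dlm_sway.suite.loader import from_dict

try:
    # "version" / "probes" are hypothetical keys for illustration only.
    spec = from_dict({"version": "???", "probes": []}, source="inline-example")
except SpecValidationError as exc:
    # Pydantic errors and check_version() failures both land here,
    # tagged with the source path/label for error reporting.
    print(f"spec rejected: {exc}")
```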
sway/src/dlm_sway/suite/report.py deleted @@ -1,249 +0,0 @@ | ||
| 1 | -"""Report emitters: terminal (rich), JSON, JUnit XML, markdown. | |
| 2 | - | |
| 3 | -The terminal renderer is the one a user sees; it's the product surface. | |
| 4 | -It must communicate the verdict *and* the supporting evidence without | |
| 5 | -forcing the user to open the JSON. | |
| 6 | - | |
| 7 | -JSON is the machine-readable source of truth — same fields as the | |
| 8 | -:class:`SuiteResult` dataclass but flattened for easy downstream parsing | |
| 9 | -(dashboards, diff tools, history tracking). | |
| 10 | - | |
| 11 | -JUnit XML exists to drop into CI pipelines so ``dlm-sway gate`` | |
| 12 | -integrates with existing test dashboards with no extra glue. | |
| 13 | -""" | |
| 14 | - | |
| 15 | -from __future__ import annotations | |
| 16 | - | |
| 17 | -import json | |
| 18 | -import xml.etree.ElementTree as ET | |
| 19 | -from io import StringIO | |
| 20 | -from typing import Any | |
| 21 | - | |
| 22 | -from rich.console import Console | |
| 23 | -from rich.panel import Panel | |
| 24 | -from rich.table import Table | |
| 25 | -from rich.text import Text | |
| 26 | - | |
| 27 | -from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict | |
| 28 | - | |
| 29 | -_VERDICT_STYLE = { | |
| 30 | - Verdict.PASS: "bold green", | |
| 31 | - Verdict.FAIL: "bold red", | |
| 32 | - Verdict.WARN: "bold yellow", | |
| 33 | - Verdict.SKIP: "dim", | |
| 34 | - Verdict.ERROR: "bold magenta", | |
| 35 | -} | |
| 36 | - | |
| 37 | - | |
| 38 | -def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None: | |
| 39 | - """Render the report to a rich Console (stdout by default).""" | |
| 40 | - c = console or Console() | |
| 41 | - | |
| 42 | - header = Text.assemble( | |
| 43 | - ("dlm-sway report — ", "bold"), | |
| 44 | - (suite.base_model_id, "cyan"), | |
| 45 | - (" vs ", "dim"), | |
| 46 | - (_adapter_label(suite.adapter_id), "cyan"), | |
| 47 | - ) | |
| 48 | - c.print(Panel(header, expand=False, border_style="blue")) | |
| 49 | - | |
| 50 | - c.print() | |
| 51 | - c.print( | |
| 52 | - Text.assemble( | |
| 53 | - ("overall: ", "bold"), | |
| 54 | - (f"{score.overall:.2f}", _score_style(score.overall)), | |
| 55 | - (" ", ""), | |
| 56 | - (f"[ {score.band} ]", _band_style(score.band)), | |
| 57 | - ) | |
| 58 | - ) | |
| 59 | - | |
| 60 | - # Component breakdown | |
| 61 | - comp_table = Table.grid(padding=(0, 2)) | |
| 62 | - comp_table.add_column(justify="left") | |
| 63 | - comp_table.add_column(justify="right") | |
| 64 | - comp_table.add_column() | |
| 65 | - for cat in ("adherence", "attribution", "calibration", "ablation", "baseline"): | |
| 66 | - if cat not in score.components: | |
| 67 | - continue | |
| 68 | - v = score.components[cat] | |
| 69 | - comp_table.add_row(cat, f"{v:.2f}", _bar(v)) | |
| 70 | - c.print(comp_table) | |
| 71 | - | |
| 72 | - c.print() | |
| 73 | - # Per-probe detail | |
| 74 | - detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1)) | |
| 75 | - detail.add_column("name", style="cyan") | |
| 76 | - detail.add_column("kind", style="dim") | |
| 77 | - detail.add_column("verdict") | |
| 78 | - detail.add_column("score", justify="right") | |
| 79 | - detail.add_column("raw", justify="right") | |
| 80 | - detail.add_column("z", justify="right") | |
| 81 | - detail.add_column("note", style="dim") | |
| 82 | - for r in suite.probes: | |
| 83 | - detail.add_row( | |
| 84 | - r.name, | |
| 85 | - r.kind, | |
| 86 | - Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]), | |
| 87 | - f"{r.score:.2f}" if r.score is not None else "—", | |
| 88 | - f"{r.raw:.3f}" if r.raw is not None else "—", | |
| 89 | - f"{r.z_score:+.2f}σ" if r.z_score is not None else "—", | |
| 90 | - (r.message[:80] + "…") if len(r.message) > 80 else r.message, | |
| 91 | - ) | |
| 92 | - c.print(detail) | |
| 93 | - | |
| 94 | - if score.findings: | |
| 95 | - c.print() | |
| 96 | - c.print(Text("top findings:", style="bold")) | |
| 97 | - for i, f in enumerate(score.findings, start=1): | |
| 98 | - c.print(f" {i}. {f}") | |
| 99 | - | |
| 100 | - c.print() | |
| 101 | - c.print(Text(f"wall: {suite.wall_seconds:.2f}s | sway {suite.sway_version}", style="dim")) | |
| 102 | - | |
| 103 | - | |
| 104 | -def to_json(suite: SuiteResult, score: SwayScore) -> str: | |
| 105 | - """Serialize the suite + composite score as JSON. | |
| 106 | - | |
| 107 | - Stable schema; downstream tools rely on it. Breaking changes bump the | |
| 108 | - ``schema_version`` field (currently ``1``, emitted below). | |
| 109 | - """ | |
| 110 | - return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True) | |
| 111 | - | |
| 112 | - | |
| 113 | -def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]: | |
| 114 | - return { | |
| 115 | - "schema_version": 1, | |
| 116 | - "sway_version": suite.sway_version, | |
| 117 | - "spec_path": suite.spec_path, | |
| 118 | - "base_model_id": suite.base_model_id, | |
| 119 | - "adapter_id": suite.adapter_id, | |
| 120 | - "started_at": suite.started_at.isoformat(), | |
| 121 | - "finished_at": suite.finished_at.isoformat(), | |
| 122 | - "wall_seconds": suite.wall_seconds, | |
| 123 | - "score": { | |
| 124 | - "overall": score.overall, | |
| 125 | - "band": score.band, | |
| 126 | - "components": score.components, | |
| 127 | - "weights": score.weights, | |
| 128 | - "findings": list(score.findings), | |
| 129 | - }, | |
| 130 | - "null_stats": suite.null_stats, | |
| 131 | - "probes": [_probe_to_jsonable(p) for p in suite.probes], | |
| 132 | - } | |
| 133 | - | |
| 134 | - | |
| 135 | -def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]: | |
| 136 | - return { | |
| 137 | - "name": r.name, | |
| 138 | - "kind": r.kind, | |
| 139 | - "verdict": r.verdict.value, | |
| 140 | - "score": r.score, | |
| 141 | - "raw": r.raw, | |
| 142 | - "z_score": r.z_score, | |
| 143 | - "base_value": r.base_value, | |
| 144 | - "ft_value": r.ft_value, | |
| 145 | - "evidence": r.evidence, | |
| 146 | - "message": r.message, | |
| 147 | - "duration_s": r.duration_s, | |
| 148 | - } | |
| 149 | - | |
| 150 | - | |
| 151 | -def to_junit(suite: SuiteResult, score: SwayScore) -> str: | |
| 152 | - """Serialize as JUnit XML. One ``<testcase>`` per probe.""" | |
| 153 | - testsuite = ET.Element( | |
| 154 | - "testsuite", | |
| 155 | - { | |
| 156 | - "name": "dlm-sway", | |
| 157 | - "tests": str(len(suite.probes)), | |
| 158 | - "failures": str(sum(1 for p in suite.probes if p.verdict == Verdict.FAIL)), | |
| 159 | - "errors": str(sum(1 for p in suite.probes if p.verdict == Verdict.ERROR)), | |
| 160 | - "skipped": str(sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)), | |
| 161 | - "time": f"{suite.wall_seconds:.3f}", | |
| 162 | - }, | |
| 163 | - ) | |
| 164 | - # Properties — the composite score and category breakdown. | |
| 165 | - props = ET.SubElement(testsuite, "properties") | |
| 166 | - ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"}) | |
| 167 | - ET.SubElement(props, "property", {"name": "band", "value": score.band}) | |
| 168 | - for cat, v in score.components.items(): | |
| 169 | - ET.SubElement(props, "property", {"name": f"component.{cat}", "value": f"{v:.4f}"}) | |
| 170 | - | |
| 171 | - for r in suite.probes: | |
| 172 | - tc = ET.SubElement( | |
| 173 | - testsuite, | |
| 174 | - "testcase", | |
| 175 | - {"classname": r.kind, "name": r.name, "time": f"{r.duration_s:.3f}"}, | |
| 176 | - ) | |
| 177 | - if r.verdict == Verdict.FAIL: | |
| 178 | - ET.SubElement(tc, "failure", {"message": r.message or "failed"}) | |
| 179 | - elif r.verdict == Verdict.ERROR: | |
| 180 | - ET.SubElement(tc, "error", {"message": r.message or "errored"}) | |
| 181 | - elif r.verdict == Verdict.SKIP: | |
| 182 | - ET.SubElement(tc, "skipped", {"message": r.message or "skipped"}) | |
| 183 | - | |
| 184 | - return ET.tostring(testsuite, encoding="unicode") | |
| 185 | - | |
| 186 | - | |
| 187 | -def to_markdown(suite: SuiteResult, score: SwayScore) -> str: | |
| 188 | - """A portable, CI-friendly markdown report.""" | |
| 189 | - buf = StringIO() | |
| 190 | - buf.write("# dlm-sway report\n\n") | |
| 191 | - buf.write(f"**Overall:** {score.overall:.2f} (`{score.band}`) \n") | |
| 192 | - buf.write(f"**Base:** `{suite.base_model_id}` \n") | |
| 193 | - buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}` \n") | |
| 194 | - buf.write(f"**Wall:** {suite.wall_seconds:.2f}s \n\n") | |
| 195 | - | |
| 196 | - buf.write("## Components\n\n") | |
| 197 | - buf.write("| category | score |\n|---|---:|\n") | |
| 198 | - for cat, v in score.components.items(): | |
| 199 | - buf.write(f"| {cat} | {v:.2f} |\n") | |
| 200 | - buf.write("\n## Probes\n\n") | |
| 201 | - buf.write("| name | kind | verdict | score | note |\n|---|---|---|---:|---|\n") | |
| 202 | - for r in suite.probes: | |
| 203 | - buf.write( | |
| 204 | - f"| {r.name} | `{r.kind}` | {r.verdict.value} | " | |
| 205 | - f"{f'{r.score:.2f}' if r.score is not None else '—'} | " | |
| 206 | - f"{r.message[:60]} |\n" | |
| 207 | - ) | |
| 208 | - if score.findings: | |
| 209 | - buf.write("\n## Top findings\n\n") | |
| 210 | - for f in score.findings: | |
| 211 | - buf.write(f"- {f}\n") | |
| 212 | - return buf.getvalue() | |
| 213 | - | |
| 214 | - | |
| 215 | -# -- helpers ----------------------------------------------------------- | |
| 216 | - | |
| 217 | - | |
| 218 | -def _adapter_label(adapter_id: str) -> str: | |
| 219 | - if not adapter_id: | |
| 220 | - return "(base only)" | |
| 221 | - # Only the trailing path chunks are useful in the header; keep at most three. | |
| 222 | - parts = adapter_id.rstrip("/").split("/") | |
| 223 | - return "/".join(parts[-3:]) if len(parts) > 3 else adapter_id | |
| 224 | - | |
| 225 | - | |
| 226 | -def _score_style(v: float) -> str: | |
| 227 | - if v >= 0.6: | |
| 228 | - return "bold green" | |
| 229 | - if v >= 0.3: | |
| 230 | - return "bold yellow" | |
| 231 | - return "bold red" | |
| 232 | - | |
| 233 | - | |
| 234 | -def _band_style(band: str) -> str: | |
| 235 | - return { | |
| 236 | - "noise": "red", | |
| 237 | - "partial": "yellow", | |
| 238 | - "healthy": "green", | |
| 239 | - "suspicious": "magenta", | |
| 240 | - }.get(band, "white") | |
| 241 | - | |
| 242 | - | |
| 243 | -def _bar(v: float, *, width: int = 10) -> str: | |
| 244 | - clamped = max(0.0, min(1.0, v)) | |
| 245 | - filled = int(round(clamped * width)) | |
| 246 | - return "█" * filled + "░" * (width - filled) | |
| 247 | - | |
| 248 | - | |
| 249 | -__all__ = ["to_terminal", "to_json", "to_junit", "to_markdown"] | |
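A sketch of how the four emitters above compose; `suite` and `score` are assumed to come from a prior run (see the runner and scorer deleted below):

```python
from pathlib import Path

from dlm_sway.suite import report

# Assumes `suite` (SuiteResult) and `score` (SwayScore) are already in hand.
report.to_terminal(suite, score)  # rich, human-facing verdict + evidence
Path("sway-result.json").write_text(report.to_json(suite, score))
Path("junit.xml").write_text(report.to_junit(suite, score))  # CI dashboards
Path("report.md").write_text(report.to_markdown(suite, score))  # PR comments
```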
sway/src/dlm_sway/suite/runner.pydeleted@@ -1,136 +0,0 @@ | ||
| 1 | -"""Suite runner. | |
| 2 | - | |
| 3 | -Iterates the probe list, materializes each into a ``(Probe, Spec)`` via | |
| 4 | -the registry, executes it with a :class:`~dlm_sway.probes.base.RunContext`, | |
| 5 | -and assembles a :class:`~dlm_sway.core.result.SuiteResult`. | |
| 6 | - | |
| 7 | -Runtime contract: | |
| 8 | - | |
| 9 | -- Probes are executed in declaration order (not sorted, not parallelized). | |
| 10 | - The null-adapter baseline has to run before any probe that needs z-scores, | |
| 11 | - so authoring order is load-bearing. | |
| 12 | -- A probe that raises is recorded as | |
| 13 | - :attr:`~dlm_sway.core.result.Verdict.ERROR` and the suite continues — | |
| 14 | - one broken probe doesn't torch the whole report. | |
| 15 | -- The backend is the caller's responsibility: the runner does not build | |
| 16 | - or close it, so callers can reuse a backend across multiple suites. | |
| 17 | -""" | |
| 18 | - | |
| 19 | -from __future__ import annotations | |
| 20 | - | |
| 21 | -import time | |
| 22 | - | |
| 23 | -from dlm_sway import __version__ | |
| 24 | -from dlm_sway.core.errors import ProbeError | |
| 25 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | |
| 26 | -from dlm_sway.core.scoring import DifferentialBackend | |
| 27 | -from dlm_sway.core.sections import Section | |
| 28 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 29 | -from dlm_sway.probes.null_adapter import NullAdapterSpec, get_null_stats | |
| 30 | -from dlm_sway.suite.spec import SwaySpec | |
| 31 | - | |
| 32 | - | |
| 33 | -def run( | |
| 34 | - spec: SwaySpec, | |
| 35 | - backend: DifferentialBackend, | |
| 36 | - *, | |
| 37 | - spec_path: str = "<memory>", | |
| 38 | - doc_text: str | None = None, | |
| 39 | - sections: tuple[Section, ...] | None = None, | |
| 40 | -) -> SuiteResult: | |
| 41 | - """Execute every probe in ``spec`` against ``backend``.""" | |
| 42 | - started = utcnow() | |
| 43 | - ctx = RunContext( | |
| 44 | - backend=backend, | |
| 45 | - seed=spec.defaults.seed, | |
| 46 | - top_k=spec.defaults.top_k, | |
| 47 | - sections=sections, | |
| 48 | - doc_text=doc_text, | |
| 49 | - ) | |
| 50 | - | |
| 51 | - results: list[ProbeResult] = [] | |
| 52 | - null_stats: dict[str, dict[str, float]] = {} | |
| 53 | - | |
| 54 | - for raw in spec.suite: | |
| 55 | - probe, probe_spec = build_probe(raw) | |
| 56 | - if not probe_spec.enabled: | |
| 57 | - results.append( | |
| 58 | - ProbeResult( | |
| 59 | - name=probe_spec.name, | |
| 60 | - kind=probe_spec.kind, | |
| 61 | - verdict=Verdict.SKIP, | |
| 62 | - score=None, | |
| 63 | - message="disabled in spec", | |
| 64 | - ) | |
| 65 | - ) | |
| 66 | - continue | |
| 67 | - | |
| 68 | - t0 = time.perf_counter() | |
| 69 | - try: | |
| 70 | - result = probe.run(probe_spec, ctx) | |
| 71 | - except ProbeError as exc: | |
| 72 | - result = ProbeResult( | |
| 73 | - name=probe_spec.name, | |
| 74 | - kind=probe_spec.kind, | |
| 75 | - verdict=Verdict.ERROR, | |
| 76 | - score=None, | |
| 77 | - message=str(exc), | |
| 78 | - ) | |
| 79 | - except Exception as exc: # noqa: BLE001 — probe impls may raise anything | |
| 80 | - result = ProbeResult( | |
| 81 | - name=probe_spec.name, | |
| 82 | - kind=probe_spec.kind, | |
| 83 | - verdict=Verdict.ERROR, | |
| 84 | - score=None, | |
| 85 | - message=f"{type(exc).__name__}: {exc}", | |
| 86 | - ) | |
| 87 | - duration = time.perf_counter() - t0 | |
| 88 | - # Re-stamp duration (probes don't know their own wall time). | |
| 89 | - result = _with_duration(result, duration) | |
| 90 | - results.append(result) | |
| 91 | - | |
| 92 | - # Null-adapter result seeds ctx.null_stats for subsequent probes. | |
| 93 | - if isinstance(probe_spec, NullAdapterSpec) and result.evidence.get("null_stats"): | |
| 94 | - null_stats.update(result.evidence["null_stats"]) | |
| 95 | - # RunContext is frozen; swap in a fresh one so later probes | |
| 96 | - # see the populated stats. | |
| 97 | - ctx = RunContext( | |
| 98 | - backend=ctx.backend, | |
| 99 | - seed=ctx.seed, | |
| 100 | - top_k=ctx.top_k, | |
| 101 | - sections=ctx.sections, | |
| 102 | - doc_text=ctx.doc_text, | |
| 103 | - null_stats=null_stats, | |
| 104 | - ) | |
| 105 | - | |
| 106 | - finished = utcnow() | |
| 107 | - return SuiteResult( | |
| 108 | - spec_path=spec_path, | |
| 109 | - started_at=started, | |
| 110 | - finished_at=finished, | |
| 111 | - base_model_id=spec.models.base.base, | |
| 112 | - adapter_id=str(spec.models.ft.adapter) if spec.models.ft.adapter else "", | |
| 113 | - sway_version=__version__, | |
| 114 | - probes=tuple(results), | |
| 115 | - null_stats=null_stats, | |
| 116 | - ) | |
| 117 | - | |
| 118 | - | |
| 119 | -def _with_duration(result: ProbeResult, duration: float) -> ProbeResult: | |
| 120 | - """Return a copy of ``result`` with :attr:`ProbeResult.duration_s` set.""" | |
| 121 | - return ProbeResult( | |
| 122 | - name=result.name, | |
| 123 | - kind=result.kind, | |
| 124 | - verdict=result.verdict, | |
| 125 | - score=result.score, | |
| 126 | - raw=result.raw, | |
| 127 | - z_score=result.z_score, | |
| 128 | - base_value=result.base_value, | |
| 129 | - ft_value=result.ft_value, | |
| 130 | - evidence=result.evidence, | |
| 131 | - message=result.message, | |
| 132 | - duration_s=duration, | |
| 133 | - ) | |
| 134 | - | |
| 135 | - | |
| 136 | -__all__ = ["get_null_stats", "run"] | |
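A sketch tying the runner to the rest of the suite plumbing, using the dummy backend from the deleted tests (probe entries in `sway.yaml` are illustrative):

```python
from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
from dlm_sway.suite import loader, report, runner, score

spec = loader.load_spec("sway.yaml")
backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())

# Declaration order is load-bearing: a null_adapter entry must come before
# any probe that wants z-scores, since null_stats is seeded in order.
suite = runner.run(spec, backend, spec_path="sway.yaml")
composite = score.compute(suite)
report.to_terminal(suite, composite)
# The runner never builds or closes the backend; that stays with the caller.
```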
sway/src/dlm_sway/suite/score.pydeleted@@ -1,106 +0,0 @@ | ||
| 1 | -"""Composite :class:`~dlm_sway.core.result.SwayScore` from a suite result. | |
| 2 | - | |
| 3 | -The score is a weighted mean over four categories | |
| 4 | -(adherence / attribution / calibration / ablation). Each category's | |
| 5 | -value is the weighted mean of its pass/score values (with SKIP/ERROR | |
| 6 | -excluded so a broken probe doesn't silently depress the composite). | |
| 7 | - | |
| 8 | -All weighting is explicit, user-overridable, and surfaced in the report | |
| 9 | -alongside the number — no black-box scoring. | |
| 10 | -""" | |
| 11 | - | |
| 12 | -from __future__ import annotations | |
| 13 | - | |
| 14 | -from dlm_sway.core.result import ( | |
| 15 | - DEFAULT_COMPONENT_WEIGHTS, | |
| 16 | - ProbeResult, | |
| 17 | - SuiteResult, | |
| 18 | - SwayScore, | |
| 19 | - Verdict, | |
| 20 | -) | |
| 21 | -from dlm_sway.probes.base import registry | |
| 22 | - | |
| 23 | - | |
| 24 | -def compute( | |
| 25 | - suite: SuiteResult, | |
| 26 | - *, | |
| 27 | - weights: dict[str, float] | None = None, | |
| 28 | -) -> SwayScore: | |
| 29 | - """Fold a :class:`SuiteResult` into a :class:`SwayScore`.""" | |
| 30 | - w = weights if weights is not None else dict(DEFAULT_COMPONENT_WEIGHTS) | |
| 31 | - registered = registry() | |
| 32 | - | |
| 33 | - # Bucket probes by their declared category. | |
| 34 | - buckets: dict[str, list[ProbeResult]] = {k: [] for k in w} | |
| 35 | - for r in suite.probes: | |
| 36 | - if r.verdict in {Verdict.SKIP, Verdict.ERROR}: | |
| 37 | - continue | |
| 38 | - if r.score is None: | |
| 39 | - continue | |
| 40 | - probe_cls = registered.get(r.kind) | |
| 41 | - category = probe_cls.category if probe_cls is not None else "adherence" | |
| 42 | - buckets.setdefault(category, []).append(r) | |
| 43 | - | |
| 44 | - component_scores: dict[str, float] = {} | |
| 45 | - for cat, probes in buckets.items(): | |
| 46 | - if not probes: | |
| 47 | - component_scores[cat] = 0.0 | |
| 48 | - continue | |
| 49 | - total_w = sum(max(_spec_weight(p), 0.0) for p in probes) or 1.0 | |
| 50 | - weighted = sum(max(_spec_weight(p), 0.0) * (p.score or 0.0) for p in probes) | |
| 51 | - component_scores[cat] = weighted / total_w | |
| 52 | - | |
| 53 | - # Fold to composite, weighted by the user's category weights, but | |
| 54 | - # ignoring components that had no contributing probes (so a | |
| 55 | - # PREFERENCE-free document doesn't get penalized for missing B3). | |
| 56 | - active_weights = {k: v for k, v in w.items() if buckets.get(k)} | |
| 57 | - total_w = sum(active_weights.values()) or 1.0 | |
| 58 | - overall = sum(active_weights[k] * component_scores[k] for k in active_weights) / total_w | |
| 59 | - | |
| 60 | - findings = _findings(suite, component_scores) | |
| 61 | - | |
| 62 | - return SwayScore( | |
| 63 | - overall=overall, | |
| 64 | - components=component_scores, | |
| 65 | - weights=w, | |
| 66 | - band=SwayScore.band_for(overall), | |
| 67 | - findings=findings, | |
| 68 | - ) | |
| 69 | - | |
| 70 | - | |
| 71 | -def _spec_weight(result: ProbeResult) -> float: | |
| 72 | - """Recover a probe's declared weight from its ``evidence`` payload. | |
| 73 | - | |
| 74 | - Probes stamp ``spec.weight`` into their evidence so the scorer can read | |
| 75 | - it without re-validating specs. Falls back to 1.0 when absent (older | |
| 76 | - runs, custom probes, etc). | |
| 77 | - """ | |
| 78 | - w = result.evidence.get("weight") | |
| 79 | - if isinstance(w, int | float): | |
| 80 | - return float(w) | |
| 81 | - return 1.0 | |
| 82 | - | |
| 83 | - | |
| 84 | -def _findings(suite: SuiteResult, components: dict[str, float]) -> tuple[str, ...]: | |
| 85 | - """Surface the 2–3 most diagnostic notes for the terminal report.""" | |
| 86 | - notes: list[str] = [] | |
| 87 | - | |
| 88 | - failed = [r for r in suite.probes if r.verdict == Verdict.FAIL] | |
| 89 | - if failed: | |
| 90 | - top = failed[0] | |
| 91 | - notes.append( | |
| 92 | - f"{top.name} ({top.kind}) failed" + (f": {top.message}" if top.message else "") | |
| 93 | - ) | |
| 94 | - | |
| 95 | - for cat, score in components.items(): | |
| 96 | - if score < 0.3 and score != 0.0:  # 0.0 marks a category with no probes | |
| 97 | - notes.append(f"{cat} score is {score:.2f} — below the noise threshold") | |
| 98 | - | |
| 99 | - errors = [r for r in suite.probes if r.verdict == Verdict.ERROR] | |
| 100 | - if errors: | |
| 101 | - notes.append(f"{len(errors)} probe(s) errored — see full report for details") | |
| 102 | - | |
| 103 | - return tuple(notes[:5]) | |
| 104 | - | |
| 105 | - | |
| 106 | -__all__ = ["compute"] | |
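The composite fold above is plain weighted averaging over active categories. A worked sketch with invented numbers (these are not the shipped DEFAULT_COMPONENT_WEIGHTS):

```python
# Category means (already probe-weight averaged); 'ablation' ran no probes.
components = {"adherence": 0.80, "attribution": 0.55, "calibration": 0.70}
weights = {"adherence": 0.3, "attribution": 0.4, "calibration": 0.2,
           "ablation": 0.1}

# Empty categories drop out of the fold rather than dragging it to zero.
active = {k: w for k, w in weights.items() if k in components}
overall = sum(w * components[k] for k, w in active.items()) / sum(active.values())
print(f"{overall:.3f}")  # (0.24 + 0.22 + 0.14) / 0.9 = 0.667
```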
sway/src/dlm_sway/suite/spec.pydeleted@@ -1,72 +0,0 @@ | ||
| 1 | -"""Top-level ``sway.yaml`` spec models. | |
| 2 | - | |
| 3 | -Per-probe specs live next to their implementations in | |
| 4 | -:mod:`dlm_sway.probes`. This module owns the *outer* envelope — | |
| 5 | -``version``, ``models``, ``defaults``, ``suite`` — plus the runtime | |
| 6 | -bind between raw probe dicts and registered probe classes. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from __future__ import annotations | |
| 10 | - | |
| 11 | -from typing import Annotated, Any | |
| 12 | - | |
| 13 | -from pydantic import BaseModel, ConfigDict, Field | |
| 14 | - | |
| 15 | -from dlm_sway.core.model import ModelSpec | |
| 16 | - | |
| 17 | -SUPPORTED_VERSION = 1 | |
| 18 | - | |
| 19 | - | |
| 20 | -class SuiteModels(BaseModel): | |
| 21 | - """Named model handles the suite references — ``base`` + ``ft``.""" | |
| 22 | - | |
| 23 | - model_config = ConfigDict(extra="forbid", frozen=True) | |
| 24 | - | |
| 25 | - base: ModelSpec | |
| 26 | - ft: ModelSpec | |
| 27 | - | |
| 28 | - | |
| 29 | -class SuiteDefaults(BaseModel): | |
| 30 | - """Shared defaults for the whole suite. Probes may override per-entry.""" | |
| 31 | - | |
| 32 | - model_config = ConfigDict(extra="forbid", frozen=True) | |
| 33 | - | |
| 34 | - seed: int = 0 | |
| 35 | - top_k: int = 256 | |
| 36 | - differential: bool = True | |
| 37 | - """If ``False``, the runner loads base + ft as two separate models | |
| 38 | - instead of toggling on one. More memory-heavy; only useful when a | |
| 39 | - backend can't do in-place toggling.""" | |
| 40 | - coverage_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6 | |
| 41 | - """Minimum composite score for ``dlm-sway gate`` to pass.""" | |
| 42 | - | |
| 43 | - | |
| 44 | -class SwaySpec(BaseModel): | |
| 45 | - """Root of ``sway.yaml``.""" | |
| 46 | - | |
| 47 | - model_config = ConfigDict(extra="forbid", frozen=True) | |
| 48 | - | |
| 49 | - version: int = 1 | |
| 50 | - models: SuiteModels | |
| 51 | - defaults: SuiteDefaults = SuiteDefaults() | |
| 52 | - suite: list[dict[str, Any]] = Field(default_factory=list) | |
| 53 | - """Raw probe entries. Validated one-at-a-time by the probe registry | |
| 54 | - via :func:`dlm_sway.probes.base.build_probe` so that the set of | |
| 55 | - allowed probe kinds is an open registry rather than a closed | |
| 56 | - discriminated union.""" | |
| 57 | - dlm_source: str | None = None | |
| 58 | - """Optional path to a ``.dlm`` file. When present, the runner asks | |
| 59 | - :mod:`dlm_sway.integrations.dlm.resolver` for typed sections and | |
| 60 | - hands them to probes via :attr:`RunContext.sections`. Auto-populated | |
| 61 | - by ``dlm-sway autogen``.""" | |
| 62 | - | |
| 63 | - def check_version(self) -> None: | |
| 64 | - """Raise ``ValueError`` if the spec version is unsupported. | |
| 65 | - | |
| 66 | - Called explicitly by the loader after validation so the error | |
| 67 | - surfaces with a loader-source tag rather than a pydantic stack. | |
| 68 | - """ | |
| 69 | - if self.version != SUPPORTED_VERSION: | |
| 70 | - raise ValueError( | |
| 71 | - f"unsupported sway spec version: {self.version} (this build supports {SUPPORTED_VERSION})" | |
| 72 | - ) | |
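A sketch of constructing the envelope directly instead of via YAML. Field values are placeholders, and string-to-Path coercion on `adapter` is an assumption about the pydantic models:

```python
from dlm_sway.core.model import ModelSpec
from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec

spec = SwaySpec(
    models=SuiteModels(
        base=ModelSpec(base="HuggingFaceTB/SmolLM2-135M-Instruct", kind="hf"),
        ft=ModelSpec(
            base="HuggingFaceTB/SmolLM2-135M-Instruct",
            kind="hf",
            adapter="out/my-adapter",  # hypothetical adapter path
        ),
    ),
    defaults=SuiteDefaults(seed=0, top_k=256),
    suite=[{"kind": "delta_kl", "name": "a1"}],
)
spec.check_version()  # raises ValueError if this build can't run the spec
```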
sway/src/dlm_sway/visualize.pydeleted@@ -1,137 +0,0 @@ | ||
| 1 | -"""Optional matplotlib-based visualizations. | |
| 2 | - | |
| 3 | -Behind the ``viz`` extra. Three functions cover the three plots that | |
| 4 | -make the sway report come alive in a notebook or saved PNG: | |
| 5 | - | |
| 6 | -- :func:`plot_section_sis`: per-section bar chart of effective SIS | |
| 7 | - (the flagship attribution view). | |
| 8 | -- :func:`plot_adapter_ablation`: the λ-scaled divergence curve — the | |
| 9 | - sway signature plot. | |
| 10 | -- :func:`plot_kl_histogram`: distribution of per-prompt KL divergences | |
| 11 | - (the raw data behind A1 DeltaKL). | |
| 12 | - | |
| 13 | -Each function raises :class:`~dlm_sway.core.errors.BackendNotAvailableError` | |
| 14 | -with a pip hint when matplotlib isn't installed. No function writes to | |
| 15 | -disk on your behalf — the caller decides (``fig.savefig(...)``). | |
| 16 | -""" | |
| 17 | - | |
| 18 | -from __future__ import annotations | |
| 19 | - | |
| 20 | -from typing import Any | |
| 21 | - | |
| 22 | -from dlm_sway.core.errors import BackendNotAvailableError | |
| 23 | -from dlm_sway.core.result import SuiteResult | |
| 24 | - | |
| 25 | - | |
| 26 | -def _require_mpl() -> Any: | |
| 27 | - try: | |
| 28 | - import matplotlib.pyplot as plt | |
| 29 | - | |
| 30 | - return plt | |
| 31 | - except ImportError as exc: | |
| 32 | - raise BackendNotAvailableError( | |
| 33 | - "visualize", | |
| 34 | - extra="viz", | |
| 35 | - hint="sway's visualization module needs matplotlib.", | |
| 36 | - ) from exc | |
| 37 | - | |
| 38 | - | |
| 39 | -def plot_section_sis(suite: SuiteResult) -> Any: | |
| 40 | - """Render a per-section ``effective_sis`` bar chart. | |
| 41 | - | |
| 42 | - Returns the matplotlib ``Figure``; the caller handles display / save. | |
| 43 | - """ | |
| 44 | - plt = _require_mpl() | |
| 45 | - | |
| 46 | - probe = _find_probe(suite, "section_internalization") | |
| 47 | - if probe is None or not probe.evidence.get("per_section"): | |
| 48 | - raise ValueError("suite has no section_internalization evidence to plot") | |
| 49 | - | |
| 50 | - rows: list[dict[str, Any]] = list(probe.evidence["per_section"]) | |
| 51 | - labels = [f"{row['tag'] or row['section_id'][:8]}\n({row['kind']})" for row in rows] | |
| 52 | - values = [float(row["effective_sis"]) for row in rows] | |
| 53 | - colors = ["#2ca02c" if row["passed"] else "#d62728" for row in rows] | |
| 54 | - | |
| 55 | - fig, ax = plt.subplots(figsize=(max(6.0, 0.7 * len(rows)), 4.0)) | |
| 56 | - ax.bar(range(len(rows)), values, color=colors) | |
| 57 | - ax.axhline( | |
| 58 | - float(probe.evidence.get("per_section_threshold", 0.0)), | |
| 59 | - color="gray", | |
| 60 | - linestyle="--", | |
| 61 | - linewidth=1, | |
| 62 | - label="threshold", | |
| 63 | - ) | |
| 64 | - ax.set_xticks(range(len(rows))) | |
| 65 | - ax.set_xticklabels(labels, rotation=30, ha="right") | |
| 66 | - ax.set_ylabel("effective SIS") | |
| 67 | - ax.set_title("Section Internalization Score") | |
| 68 | - ax.legend(loc="best") | |
| 69 | - fig.tight_layout() | |
| 70 | - return fig | |
| 71 | - | |
| 72 | - | |
| 73 | -def plot_adapter_ablation(suite: SuiteResult) -> Any: | |
| 74 | - """Render the signature λ-scaled divergence curve.""" | |
| 75 | - plt = _require_mpl() | |
| 76 | - | |
| 77 | - probe = _find_probe(suite, "adapter_ablation") | |
| 78 | - if probe is None or not probe.evidence.get("lambdas"): | |
| 79 | - raise ValueError("suite has no adapter_ablation evidence to plot") | |
| 80 | - | |
| 81 | - lambdas = list(probe.evidence["lambdas"]) | |
| 82 | - divs = list(probe.evidence["mean_divergence_per_lambda"]) | |
| 83 | - | |
| 84 | - fig, ax = plt.subplots(figsize=(7.0, 4.0)) | |
| 85 | - ax.plot(lambdas, divs, marker="o", linewidth=2, color="#1f77b4") | |
| 86 | - ax.axvline(1.0, color="gray", linestyle=":", linewidth=1, label="λ=1 (trained)") | |
| 87 | - sat = probe.evidence.get("saturation_lambda") | |
| 88 | - if sat is not None: | |
| 89 | - ax.axvline( | |
| 90 | - float(sat), | |
| 91 | - color="#2ca02c", | |
| 92 | - linestyle="--", | |
| 93 | - linewidth=1, | |
| 94 | - label=f"sat λ={float(sat):.2f}", | |
| 95 | - ) | |
| 96 | - ax.set_xlabel("λ (adapter scale)") | |
| 97 | - ax.set_ylabel("mean JS divergence vs λ=0") | |
| 98 | - ax.set_title( | |
| 99 | - f"Adapter Ablation (R²={float(probe.evidence.get('linearity', 0.0)):.2f}, " | |
| 100 | - f"overshoot={float(probe.evidence.get('overshoot', 0.0)):.2f})" | |
| 101 | - ) | |
| 102 | - ax.legend(loc="best") | |
| 103 | - fig.tight_layout() | |
| 104 | - return fig | |
| 105 | - | |
| 106 | - | |
| 107 | -def plot_kl_histogram(suite: SuiteResult) -> Any: | |
| 108 | - """Render the per-prompt KL distribution from a DeltaKL probe.""" | |
| 109 | - plt = _require_mpl() | |
| 110 | - | |
| 111 | - probe = _find_probe(suite, "delta_kl") | |
| 112 | - if probe is None or not probe.evidence.get("per_prompt"): | |
| 113 | - raise ValueError("suite has no delta_kl evidence to plot") | |
| 114 | - | |
| 115 | - values = list(probe.evidence["per_prompt"]) | |
| 116 | - fig, ax = plt.subplots(figsize=(7.0, 4.0)) | |
| 117 | - ax.hist(values, bins=max(5, min(20, len(values) // 2)), color="#ff7f0e", edgecolor="white") | |
| 118 | - ax.axvline( | |
| 119 | - float(probe.raw or 0.0), | |
| 120 | - color="black", | |
| 121 | - linestyle="--", | |
| 122 | - linewidth=1, | |
| 123 | - label=f"mean={float(probe.raw or 0.0):.3f}", | |
| 124 | - ) | |
| 125 | - ax.set_xlabel(probe.evidence.get("divergence_kind", "divergence")) | |
| 126 | - ax.set_ylabel("count") | |
| 127 | - ax.set_title("DeltaKL — per-prompt distribution") | |
| 128 | - ax.legend(loc="best") | |
| 129 | - fig.tight_layout() | |
| 130 | - return fig | |
| 131 | - | |
| 132 | - | |
| 133 | -def _find_probe(suite: SuiteResult, kind: str) -> Any: | |
| 134 | - for p in suite.probes: | |
| 135 | - if p.kind == kind: | |
| 136 | - return p | |
| 137 | - return None | |
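A sketch of driving the three plotters; per the module contract above, the caller owns persistence, and `suite` is assumed to be a SuiteResult whose spec ran the relevant probes:

```python
from dlm_sway import visualize

# Assumes `suite` ran section_internalization, adapter_ablation, and delta_kl.
for path, plot in [
    ("sis.png", visualize.plot_section_sis),
    ("ablation.png", visualize.plot_adapter_ablation),
    ("kl.png", visualize.plot_kl_histogram),
]:
    try:
        plot(suite).savefig(path, dpi=150)
    except ValueError as exc:  # that probe didn't run in this suite
        print(f"skipped {path}: {exc}")
```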
sway/tests/__init__.pydeletedsway/tests/conftest.pydeleted@@ -1,29 +0,0 @@ | ||
| 1 | -"""Shared test fixtures. | |
| 2 | - | |
| 3 | -Keep the default fast-test environment offline and deterministic so unit | |
| 4 | -tests stay below ~1 s per file. Integration tests override these via | |
| 5 | -their own ``conftest`` when they need network access. | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from __future__ import annotations | |
| 9 | - | |
| 10 | -import pytest | |
| 11 | - | |
| 12 | -# Import the probes package once so every shipped probe registers itself | |
| 13 | -# with the central registry. Tests that exercise build_probe("delta_kl", | |
| 14 | -# …) rely on this. | |
| 15 | -import dlm_sway.probes # noqa: F401 | |
| 16 | - | |
| 17 | - | |
| 18 | -@pytest.fixture(autouse=True) | |
| 19 | -def _offline_and_no_telemetry(monkeypatch: pytest.MonkeyPatch) -> None: | |
| 20 | - """Unit tests never touch the network. | |
| 21 | - | |
| 22 | - Any backend test that needs HF should be marked ``@pytest.mark.online`` | |
| 23 | - and clear these vars explicitly. | |
| 24 | - """ | |
| 25 | - monkeypatch.setenv("HF_HUB_OFFLINE", "1") | |
| 26 | - monkeypatch.setenv("TRANSFORMERS_OFFLINE", "1") | |
| 27 | - monkeypatch.setenv("HF_DATASETS_OFFLINE", "1") | |
| 28 | - monkeypatch.setenv("HF_HUB_DISABLE_TELEMETRY", "1") | |
| 29 | - monkeypatch.setenv("DO_NOT_TRACK", "1") | |
sway/tests/fixtures/__init__.pydeletedsway/tests/fixtures/tiny_model.pydeleted@@ -1,53 +0,0 @@ | ||
| 1 | -"""Tiny-model fixture for integration tests. | |
| 2 | - | |
| 3 | -Mirrors ``dlm.tests.fixtures.tiny_model``: session-scoped snapshot of | |
| 4 | -SmolLM2-135M-Instruct, reused across the whole test run. The model is | |
| 5 | -small enough (~280 MB on disk, ~600 MB in fp32 VRAM) to make integration | |
| 6 | -tests feasible in CI. | |
| 7 | - | |
| 8 | -Tests using this fixture must carry ``@pytest.mark.slow`` and | |
| 9 | -``@pytest.mark.online`` — the default test selection excludes both. | |
| 10 | -""" | |
| 11 | - | |
| 12 | -from __future__ import annotations | |
| 13 | - | |
| 14 | -import os | |
| 15 | -from collections.abc import Iterator | |
| 16 | -from pathlib import Path | |
| 17 | - | |
| 18 | -import pytest | |
| 19 | - | |
| 20 | -TINY_MODEL_HF_ID = "HuggingFaceTB/SmolLM2-135M-Instruct" | |
| 21 | -TINY_MODEL_REVISION = os.environ.get("DLM_SWAY_TINY_MODEL_REVISION", "main") | |
| 22 | - | |
| 23 | - | |
| 24 | -def _offline_mode() -> bool: | |
| 25 | - return os.environ.get("SWAY_OFFLINE", "0") == "1" | |
| 26 | - | |
| 27 | - | |
| 28 | -@pytest.fixture(scope="session") | |
| 29 | -def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]: | |
| 30 | - """Download (or reuse) the tiny model; yield the cached directory. | |
| 31 | - | |
| 32 | - Tests opt in via ``@pytest.mark.online``; the session-wide offline | |
| 33 | - env vars are cleared inside this fixture so ``snapshot_download`` | |
| 34 | - actually fetches. | |
| 35 | - """ | |
| 36 | - from huggingface_hub import snapshot_download | |
| 37 | - | |
| 38 | - # Clear offline env guards (set by the unit-test autouse fixture). | |
| 39 | - prior = { | |
| 40 | - k: os.environ.pop(k, None) | |
| 41 | - for k in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE") | |
| 42 | - } | |
| 43 | - try: | |
| 44 | - path = snapshot_download( | |
| 45 | - repo_id=TINY_MODEL_HF_ID, | |
| 46 | - revision=TINY_MODEL_REVISION, | |
| 47 | - local_files_only=_offline_mode(), | |
| 48 | - ) | |
| 49 | - yield Path(path) | |
| 50 | - finally: | |
| 51 | - for k, v in prior.items(): | |
| 52 | - if v is not None: | |
| 53 | - os.environ[k] = v | |
sway/tests/integration/__init__.pydeletedsway/tests/integration/conftest.pydeleted@@ -1,10 +0,0 @@ | ||
| 1 | -"""Integration-test configuration. | |
| 2 | - | |
| 3 | -Integration tests need network + heavy deps. Re-export the tiny_model | |
| 4 | -fixture here so test modules can pick it up without a long import | |
| 5 | -path. | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from __future__ import annotations | |
| 9 | - | |
| 10 | -from tests.fixtures.tiny_model import tiny_model_dir # noqa: F401 — re-export | |
sway/tests/integration/test_hf_adapter_toggle.pydeleted@@ -1,113 +0,0 @@ | ||
| 1 | -"""Integration test: PEFT ``disable_adapter`` actually changes logits. | |
| 2 | - | |
| 3 | -This is the load-bearing sanity check for the whole differential design. | |
| 4 | -If a future ``peft`` release subtly breaks the disable-context semantics, | |
| 5 | -sway's KL / SIS / ablation probes would all silently report zero signal. | |
| 6 | -We catch that here, before the rest of the test battery runs. | |
| 7 | - | |
| 8 | -The test builds a random-init LoRA adapter on a tiny model so no network | |
| 9 | -dependency beyond the base model snapshot itself. | |
| 10 | -""" | |
| 11 | - | |
| 12 | -from __future__ import annotations | |
| 13 | - | |
| 14 | -from pathlib import Path | |
| 15 | - | |
| 16 | -import pytest | |
| 17 | - | |
| 18 | -from dlm_sway.backends.hf import HuggingFaceDifferentialBackend | |
| 19 | -from dlm_sway.core.model import ModelSpec | |
| 20 | - | |
| 21 | -pytestmark = [pytest.mark.slow, pytest.mark.online] | |
| 22 | - | |
| 23 | - | |
| 24 | -def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None: | |
| 25 | - """Construct a LoRA adapter with random-init weights on ``base_dir``. | |
| 26 | - | |
| 27 | - The weights are kept small so the toggle-delta is clear but the | |
| 28 | - adapter is structurally valid (correct ``adapter_config.json``, | |
| 29 | - tokenizer files, safetensors layout). | |
| 30 | - """ | |
| 31 | - import torch | |
| 32 | - from peft import LoraConfig, get_peft_model | |
| 33 | - from transformers import AutoModelForCausalLM, AutoTokenizer | |
| 34 | - | |
| 35 | - torch.manual_seed(0) | |
| 36 | - | |
| 37 | - tokenizer = AutoTokenizer.from_pretrained(str(base_dir)) | |
| 38 | - if tokenizer.pad_token_id is None: | |
| 39 | - tokenizer.pad_token = tokenizer.eos_token | |
| 40 | - base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32) | |
| 41 | - | |
| 42 | - cfg = LoraConfig( | |
| 43 | - r=8, | |
| 44 | - lora_alpha=16, | |
| 45 | - target_modules=["q_proj", "v_proj"], | |
| 46 | - lora_dropout=0.0, | |
| 47 | - bias="none", | |
| 48 | - task_type="CAUSAL_LM", | |
| 49 | - ) | |
| 50 | - peft_model = get_peft_model(base, cfg) | |
| 51 | - | |
| 52 | - # Explicitly scale lora_B out of its PEFT-default zero-init so the | |
| 53 | - # adapter actually changes outputs. Real training does this via | |
| 54 | - # gradients; we do it with a scaled normal. | |
| 55 | - with torch.no_grad(): | |
| 56 | - for name, param in peft_model.named_parameters(): | |
| 57 | - if "lora_B" in name: | |
| 58 | - param.copy_(torch.randn_like(param) * 0.05) | |
| 59 | - | |
| 60 | - peft_model.save_pretrained(str(out_dir)) | |
| 61 | - tokenizer.save_pretrained(str(out_dir)) | |
| 62 | - | |
| 63 | - | |
| 64 | -@pytest.fixture(scope="module") | |
| 65 | -def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path: | |
| 66 | - adapter_dir = tmp_path_factory.mktemp("random-adapter") | |
| 67 | - _build_random_lora_adapter(tiny_model_dir, adapter_dir) | |
| 68 | - return adapter_dir | |
| 69 | - | |
| 70 | - | |
| 71 | -def test_disable_adapter_changes_logits(tiny_model_dir: Path, random_adapter: Path) -> None: | |
| 72 | - """The keystone invariant: base view ≠ ft view on the same prompt.""" | |
| 73 | - import numpy as np | |
| 74 | - | |
| 75 | - backend = HuggingFaceDifferentialBackend( | |
| 76 | - base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"), | |
| 77 | - adapter_path=random_adapter, | |
| 78 | - ) | |
| 79 | - try: | |
| 80 | - prompt = "The quick brown fox" | |
| 81 | - with backend.as_base() as b: | |
| 82 | - base_dist = b.next_token_dist(prompt, top_k=32) | |
| 83 | - with backend.as_finetuned() as f: | |
| 84 | - ft_dist = f.next_token_dist(prompt, top_k=32) | |
| 85 | - | |
| 86 | - # Top-k indices may shift under the adapter, so don't assume an | |
| 87 | - # identical ordering: either the ids or the logprobs must differ. | |
| 88 | - assert not np.array_equal(base_dist.token_ids, ft_dist.token_ids) or not np.allclose( | |
| 89 | - base_dist.logprobs, ft_dist.logprobs, atol=1e-5 | |
| 90 | - ), "adapter toggle did not change next-token distribution" | |
| 91 | - finally: | |
| 92 | - backend.close() | |
| 93 | - | |
| 94 | - | |
| 95 | -def test_roundtrip_toggle_restores_base(tiny_model_dir: Path, random_adapter: Path) -> None: | |
| 96 | - """as_base → as_finetuned → as_base yields a stable base view.""" | |
| 97 | - import numpy as np | |
| 98 | - | |
| 99 | - backend = HuggingFaceDifferentialBackend( | |
| 100 | - base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"), | |
| 101 | - adapter_path=random_adapter, | |
| 102 | - ) | |
| 103 | - try: | |
| 104 | - prompt = "hello" | |
| 105 | - with backend.as_base() as b: | |
| 106 | - first = b.next_token_dist(prompt, top_k=16).logprobs | |
| 107 | - with backend.as_finetuned() as f: | |
| 108 | - f.next_token_dist(prompt, top_k=16) # toggle | |
| 109 | - with backend.as_base() as b: | |
| 110 | - second = b.next_token_dist(prompt, top_k=16).logprobs | |
| 111 | - np.testing.assert_allclose(first, second, rtol=1e-5, atol=1e-6) | |
| 112 | - finally: | |
| 113 | - backend.close() | |
sway/tests/unit/__init__.pydeletedsway/tests/unit/test_backend_dummy.pydeleted@@ -1,102 +0,0 @@ | ||
| 1 | -"""Tests for :class:`dlm_sway.backends.dummy.DummyDifferentialBackend`. | |
| 2 | - | |
| 3 | -The dummy backend is used by every downstream probe unit test, so it | |
| 4 | -gets thorough tests in its own right here. These also verify the view-exclusion | |
| 5 | -invariant that catches stale-view bugs in probes. | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from __future__ import annotations | |
| 9 | - | |
| 10 | -import numpy as np | |
| 11 | -import pytest | |
| 12 | - | |
| 13 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 14 | -from dlm_sway.core.model import Model | |
| 15 | -from dlm_sway.core.scoring import DifferentialBackend, ScoringBackend | |
| 16 | - | |
| 17 | - | |
| 18 | -@pytest.fixture | |
| 19 | -def backend() -> DummyDifferentialBackend: | |
| 20 | - base = DummyResponses( | |
| 21 | - generations={"hi": "hello"}, | |
| 22 | - logprobs={("q", "a"): -3.0}, | |
| 23 | - ) | |
| 24 | - ft = DummyResponses( | |
| 25 | - generations={"hi": "greetings, traveler"}, | |
| 26 | - logprobs={("q", "a"): -1.2}, | |
| 27 | - ) | |
| 28 | - return DummyDifferentialBackend(base=base, ft=ft) | |
| 29 | - | |
| 30 | - | |
| 31 | -class TestViews: | |
| 32 | - def test_as_base_and_as_ft_yield_distinct_generations( | |
| 33 | - self, backend: DummyDifferentialBackend | |
| 34 | - ) -> None: | |
| 35 | - with backend.as_base() as b: | |
| 36 | - assert b.generate("hi", max_new_tokens=5) == "hello" | |
| 37 | - with backend.as_finetuned() as f: | |
| 38 | - assert f.generate("hi", max_new_tokens=5) == "greetings, traveler" | |
| 39 | - | |
| 40 | - def test_logprob_differs_between_modes(self, backend: DummyDifferentialBackend) -> None: | |
| 41 | - with backend.as_base() as b: | |
| 42 | - base_score = b.logprob_of("q", "a") | |
| 43 | - with backend.as_finetuned() as f: | |
| 44 | - ft_score = f.logprob_of("q", "a") | |
| 45 | - assert base_score == -3.0 | |
| 46 | - assert ft_score == -1.2 | |
| 47 | - | |
| 48 | - def test_missing_generation_raises_keyerror(self, backend: DummyDifferentialBackend) -> None: | |
| 49 | - with backend.as_base() as b, pytest.raises(KeyError, match="no canned generation"): | |
| 50 | - b.generate("unconfigured", max_new_tokens=1) | |
| 51 | - | |
| 52 | - def test_missing_logprob_default(self, backend: DummyDifferentialBackend) -> None: | |
| 53 | - with backend.as_base() as b: | |
| 54 | - assert b.logprob_of("nonexistent", "target") == -10.0 | |
| 55 | - | |
| 56 | - | |
| 57 | -class TestRollingLogprob: | |
| 58 | - def test_synthesized_when_not_preseeded(self, backend: DummyDifferentialBackend) -> None: | |
| 59 | - with backend.as_base() as b: | |
| 60 | - r = b.rolling_logprob("a quick brown fox jumps") | |
| 61 | - assert r.num_tokens == 5 | |
| 62 | - assert r.logprobs.size == 4 | |
| 63 | - assert np.all(r.logprobs == -2.0) | |
| 64 | - | |
| 65 | - def test_ft_perplexity_lower_than_base(self, backend: DummyDifferentialBackend) -> None: | |
| 66 | - text = "a quick brown fox" | |
| 67 | - with backend.as_base() as b: | |
| 68 | - pb = b.rolling_logprob(text).perplexity | |
| 69 | - with backend.as_finetuned() as f: | |
| 70 | - pf = f.rolling_logprob(text).perplexity | |
| 71 | - assert pf < pb # synthesized ft is less perplexed → lower PPL | |
| 72 | - | |
| 73 | - | |
| 74 | -class TestTokenDist: | |
| 75 | - def test_dists_differ_between_modes(self, backend: DummyDifferentialBackend) -> None: | |
| 76 | - with backend.as_base() as b: | |
| 77 | - base_dist = b.next_token_dist("any prompt") | |
| 78 | - with backend.as_finetuned() as f: | |
| 79 | - ft_dist = f.next_token_dist("any prompt") | |
| 80 | - assert not np.array_equal(base_dist.logprobs, ft_dist.logprobs) | |
| 81 | - | |
| 82 | - | |
| 83 | -class TestInvariants: | |
| 84 | - def test_protocol_satisfaction(self, backend: DummyDifferentialBackend) -> None: | |
| 85 | - assert isinstance(backend, DifferentialBackend) | |
| 86 | - with backend.as_base() as view: | |
| 87 | - assert isinstance(view, Model) | |
| 88 | - assert isinstance(view, ScoringBackend) | |
| 89 | - | |
| 90 | - def test_nested_views_rejected(self, backend: DummyDifferentialBackend) -> None: | |
| 91 | - with backend.as_base(), pytest.raises(RuntimeError, match="view already active"): | |
| 92 | - with backend.as_finetuned(): | |
| 93 | - pass | |
| 94 | - | |
| 95 | - def test_sequential_views_fine(self, backend: DummyDifferentialBackend) -> None: | |
| 96 | - # Must be able to re-enter after exiting — common pattern in probes. | |
| 97 | - with backend.as_base() as b: | |
| 98 | - b.logprob_of("q", "a") | |
| 99 | - with backend.as_finetuned() as f: | |
| 100 | - f.logprob_of("q", "a") | |
| 101 | - with backend.as_base() as b: | |
| 102 | - b.logprob_of("q", "a") | |
sway/tests/unit/test_backend_registry.pydeleted@@ -1,133 +0,0 @@ | ||
| 1 | -"""Tests for the backend registry in ``dlm_sway.backends``. | |
| 2 | - | |
| 3 | -The registry is the single place that maps a ModelSpec to a concrete | |
| 4 | -backend. These tests check the error paths — actually materializing an | |
| 5 | -HF backend requires model weights and is covered by the integration | |
| 6 | -suite. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from __future__ import annotations | |
| 10 | - | |
| 11 | -from pathlib import Path | |
| 12 | - | |
| 13 | -import pytest | |
| 14 | - | |
| 15 | -from dlm_sway.backends import build | |
| 16 | -from dlm_sway.core.errors import BackendNotAvailableError, SpecValidationError | |
| 17 | -from dlm_sway.core.model import ModelSpec | |
| 18 | - | |
| 19 | - | |
| 20 | -class TestRegistry: | |
| 21 | - def test_dummy_rejected_via_build(self) -> None: | |
| 22 | - with pytest.raises(SpecValidationError, match="kind='dummy'"): | |
| 23 | - build(ModelSpec(base="x", kind="dummy")) | |
| 24 | - | |
| 25 | - def test_hf_requires_adapter(self) -> None: | |
| 26 | - with pytest.raises(SpecValidationError, match="adapter"): | |
| 27 | - build(ModelSpec(base="x", kind="hf")) | |
| 28 | - | |
| 29 | - def test_mlx_requires_adapter(self) -> None: | |
| 30 | - with pytest.raises(SpecValidationError, match="adapter"): | |
| 31 | - build(ModelSpec(base="x", kind="mlx")) | |
| 32 | - | |
| 33 | - def test_mlx_dispatch_raises_when_mlx_missing(self) -> None: | |
| 34 | - # On non-Apple-Silicon (or Apple without mlx installed), constructing | |
| 35 | - # the MLX backend raises BackendNotAvailableError with a pip hint. | |
| 36 | - # We skip the test entirely if mlx happens to be installed. | |
| 37 | - import importlib.util | |
| 38 | - | |
| 39 | - if importlib.util.find_spec("mlx") is not None: | |
| 40 | - pytest.skip("mlx is installed; error path not exercised") | |
| 41 | - with pytest.raises(BackendNotAvailableError) as exc_info: | |
| 42 | - build(ModelSpec(base="x", kind="mlx", adapter=Path("/tmp/a"))) | |
| 43 | - assert exc_info.value.backend == "mlx" | |
| 44 | - | |
| 45 | - def test_custom_requires_entry_point(self) -> None: | |
| 46 | - with pytest.raises(SpecValidationError, match="entry_point"): | |
| 47 | - build(ModelSpec(base="x", kind="custom", adapter=Path("/tmp/a"))) | |
| 48 | - | |
| 49 | - def test_custom_validates_entry_point_shape(self) -> None: | |
| 50 | - with pytest.raises(SpecValidationError, match="pkg.module:ClassName"): | |
| 51 | - build( | |
| 52 | - ModelSpec( | |
| 53 | - base="x", | |
| 54 | - kind="custom", | |
| 55 | - entry_point="not_a_valid_entry_point", | |
| 56 | - adapter=Path("/tmp/a"), | |
| 57 | - ) | |
| 58 | - ) | |
| 59 | - | |
| 60 | - def test_custom_rejects_unimportable_module(self) -> None: | |
| 61 | - with pytest.raises(SpecValidationError, match="cannot import"): | |
| 62 | - build( | |
| 63 | - ModelSpec( | |
| 64 | - base="x", | |
| 65 | - kind="custom", | |
| 66 | - entry_point="nonexistent_pkg_xyz:Backend", | |
| 67 | - adapter=Path("/tmp/a"), | |
| 68 | - ) | |
| 69 | - ) | |
| 70 | - | |
| 71 | - def test_custom_rejects_missing_class(self) -> None: | |
| 72 | - with pytest.raises(SpecValidationError, match="has no attribute"): | |
| 73 | - build( | |
| 74 | - ModelSpec( | |
| 75 | - base="x", | |
| 76 | - kind="custom", | |
| 77 | - entry_point="dlm_sway:NoSuchClass", | |
| 78 | - adapter=Path("/tmp/a"), | |
| 79 | - ) | |
| 80 | - ) | |
| 81 | - | |
| 82 | - def test_custom_rejects_non_differential_class(self) -> None: | |
| 83 | - # A class that accepts the canonical constructor args but doesn't | |
| 84 | - # implement the protocol. | |
| 85 | - import sys | |
| 86 | - import types | |
| 87 | - | |
| 88 | - class _Bad: | |
| 89 | - def __init__(self, base_spec, adapter_path): # type: ignore[no-untyped-def] | |
| 90 | - del base_spec, adapter_path | |
| 91 | - | |
| 92 | - mod = types.ModuleType("_sway_bad_mod") | |
| 93 | - mod.Bad = _Bad # type: ignore[attr-defined] | |
| 94 | - sys.modules["_sway_bad_mod"] = mod | |
| 95 | - | |
| 96 | - with pytest.raises(SpecValidationError, match="DifferentialBackend"): | |
| 97 | - build( | |
| 98 | - ModelSpec( | |
| 99 | - base="x", | |
| 100 | - kind="custom", | |
| 101 | - entry_point="_sway_bad_mod:Bad", | |
| 102 | - adapter=Path("/tmp/a"), | |
| 103 | - ) | |
| 104 | - ) | |
| 105 | - | |
| 106 | - def test_custom_dispatches_to_valid_backend(self) -> None: | |
| 107 | - # Use the dummy backend via a custom entry point. The dummy class's | |
| 108 | - # __init__ takes different args, so we write a thin adapter class. | |
| 109 | - from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 110 | - | |
| 111 | - class _AdapterBackend(DummyDifferentialBackend): | |
| 112 | - def __init__(self, base_spec, adapter_path): # type: ignore[no-untyped-def] | |
| 113 | - super().__init__(base=DummyResponses(), ft=DummyResponses()) | |
| 114 | - | |
| 115 | - # Register on a throwaway module we can find by name. | |
| 116 | - import sys | |
| 117 | - import types | |
| 118 | - | |
| 119 | - mod = types.ModuleType("_sway_custom_test_mod") | |
| 120 | - mod.AdapterBackend = _AdapterBackend # type: ignore[attr-defined] | |
| 121 | - sys.modules["_sway_custom_test_mod"] = mod | |
| 122 | - | |
| 123 | - backend = build( | |
| 124 | - ModelSpec( | |
| 125 | - base="x", | |
| 126 | - kind="custom", | |
| 127 | - entry_point="_sway_custom_test_mod:AdapterBackend", | |
| 128 | - adapter=Path("/tmp/a"), | |
| 129 | - ) | |
| 130 | - ) | |
| 131 | - from dlm_sway.core.scoring import DifferentialBackend | |
| 132 | - | |
| 133 | - assert isinstance(backend, DifferentialBackend) | |
sway/tests/unit/test_cli.pydeleted@@ -1,92 +0,0 @@ | ||
| 1 | -"""Smoke tests for the dlm-sway CLI. | |
| 2 | - | |
| 3 | -We avoid exercising backends (they need real models) and instead test | |
| 4 | -arg parsing, error paths, and the read-only commands (``doctor``, | |
| 5 | -``report``, and the help surface). | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from __future__ import annotations | |
| 9 | - | |
| 10 | -import json | |
| 11 | -from pathlib import Path | |
| 12 | - | |
| 13 | -from typer.testing import CliRunner | |
| 14 | - | |
| 15 | -from dlm_sway.cli.app import app | |
| 16 | - | |
| 17 | - | |
| 18 | -def test_version_exits_zero() -> None: | |
| 19 | - result = CliRunner().invoke(app, ["--version"]) | |
| 20 | - assert result.exit_code == 0 | |
| 21 | - assert "dlm-sway" in result.stdout | |
| 22 | - | |
| 23 | - | |
| 24 | -def test_help_lists_all_commands() -> None: | |
| 25 | - result = CliRunner().invoke(app, ["--help"]) | |
| 26 | - assert result.exit_code == 0 | |
| 27 | - for cmd in ("run", "gate", "check", "diff", "autogen", "doctor", "report"): | |
| 28 | - assert cmd in result.stdout | |
| 29 | - | |
| 30 | - | |
| 31 | -def test_doctor_runs(capsys) -> None: # type: ignore[no-untyped-def] | |
| 32 | - result = CliRunner().invoke(app, ["doctor"]) | |
| 33 | - assert result.exit_code == 0 | |
| 34 | - # Rich applies color codes by default; assert the bare product name appears. | |
| 35 | - assert "dlm-sway" in result.stdout | |
| 36 | - assert "backends" in result.stdout | |
| 37 | - | |
| 38 | - | |
| 39 | -def test_run_without_file_errors(tmp_path: Path) -> None: | |
| 40 | - missing = tmp_path / "nope.yaml" | |
| 41 | - result = CliRunner().invoke(app, ["run", str(missing)]) | |
| 42 | - # Exit code 2 = SwayError bubble-up; 1 = typer missing-arg; accept either. | |
| 43 | - assert result.exit_code != 0 | |
| 44 | - | |
| 45 | - | |
| 46 | -def test_report_from_json(tmp_path: Path) -> None: | |
| 47 | - sample = { | |
| 48 | - "schema_version": 1, | |
| 49 | - "sway_version": "0.1.0.dev0", | |
| 50 | - "base_model_id": "base", | |
| 51 | - "adapter_id": "adp", | |
| 52 | - "score": {"overall": 0.7, "band": "healthy", "components": {}, "findings": []}, | |
| 53 | - "probes": [ | |
| 54 | - { | |
| 55 | - "name": "p1", | |
| 56 | - "kind": "delta_kl", | |
| 57 | - "verdict": "pass", | |
| 58 | - "score": 0.7, | |
| 59 | - "message": "ok", | |
| 60 | - }, | |
| 61 | - ], | |
| 62 | - } | |
| 63 | - path = tmp_path / "result.json" | |
| 64 | - path.write_text(json.dumps(sample), encoding="utf-8") | |
| 65 | - | |
| 66 | - terminal = CliRunner().invoke(app, ["report", str(path)]) | |
| 67 | - assert terminal.exit_code == 0 | |
| 68 | - assert "p1" in terminal.stdout | |
| 69 | - | |
| 70 | - md = CliRunner().invoke(app, ["report", str(path), "--format", "md"]) | |
| 71 | - assert md.exit_code == 0 | |
| 72 | - assert "dlm-sway report" in md.stdout | |
| 73 | - | |
| 74 | - junit = CliRunner().invoke(app, ["report", str(path), "--format", "junit"]) | |
| 75 | - assert junit.exit_code == 0 | |
| 76 | - assert "<testsuite" in junit.stdout | |
| 77 | - | |
| 78 | - | |
| 79 | -def test_autogen_without_dlm_extra_exits_nonzero(tmp_path: Path, monkeypatch) -> None: # type: ignore[no-untyped-def] | |
| 80 | - # Force the import path to fail so the CLI prints the extra hint. | |
| 81 | - import builtins | |
| 82 | - | |
| 83 | - real_import = builtins.__import__ | |
| 84 | - | |
| 85 | - def fake_import(name: str, *args: object, **kwargs: object): # type: ignore[no-untyped-def] | |
| 86 | - if name.startswith("dlm_sway.integrations.dlm"): | |
| 87 | - raise ImportError("simulated missing extra") | |
| 88 | - return real_import(name, *args, **kwargs) # type: ignore[no-untyped-call] | |
| 89 | - | |
| 90 | - monkeypatch.setattr(builtins, "__import__", fake_import) | |
| 91 | - result = CliRunner().invoke(app, ["autogen", "any.dlm"]) | |
| 92 | - assert result.exit_code != 0 | |
sway/tests/unit/test_determinism.pydeleted@@ -1,47 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.core.determinism`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -import os | |
| 6 | -import random | |
| 7 | - | |
| 8 | -import numpy as np | |
| 9 | - | |
| 10 | -from dlm_sway.core.determinism import DeterminismSummary, seed_everything | |
| 11 | - | |
| 12 | - | |
| 13 | -class TestSeedEverything: | |
| 14 | - def test_returns_summary(self) -> None: | |
| 15 | - summary = seed_everything(0) | |
| 16 | - assert isinstance(summary, DeterminismSummary) | |
| 17 | - assert summary.seed == 0 | |
| 18 | - assert summary.class_ in {"strict", "best_effort", "loose"} | |
| 19 | - | |
| 20 | - def test_idempotent_for_stdlib_random(self) -> None: | |
| 21 | - seed_everything(42) | |
| 22 | - a = [random.random() for _ in range(5)] | |
| 23 | - seed_everything(42) | |
| 24 | - b = [random.random() for _ in range(5)] | |
| 25 | - assert a == b | |
| 26 | - | |
| 27 | - def test_idempotent_for_numpy(self) -> None: | |
| 28 | - seed_everything(17) | |
| 29 | - a = np.random.rand(5) | |
| 30 | - seed_everything(17) | |
| 31 | - b = np.random.rand(5) | |
| 32 | - np.testing.assert_array_equal(a, b) | |
| 33 | - | |
| 34 | - def test_cublas_workspace_set_under_strict(self) -> None: | |
| 35 | - os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) | |
| 36 | - seed_everything(0, strict=True) | |
| 37 | - assert os.environ.get("CUBLAS_WORKSPACE_CONFIG") == ":4096:8" | |
| 38 | - | |
| 39 | - def test_non_strict_does_not_set_cublas(self) -> None: | |
| 40 | - os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) | |
| 41 | - seed_everything(0, strict=False) | |
| 42 | - # Non-strict mode must not leak the env var in either direction; | |
| 43 | - # the host environment's prior value wins. | |
| 44 | - assert ( | |
| 45 | - "CUBLAS_WORKSPACE_CONFIG" not in os.environ | |
| 46 | - or os.environ["CUBLAS_WORKSPACE_CONFIG"] != ":4096:8" | |
| 47 | - ) | |
sway/tests/unit/test_divergence.pydeleted@@ -1,73 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes._divergence`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -import math | |
| 6 | - | |
| 7 | -import numpy as np | |
| 8 | - | |
| 9 | -from dlm_sway.core.scoring import TokenDist | |
| 10 | -from dlm_sway.probes._divergence import aligned_probs, divergence, js, kl | |
| 11 | - | |
| 12 | - | |
| 13 | -def _dist(ids: list[int], probs: list[float], vocab: int = 100) -> TokenDist: | |
| 14 | - return TokenDist( | |
| 15 | - token_ids=np.asarray(ids, dtype=np.int64), | |
| 16 | - logprobs=np.log(np.asarray(probs, dtype=np.float32)), | |
| 17 | - vocab_size=vocab, | |
| 18 | - ) | |
| 19 | - | |
| 20 | - | |
| 21 | -class TestAligned: | |
| 22 | - def test_identical_distributions(self) -> None: | |
| 23 | - d = _dist([1, 2, 3], [0.5, 0.3, 0.2]) | |
| 24 | - p, q = aligned_probs(d, d) | |
| 25 | - np.testing.assert_allclose(p, q) | |
| 26 | - | |
| 27 | - def test_union_support_fills_missing(self) -> None: | |
| 28 | - base = _dist([1, 2, 3], [0.5, 0.3, 0.2]) | |
| 29 | - ft = _dist([2, 3, 4], [0.4, 0.4, 0.2]) | |
| 30 | - p, q = aligned_probs(base, ft) | |
| 31 | - assert p.shape == (4,) | |
| 32 | - assert abs(p.sum() - 1.0) < 1e-9 | |
| 33 | - assert abs(q.sum() - 1.0) < 1e-9 | |
| 34 | - | |
| 35 | - | |
| 36 | -class TestKL: | |
| 37 | - def test_zero_when_equal(self) -> None: | |
| 38 | - p = np.array([0.5, 0.3, 0.2]) | |
| 39 | - assert kl(p, p) == 0.0 | |
| 40 | - | |
| 41 | - def test_positive_when_different(self) -> None: | |
| 42 | - p = np.array([0.7, 0.2, 0.1]) | |
| 43 | - q = np.array([0.2, 0.3, 0.5]) | |
| 44 | - assert kl(p, q) > 0.0 | |
| 45 | - | |
| 46 | - | |
| 47 | -class TestJS: | |
| 48 | - def test_zero_when_equal(self) -> None: | |
| 49 | - p = np.array([0.5, 0.3, 0.2]) | |
| 50 | - assert js(p, p) == 0.0 | |
| 51 | - | |
| 52 | - def test_symmetric(self) -> None: | |
| 53 | - p = np.array([0.7, 0.2, 0.1]) | |
| 54 | - q = np.array([0.2, 0.3, 0.5]) | |
| 55 | - assert math.isclose(js(p, q), js(q, p), rel_tol=1e-9) | |
| 56 | - | |
| 57 | - def test_bounded_by_ln2(self) -> None: | |
| 58 | - p = np.array([1.0, 0.0]) | |
| 59 | - q = np.array([0.0, 1.0]) | |
| 60 | - # With zeros handled as 0·log0 = 0 this attains the ln(2) bound. | |
| 61 | - assert js(p, q) <= math.log(2.0) + 1e-9 | |
| 62 | - | |
| 63 | - | |
| 64 | -class TestDivergenceDispatch: | |
| 65 | - def test_default_is_js(self) -> None: | |
| 66 | - d1 = _dist([1, 2], [0.6, 0.4]) | |
| 67 | - d2 = _dist([1, 2], [0.3, 0.7]) | |
| 68 | - assert divergence(d1, d2) == divergence(d1, d2, kind="js") | |
| 69 | - | |
| 70 | - def test_kl_available(self) -> None: | |
| 71 | - d1 = _dist([1, 2], [0.6, 0.4]) | |
| 72 | - d2 = _dist([1, 2], [0.3, 0.7]) | |
| 73 | - assert divergence(d1, d2, kind="kl") >= 0.0 | |
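For reference, the divergence properties asserted above (``kl(p, p) == 0``, JS symmetry, the ln 2 bound) follow directly from the standard definitions. A dense-vector sketch in natural log; the shipped module additionally handles sparse top-k ``TokenDist`` alignment, which is omitted here.

```python
import numpy as np


def kl(p: np.ndarray, q: np.ndarray, eps: float = 1e-12) -> float:
    """KL(p || q) in nats, with 0*log(0) treated as 0."""
    p = np.asarray(p, dtype=np.float64)
    q = np.asarray(q, dtype=np.float64)
    mask = p > 0
    return float(np.sum(p[mask] * np.log(p[mask] / np.maximum(q[mask], eps))))


def js(p: np.ndarray, q: np.ndarray) -> float:
    """Jensen-Shannon divergence: symmetric and bounded by ln 2."""
    m = 0.5 * (np.asarray(p, np.float64) + np.asarray(q, np.float64))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)


# Disjoint point masses hit the bound exactly: js([1,0],[0,1]) == ln 2.
assert abs(js(np.array([1.0, 0.0]), np.array([0.0, 1.0])) - np.log(2.0)) < 1e-12
```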
sway/tests/unit/test_dlm_bridge.pydeleted@@ -1,252 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.integrations.dlm`. | |
| 2 | - | |
| 3 | -The bridge imports ``dlm.*`` modules lazily. We mock those via | |
| 4 | -``sys.modules`` injection so the tests run without the ``dlm-sway[dlm]`` | |
| 5 | -extra installed. A full end-to-end integration test against a real | |
| 6 | -``.dlm`` lives under ``tests/integration/``. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from __future__ import annotations | |
| 10 | - | |
| 11 | -import sys | |
| 12 | -import types | |
| 13 | -from dataclasses import dataclass | |
| 14 | -from pathlib import Path | |
| 15 | - | |
| 16 | -import pytest | |
| 17 | -import yaml | |
| 18 | - | |
| 19 | - | |
| 20 | -@pytest.fixture | |
| 21 | -def fake_dlm(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path: | |
| 22 | - """Install a fake ``dlm`` package so the resolver can import.""" | |
| 23 | - | |
| 24 | - # Build synthetic parsed .dlm structure. | |
| 25 | - @dataclass | |
| 26 | - class _Frontmatter: | |
| 27 | - dlm_id: str = "01TESTULID" | |
| 28 | - base_model: str = "smollm2-135m" | |
| 29 | - | |
| 30 | - @dataclass | |
| 31 | - class _Section: | |
| 32 | - section_id: str | |
| 33 | - type: str | |
| 34 | - content: str | |
| 35 | - tag: str | None = None | |
| 36 | - | |
| 37 | - @dataclass | |
| 38 | - class _Parsed: | |
| 39 | - frontmatter: _Frontmatter | |
| 40 | - sections: tuple[_Section, ...] | |
| 41 | - | |
| 42 | - def _parse_file(_path: Path): # type: ignore[no-untyped-def] | |
| 43 | - return _Parsed( | |
| 44 | - frontmatter=_Frontmatter(), | |
| 45 | - sections=( | |
| 46 | - _Section( | |
| 47 | - section_id="prose-1", | |
| 48 | - type="PROSE", | |
| 49 | - content="This is a prose section with some information. Further detail follows.", | |
| 50 | - ), | |
| 51 | - _Section( | |
| 52 | - section_id="instr-1", | |
| 53 | - type="INSTRUCTION", | |
| 54 | - content="### Q\nWhat is X?\n\n### A\nX is a concept\n", | |
| 55 | - ), | |
| 56 | - _Section( | |
| 57 | - section_id="pref-1", | |
| 58 | - type="PREFERENCE", | |
| 59 | - content="chosen/rejected triple", | |
| 60 | - ), | |
| 61 | - ), | |
| 62 | - ) | |
| 63 | - | |
| 64 | - # Fake ``dlm.doc.parser`` module. | |
| 65 | - dlm_pkg = types.ModuleType("dlm") | |
| 66 | - dlm_doc = types.ModuleType("dlm.doc") | |
| 67 | - dlm_doc_parser = types.ModuleType("dlm.doc.parser") | |
| 68 | - dlm_doc_parser.parse_file = _parse_file # type: ignore[attr-defined] | |
| 69 | - | |
| 70 | - # Fake ``dlm.store.paths`` that returns a resolvable path. | |
| 71 | - dlm_store = types.ModuleType("dlm.store") | |
| 72 | - dlm_store_paths = types.ModuleType("dlm.store.paths") | |
| 73 | - | |
| 74 | - adapter_dir = tmp_path / "adapter_v1" | |
| 75 | - adapter_dir.mkdir() | |
| 76 | - (adapter_dir / "adapter_config.json").write_text("{}", encoding="utf-8") | |
| 77 | - | |
| 78 | - class _StorePath: | |
| 79 | - def __init__(self, path: Path) -> None: | |
| 80 | - self._p = path | |
| 81 | - | |
| 82 | - def resolve_current_adapter(self) -> Path: | |
| 83 | - return self._p | |
| 84 | - | |
| 85 | - def _for_dlm(_dlm_id: str) -> _StorePath: | |
| 86 | - return _StorePath(adapter_dir) | |
| 87 | - | |
| 88 | - dlm_store_paths.StorePath = _StorePath # type: ignore[attr-defined] | |
| 89 | - dlm_store_paths.for_dlm = _for_dlm # type: ignore[attr-defined] | |
| 90 | - | |
| 91 | - # Fake base-model resolver — returns a stub with an ``hf_id`` attribute. | |
| 92 | - dlm_base = types.ModuleType("dlm.base_models") | |
| 93 | - | |
| 94 | - @dataclass | |
| 95 | - class _BaseSpec: | |
| 96 | - hf_id: str | |
| 97 | - key: str | |
| 98 | - | |
| 99 | - def _resolve(key: str) -> _BaseSpec: | |
| 100 | - return _BaseSpec(hf_id="HuggingFaceTB/SmolLM2-135M-Instruct", key=key) | |
| 101 | - | |
| 102 | - dlm_base.resolve = _resolve # type: ignore[attr-defined] | |
| 103 | - | |
| 104 | - # Fake instruction / preference parsers. | |
| 105 | - dlm_data = types.ModuleType("dlm.data") | |
| 106 | - dlm_data_instr = types.ModuleType("dlm.data.instruction_parser") | |
| 107 | - dlm_data_pref = types.ModuleType("dlm.data.preference_parser") | |
| 108 | - | |
| 109 | - @dataclass | |
| 110 | - class _QAPair: | |
| 111 | - question: str | |
| 112 | - answer: str | |
| 113 | - | |
| 114 | - @dataclass | |
| 115 | - class _Triple: | |
| 116 | - prompt: str | |
| 117 | - chosen: str | |
| 118 | - rejected: str | |
| 119 | - | |
| 120 | - def _parse_instr(body: str, *, section_id: str) -> list[_QAPair]: | |
| 121 | - del section_id | |
| 122 | - out: list[_QAPair] = [] | |
| 123 | - parts = body.split("### Q") | |
| 124 | - for part in parts[1:]: | |
| 125 | - q_block, _, a_block = part.partition("### A") | |
| 126 | - q = q_block.strip() | |
| 127 | - a = a_block.strip() | |
| 128 | - if q and a: | |
| 129 | - out.append(_QAPair(question=q, answer=a)) | |
| 130 | - return out | |
| 131 | - | |
| 132 | - def _parse_pref(body: str, *, section_id: str) -> list[_Triple]: | |
| 133 | - del body, section_id | |
| 134 | - return [_Triple(prompt="Which?", chosen="good answer", rejected="bad answer")] | |
| 135 | - | |
| 136 | - dlm_data_instr.parse_instruction_body = _parse_instr # type: ignore[attr-defined] | |
| 137 | - dlm_data_pref.parse_preference_body = _parse_pref # type: ignore[attr-defined] | |
| 138 | - | |
| 139 | - monkeypatch.setitem(sys.modules, "dlm", dlm_pkg) | |
| 140 | - monkeypatch.setitem(sys.modules, "dlm.doc", dlm_doc) | |
| 141 | - monkeypatch.setitem(sys.modules, "dlm.doc.parser", dlm_doc_parser) | |
| 142 | - monkeypatch.setitem(sys.modules, "dlm.store", dlm_store) | |
| 143 | - monkeypatch.setitem(sys.modules, "dlm.store.paths", dlm_store_paths) | |
| 144 | - monkeypatch.setitem(sys.modules, "dlm.base_models", dlm_base) | |
| 145 | - monkeypatch.setitem(sys.modules, "dlm.data", dlm_data) | |
| 146 | - monkeypatch.setitem(sys.modules, "dlm.data.instruction_parser", dlm_data_instr) | |
| 147 | - monkeypatch.setitem(sys.modules, "dlm.data.preference_parser", dlm_data_pref) | |
| 148 | - | |
| 149 | - # Return a path to a fake .dlm file (the parser won't actually read it). | |
| 150 | - dlm_file = tmp_path / "doc.dlm" | |
| 151 | - dlm_file.write_text("---\ndlm_id: 01TEST\n---\n\nbody\n", encoding="utf-8") | |
| 152 | - return dlm_file | |
| 153 | - | |
| 154 | - | |
| 155 | -def test_resolve_dlm_maps_sections(fake_dlm: Path) -> None: | |
| 156 | - from dlm_sway.integrations.dlm.resolver import resolve_dlm | |
| 157 | - | |
| 158 | - handle = resolve_dlm(fake_dlm) | |
| 159 | - assert handle.dlm_id == "01TESTULID" | |
| 160 | - assert handle.base_model == "HuggingFaceTB/SmolLM2-135M-Instruct" | |
| 161 | - assert handle.adapter_path is not None | |
| 162 | - assert handle.adapter_path.exists() | |
| 163 | - assert len(handle.sections) == 3 | |
| 164 | - # Kinds normalized from uppercase dlm enum values. | |
| 165 | - assert {s.kind for s in handle.sections} == {"prose", "instruction", "preference"} | |
| 166 | - # Instruction Q/A pair survived the translation. | |
| 167 | - instr = next(s for s in handle.sections if s.kind == "instruction") | |
| 168 | - assert instr.probes | |
| 169 | - assert instr.probes[0].prompt == "What is X?" | |
| 170 | - # Preference triple too. | |
| 171 | - pref = next(s for s in handle.sections if s.kind == "preference") | |
| 172 | - assert pref.preferences | |
| 173 | - assert pref.preferences[0].chosen == "good answer" | |
| 174 | - | |
| 175 | - | |
| 176 | -def test_resolve_without_dlm_installed(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: | |
| 177 | - """resolve_dlm surfaces a SwayError when the dlm package is missing.""" | |
| 178 | - # Wipe any cached dlm modules so the lazy import fails. | |
| 179 | - for mod in list(sys.modules): | |
| 180 | - if mod == "dlm" or mod.startswith("dlm."): | |
| 181 | - monkeypatch.delitem(sys.modules, mod, raising=False) | |
| 182 | - | |
| 183 | - import builtins | |
| 184 | - | |
| 185 | - real_import = builtins.__import__ | |
| 186 | - | |
| 187 | - def fake_import(name: str, *args, **kwargs): # type: ignore[no-untyped-def] | |
| 188 | - if name.startswith("dlm."): | |
| 189 | - raise ImportError("missing extra") | |
| 190 | - return real_import(name, *args, **kwargs) | |
| 191 | - | |
| 192 | - monkeypatch.setattr(builtins, "__import__", fake_import) | |
| 193 | - | |
| 194 | - from dlm_sway.core.errors import SwayError | |
| 195 | - from dlm_sway.integrations.dlm.resolver import resolve_dlm | |
| 196 | - | |
| 197 | - with pytest.raises(SwayError, match="dlm package not installed"): | |
| 198 | - resolve_dlm(tmp_path / "doc.dlm") | |
| 199 | - | |
| 200 | - | |
| 201 | -def test_autogen_writes_complete_suite(fake_dlm: Path, tmp_path: Path) -> None: | |
| 202 | - from dlm_sway.integrations.dlm.autogen import write_sway_yaml | |
| 203 | - | |
| 204 | - out = tmp_path / "sway.yaml" | |
| 205 | - write_sway_yaml(fake_dlm, out) | |
| 206 | - data = yaml.safe_load(out.read_text(encoding="utf-8")) | |
| 207 | - | |
| 208 | - assert data["version"] == 1 | |
| 209 | - assert data["models"]["base"]["base"] == "HuggingFaceTB/SmolLM2-135M-Instruct" | |
| 210 | - assert data["models"]["ft"]["adapter"] is not None | |
| 211 | - assert data["dlm_source"] == str(fake_dlm.resolve()) | |
| 212 | - | |
| 213 | - kinds = {entry["kind"] for entry in data["suite"]} | |
| 214 | - # The full 11-primitive battery is present (some kinds may be | |
| 215 | - # skipped when section data is absent, but here we have one of | |
| 216 | - # every section type). | |
| 217 | - expected = { | |
| 218 | - "null_adapter", | |
| 219 | - "delta_kl", | |
| 220 | - "adapter_revert", | |
| 221 | - "prompt_collapse", | |
| 222 | - "section_internalization", | |
| 223 | - "paraphrase_invariance", | |
| 224 | - "preference_flip", | |
| 225 | - "style_fingerprint", | |
| 226 | - "calibration_drift", | |
| 227 | - "leakage", | |
| 228 | - "adapter_ablation", | |
| 229 | - } | |
| 230 | - assert expected <= kinds, f"missing: {expected - kinds}" | |
| 231 | - | |
| 232 | - | |
| 233 | -def test_build_spec_dict_skips_preference_when_absent() -> None: | |
| 234 | - from dlm_sway.core.sections import Section | |
| 235 | - from dlm_sway.integrations.dlm.autogen import build_spec_dict | |
| 236 | - from dlm_sway.integrations.dlm.resolver import DlmHandle | |
| 237 | - | |
| 238 | - sections = ( | |
| 239 | - Section(id="a", kind="prose", content="A prose section. Second sentence."), | |
| 240 | - Section(id="b", kind="prose", content="Another prose section."), | |
| 241 | - ) | |
| 242 | - handle = DlmHandle( | |
| 243 | - dlm_id="x", | |
| 244 | - base_model="base", | |
| 245 | - adapter_path=Path("/tmp/adapter"), | |
| 246 | - sections=sections, | |
| 247 | - doc_text="whole document", | |
| 248 | - ) | |
| 249 | - spec = build_spec_dict(handle) | |
| 250 | - kinds = {entry["kind"] for entry in spec["suite"]} | |
| 251 | - assert "preference_flip" not in kinds | |
| 252 | - assert "section_internalization" in kinds | |
sway/tests/unit/test_errors.pydeleted@@ -1,55 +0,0 @@ | ||
| 1 | -"""Tests for the exception hierarchy.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -import pytest | |
| 6 | - | |
| 7 | -from dlm_sway.core.errors import ( | |
| 8 | - BackendNotAvailableError, | |
| 9 | - ProbeError, | |
| 10 | - SpecValidationError, | |
| 11 | - SwayError, | |
| 12 | -) | |
| 13 | - | |
| 14 | - | |
| 15 | -class TestSwayError: | |
| 16 | - def test_is_root_exception(self) -> None: | |
| 17 | - assert issubclass(SpecValidationError, SwayError) | |
| 18 | - assert issubclass(BackendNotAvailableError, SwayError) | |
| 19 | - assert issubclass(ProbeError, SwayError) | |
| 20 | - | |
| 21 | - def test_raised_and_caught_as_sway_error(self) -> None: | |
| 22 | - with pytest.raises(SwayError): | |
| 23 | - raise ProbeError("delta_kl", "shape mismatch") | |
| 24 | - | |
| 25 | - | |
| 26 | -class TestSpecValidationError: | |
| 27 | - def test_format_without_source(self) -> None: | |
| 28 | - err = SpecValidationError("unknown key 'topp'") | |
| 29 | - assert str(err) == "unknown key 'topp'" | |
| 30 | - assert err.source is None | |
| 31 | - | |
| 32 | - def test_format_with_source(self) -> None: | |
| 33 | - err = SpecValidationError("unknown key 'topp'", source="sway.yaml") | |
| 34 | - assert str(err) == "sway.yaml: unknown key 'topp'" | |
| 35 | - assert err.source == "sway.yaml" | |
| 36 | - | |
| 37 | - | |
| 38 | -class TestBackendNotAvailableError: | |
| 39 | - def test_hint_rendered_in_message(self) -> None: | |
| 40 | - err = BackendNotAvailableError("hf", extra="hf") | |
| 41 | - assert "pip install 'dlm-sway[hf]'" in str(err) | |
| 42 | - assert err.backend == "hf" | |
| 43 | - assert err.extra == "hf" | |
| 44 | - | |
| 45 | - def test_appends_optional_hint(self) -> None: | |
| 46 | - err = BackendNotAvailableError("mlx", extra="mlx", hint="Apple Silicon only.") | |
| 47 | - assert "Apple Silicon only." in str(err) | |
| 48 | - | |
| 49 | - | |
| 50 | -class TestProbeError: | |
| 51 | - def test_includes_probe_name(self) -> None: | |
| 52 | - err = ProbeError("delta_kl", "NaN logits") | |
| 53 | - assert "delta_kl" in str(err) | |
| 54 | - assert "NaN logits" in str(err) | |
| 55 | - assert err.probe == "delta_kl" | |
sway/tests/unit/test_model.pydeleted@@ -1,78 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.core.model`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from pathlib import Path | |
| 6 | - | |
| 7 | -import pytest | |
| 8 | -from pydantic import ValidationError | |
| 9 | - | |
| 10 | -from dlm_sway.core.model import LoadedModel, Model, ModelSpec | |
| 11 | - | |
| 12 | - | |
| 13 | -class TestModelSpec: | |
| 14 | - def test_defaults(self) -> None: | |
| 15 | - spec = ModelSpec(base="HuggingFaceTB/SmolLM2-135M-Instruct") | |
| 16 | - assert spec.kind == "hf" | |
| 17 | - assert spec.adapter is None | |
| 18 | - assert spec.dtype == "auto" | |
| 19 | - assert spec.device == "auto" | |
| 20 | - assert spec.trust_remote_code is False | |
| 21 | - assert spec.entry_point is None | |
| 22 | - | |
| 23 | - def test_frozen(self) -> None: | |
| 24 | - spec = ModelSpec(base="x") | |
| 25 | - with pytest.raises(ValidationError): | |
| 26 | - spec.base = "y" # type: ignore[misc] | |
| 27 | - | |
| 28 | - def test_extra_fields_forbidden(self) -> None: | |
| 29 | - with pytest.raises(ValidationError) as exc_info: | |
| 30 | - ModelSpec(base="x", bogus="y") # type: ignore[call-arg] | |
| 31 | - assert "bogus" in str(exc_info.value).lower() | |
| 32 | - | |
| 33 | - def test_kind_enum(self) -> None: | |
| 34 | - ModelSpec(base="x", kind="hf") | |
| 35 | - ModelSpec(base="x", kind="mlx") | |
| 36 | - ModelSpec(base="x", kind="dummy") | |
| 37 | - ModelSpec(base="x", kind="custom", entry_point="pkg.mod:Backend") | |
| 38 | - with pytest.raises(ValidationError): | |
| 39 | - ModelSpec(base="x", kind="ollama") # type: ignore[arg-type] | |
| 40 | - | |
| 41 | - def test_adapter_coerced_to_path(self) -> None: | |
| 42 | - spec = ModelSpec(base="x", adapter="/tmp/adapter") # type: ignore[arg-type] | |
| 43 | - assert isinstance(spec.adapter, Path) | |
| 44 | - | |
| 45 | - | |
| 46 | -class TestLoadedModel: | |
| 47 | - def test_frozen_dataclass(self) -> None: | |
| 48 | - loaded = LoadedModel( | |
| 49 | - id="base", | |
| 50 | - spec=ModelSpec(base="x"), | |
| 51 | - model=object(), | |
| 52 | - tokenizer=object(), | |
| 53 | - meta={"device": "cpu"}, | |
| 54 | - ) | |
| 55 | - assert loaded.id == "base" | |
| 56 | - assert loaded.meta["device"] == "cpu" | |
| 57 | - | |
| 58 | - | |
| 59 | -class TestModelProtocol: | |
| 60 | - def test_runtime_checkable(self) -> None: | |
| 61 | - class FakeModel: | |
| 62 | - id = "x" | |
| 63 | - | |
| 64 | - def generate( | |
| 65 | - self, | |
| 66 | - prompt: str, | |
| 67 | - *, | |
| 68 | - max_new_tokens: int, | |
| 69 | - temperature: float = 0.0, | |
| 70 | - top_p: float = 1.0, | |
| 71 | - seed: int = 0, | |
| 72 | - ) -> str: | |
| 73 | - return f"{prompt}|{max_new_tokens}" | |
| 74 | - | |
| 75 | - def close(self) -> None: | |
| 76 | - return None | |
| 77 | - | |
| 78 | - assert isinstance(FakeModel(), Model) | |
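``TestModelProtocol`` passes without ``FakeModel`` inheriting anything because ``Model`` is evidently a ``@runtime_checkable`` ``typing.Protocol``: ``isinstance`` then checks structurally for method presence, not signatures. A minimal sketch of that mechanism, using an illustrative ``Greeter`` protocol rather than the actual dlm_sway definition:

```python
from typing import Protocol, runtime_checkable


@runtime_checkable
class Greeter(Protocol):
    def greet(self, name: str) -> str: ...


class Impl:  # note: no inheritance from Greeter
    def greet(self, name: str) -> str:
        return f"hi {name}"


# isinstance() only verifies that the methods exist, not their signatures.
assert isinstance(Impl(), Greeter)
assert not isinstance(object(), Greeter)
```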
sway/tests/unit/test_null_calibration.pydeleted@@ -1,123 +0,0 @@ | ||
| 1 | -"""Tests for null-adapter calibration. | |
| 2 | - | |
| 3 | -Covers: the dummy backend's ``as_null_adapter`` yields a plausibly | |
| 4 | -noisy view; ``NullAdapterProbe`` populates ``ctx.null_stats`` so that | |
| 5 | -downstream probes pick it up end-to-end; and the missing-capability SKIP path. | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from __future__ import annotations | |
| 9 | - | |
| 10 | -import numpy as np | |
| 11 | - | |
| 12 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 13 | -from dlm_sway.core.result import Verdict | |
| 14 | -from dlm_sway.core.scoring import NullCalibratedBackend | |
| 15 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 16 | -from dlm_sway.suite.runner import run as run_suite | |
| 17 | -from dlm_sway.suite.spec import SwaySpec | |
| 18 | - | |
| 19 | - | |
| 20 | -def _diverging_backend() -> DummyDifferentialBackend: | |
| 21 | - base = DummyResponses() | |
| 22 | - ft = DummyResponses() | |
| 23 | - return DummyDifferentialBackend(base=base, ft=ft) | |
| 24 | - | |
| 25 | - | |
| 26 | -class TestProtocolConformance: | |
| 27 | - def test_dummy_is_null_calibrated(self) -> None: | |
| 28 | - assert isinstance(_diverging_backend(), NullCalibratedBackend) | |
| 29 | - | |
| 30 | - | |
| 31 | -class TestAsNullAdapter: | |
| 32 | - def test_yields_perturbed_view(self) -> None: | |
| 33 | - backend = _diverging_backend() | |
| 34 | - with backend.as_base() as base: | |
| 35 | - base_dist = base.next_token_dist("hello") | |
| 36 | - with backend.as_null_adapter(seed=0) as null: | |
| 37 | - null_dist = null.next_token_dist("hello") | |
| 38 | - # Some perturbation, but bounded. | |
| 39 | - assert not np.allclose(base_dist.logprobs, null_dist.logprobs) | |
| 40 | - | |
| 41 | - def test_different_seeds_yield_different_views(self) -> None: | |
| 42 | - backend = _diverging_backend() | |
| 43 | - with backend.as_null_adapter(seed=1) as v1: | |
| 44 | - d1 = v1.next_token_dist("hello") | |
| 45 | - with backend.as_null_adapter(seed=2) as v2: | |
| 46 | - d2 = v2.next_token_dist("hello") | |
| 47 | - assert not np.allclose(d1.logprobs, d2.logprobs) | |
| 48 | - | |
| 49 | - def test_view_exclusion_enforced(self) -> None: | |
| 50 | - import pytest | |
| 51 | - | |
| 52 | - backend = _diverging_backend() | |
| 53 | - with backend.as_null_adapter(seed=0), pytest.raises(RuntimeError): | |
| 54 | - with backend.as_base(): | |
| 55 | - pass | |
| 56 | - | |
| 57 | - | |
| 58 | -class TestProbe: | |
| 59 | - def test_populates_null_stats(self) -> None: | |
| 60 | - backend = _diverging_backend() | |
| 61 | - probe, spec = build_probe( | |
| 62 | - { | |
| 63 | - "name": "null", | |
| 64 | - "kind": "null_adapter", | |
| 65 | - "runs": 3, | |
| 66 | - "prompts": ["q1", "q2"], | |
| 67 | - } | |
| 68 | - ) | |
| 69 | - ctx = RunContext(backend=backend) | |
| 70 | - result = probe.run(spec, ctx) | |
| 71 | - assert result.verdict == Verdict.PASS | |
| 72 | - stats = result.evidence["null_stats"] | |
| 73 | - assert "delta_kl" in stats | |
| 74 | - assert stats["delta_kl"]["n"] == 3.0 | |
| 75 | - assert stats["delta_kl"]["std"] > 0.0 # seeded perturbations produce variance | |
| 76 | - | |
| 77 | - def test_runner_threads_null_stats_to_subsequent_probes(self) -> None: | |
| 78 | - """End-to-end: null_adapter first → delta_kl picks up z-score path.""" | |
| 79 | - backend = _diverging_backend() | |
| 80 | - raw_spec = SwaySpec.model_validate( | |
| 81 | - { | |
| 82 | - "version": 1, | |
| 83 | - "models": {"base": {"base": "b"}, "ft": {"base": "b", "adapter": "/tmp/a"}}, | |
| 84 | - "suite": [ | |
| 85 | - { | |
| 86 | - "name": "null", | |
| 87 | - "kind": "null_adapter", | |
| 88 | - "runs": 3, | |
| 89 | - "prompts": ["p1", "p2"], | |
| 90 | - }, | |
| 91 | - { | |
| 92 | - "name": "dk", | |
| 93 | - "kind": "delta_kl", | |
| 94 | - "prompts": ["p1", "p2"], | |
| 95 | - "assert_z_gte": -10.0, # permissive so we pass regardless | |
| 96 | - }, | |
| 97 | - ], | |
| 98 | - } | |
| 99 | - ) | |
| 100 | - result = run_suite(raw_spec, backend) | |
| 101 | - assert len(result.probes) == 2 | |
| 102 | - null_result = result.probes[0] | |
| 103 | - dk_result = result.probes[1] | |
| 104 | - assert null_result.verdict == Verdict.PASS | |
| 105 | - # The delta_kl probe should have computed a z_score because null_stats was present. | |
| 106 | - assert dk_result.z_score is not None, ( | |
| 107 | - "delta_kl should have z-scored against null baseline, got " | |
| 108 | - f"evidence={dk_result.evidence}, message={dk_result.message}" | |
| 109 | - ) | |
| 110 | - | |
| 111 | - def test_skip_when_backend_not_null_calibrated(self) -> None: | |
| 112 | - class _Bare: | |
| 113 | - def as_base(self): # noqa: ANN202 | |
| 114 | - raise NotImplementedError | |
| 115 | - | |
| 116 | - def as_finetuned(self): # noqa: ANN202 | |
| 117 | - raise NotImplementedError | |
| 118 | - | |
| 119 | - probe, spec = build_probe({"name": "null", "kind": "null_adapter"}) | |
| 120 | - ctx = RunContext(backend=_Bare()) # type: ignore[arg-type] | |
| 121 | - result = probe.run(spec, ctx) | |
| 122 | - assert result.verdict == Verdict.SKIP | |
| 123 | - assert "NullCalibratedBackend" in result.message | |
sway/tests/unit/test_probe_adapter_ablation.pydeleted@@ -1,135 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.adapter_ablation`. | |
| 2 | - | |
| 3 | -Uses the dummy backend's λ-interpolation implementation to exercise | |
| 4 | -the full probe path without loading a real model. | |
| 5 | -""" | |
| 6 | - | |
| 7 | -from __future__ import annotations | |
| 8 | - | |
| 9 | -import numpy as np | |
| 10 | - | |
| 11 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 12 | -from dlm_sway.core.result import Verdict | |
| 13 | -from dlm_sway.core.scoring import ScalableDifferentialBackend, TokenDist | |
| 14 | -from dlm_sway.probes.adapter_ablation import ( | |
| 15 | - _overshoot, | |
| 16 | - _r_squared, | |
| 17 | - _saturation_lambda, | |
| 18 | -) | |
| 19 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 20 | - | |
| 21 | - | |
| 22 | -class TestShapeMetrics: | |
| 23 | - def test_r_squared_perfect_linear(self) -> None: | |
| 24 | - x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64) | |
| 25 | - y = 2 * x + 0.1 | |
| 26 | - assert _r_squared(x, y) > 0.99 | |
| 27 | - | |
| 28 | - def test_r_squared_zero_slope_defined(self) -> None: | |
| 29 | - x = np.asarray([0.0, 0.5, 1.0], dtype=np.float64) | |
| 30 | - y = np.zeros_like(x) | |
| 31 | - # Flat y → ss_tot = 0 → defined as 1.0 (perfect fit). | |
| 32 | - assert _r_squared(x, y) == 1.0 | |
| 33 | - | |
| 34 | - def test_saturation_lambda_expected(self) -> None: | |
| 35 | - lambdas = np.asarray([0.0, 0.25, 0.5, 0.75, 1.0], dtype=np.float64) | |
| 36 | - divs = np.asarray([0.0, 0.5, 0.8, 0.95, 1.0], dtype=np.float64) | |
| 37 | - sat = _saturation_lambda(lambdas, divs) | |
| 38 | - assert sat == 0.75 # 0.95 / 1.0 = 0.95 ≥ 0.9 | |
| 39 | - | |
| 40 | - def test_overshoot_recovered(self) -> None: | |
| 41 | - lambdas = np.asarray([0.0, 0.5, 1.0, 1.25], dtype=np.float64) | |
| 42 | - divs = np.asarray([0.0, 0.5, 1.0, 1.15], dtype=np.float64) | |
| 43 | - assert _overshoot(lambdas, divs) == 1.15 | |
| 44 | - | |
| 45 | - | |
| 46 | -def _diverging_backend() -> DummyDifferentialBackend: | |
| 47 | - """Backend where base ≠ ft at a few prompts; distributions interpolate | |
| 48 | - smoothly under λ-blending in DummyDifferentialBackend.as_scaled_adapter.""" | |
| 49 | - base = DummyResponses( | |
| 50 | - token_dists={ | |
| 51 | - "q1": TokenDist( | |
| 52 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 53 | - logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)), | |
| 54 | - vocab_size=100, | |
| 55 | - ), | |
| 56 | - "q2": TokenDist( | |
| 57 | - token_ids=np.array([5, 6], dtype=np.int64), | |
| 58 | - logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)), | |
| 59 | - vocab_size=100, | |
| 60 | - ), | |
| 61 | - } | |
| 62 | - ) | |
| 63 | - ft = DummyResponses( | |
| 64 | - token_dists={ | |
| 65 | - "q1": TokenDist( | |
| 66 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 67 | - logprobs=np.log(np.array([0.2, 0.4, 0.4], dtype=np.float32)), | |
| 68 | - vocab_size=100, | |
| 69 | - ), | |
| 70 | - "q2": TokenDist( | |
| 71 | - token_ids=np.array([5, 6], dtype=np.int64), | |
| 72 | - logprobs=np.log(np.array([0.3, 0.7], dtype=np.float32)), | |
| 73 | - vocab_size=100, | |
| 74 | - ), | |
| 75 | - } | |
| 76 | - ) | |
| 77 | - return DummyDifferentialBackend(base=base, ft=ft) | |
| 78 | - | |
| 79 | - | |
| 80 | -class TestProbe: | |
| 81 | - def test_backend_implements_scalable_protocol(self) -> None: | |
| 82 | - backend = _diverging_backend() | |
| 83 | - assert isinstance(backend, ScalableDifferentialBackend) | |
| 84 | - | |
| 85 | - def test_probe_runs_and_emits_shape_metrics(self) -> None: | |
| 86 | - probe, spec = build_probe( | |
| 87 | - { | |
| 88 | - "name": "abl", | |
| 89 | - "kind": "adapter_ablation", | |
| 90 | - "prompts": ["q1", "q2"], | |
| 91 | - "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25], | |
| 92 | - # Very permissive to tolerate the log-space blend of a | |
| 93 | - # tiny synthetic fixture. | |
| 94 | - "assert_linearity_gte": 0.3, | |
| 95 | - "assert_overshoot_gte": 1.0, | |
| 96 | - } | |
| 97 | - ) | |
| 98 | - ctx = RunContext(backend=_diverging_backend()) | |
| 99 | - result = probe.run(spec, ctx) | |
| 100 | - assert result.verdict in (Verdict.PASS, Verdict.FAIL) | |
| 101 | - assert "lambdas" in result.evidence | |
| 102 | - assert "mean_divergence_per_lambda" in result.evidence | |
| 103 | - assert len(result.evidence["mean_divergence_per_lambda"]) == 6 | |
| 104 | - # Divergence should increase as λ grows from 0 toward ft. | |
| 105 | - divs = result.evidence["mean_divergence_per_lambda"] | |
| 106 | - # λ=0 → 0 divergence from itself. λ>0 should be non-decreasing | |
| 107 | - # for the bulk of the curve. | |
| 108 | - assert divs[-2] >= divs[0] | |
| 109 | - | |
| 110 | - def test_skip_when_backend_not_scalable(self) -> None: | |
| 111 | - class _NonScalable: | |
| 112 | - def as_base(self): # noqa: ANN202 | |
| 113 | - raise NotImplementedError | |
| 114 | - | |
| 115 | - def as_finetuned(self): # noqa: ANN202 | |
| 116 | - raise NotImplementedError | |
| 117 | - | |
| 118 | - probe, spec = build_probe( | |
| 119 | - { | |
| 120 | - "name": "abl", | |
| 121 | - "kind": "adapter_ablation", | |
| 122 | - "prompts": ["q1"], | |
| 123 | - } | |
| 124 | - ) | |
| 125 | - ctx = RunContext(backend=_NonScalable()) # type: ignore[arg-type] | |
| 126 | - result = probe.run(spec, ctx) | |
| 127 | - assert result.verdict == Verdict.SKIP | |
| 128 | - assert "ScalableDifferentialBackend" in result.message | |
| 129 | - | |
| 130 | - def test_error_on_empty_prompts(self) -> None: | |
| 131 | - backend = _diverging_backend() | |
| 132 | - probe, spec = build_probe({"name": "abl", "kind": "adapter_ablation", "prompts": []}) | |
| 133 | - ctx = RunContext(backend=backend) | |
| 134 | - result = probe.run(spec, ctx) | |
| 135 | - assert result.verdict == Verdict.ERROR | |
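The shape metrics these tests pin down can be reconstructed from the assertions alone; below is a hedged sketch of two of them (``_overshoot`` is underdetermined by its single test case and is left out). This is not the shipped code; the flat-curve conventions are taken straight from the test comments.

```python
import numpy as np


def r_squared(x: np.ndarray, y: np.ndarray) -> float:
    slope, intercept = np.polyfit(x, y, 1)
    ss_res = float(np.sum((y - (slope * x + intercept)) ** 2))
    ss_tot = float(np.sum((y - y.mean()) ** 2))
    if ss_tot == 0.0:
        return 1.0  # flat target: defined as a perfect fit, per the test
    return 1.0 - ss_res / ss_tot


def saturation_lambda(lambdas: np.ndarray, divs: np.ndarray) -> float:
    top = divs.max()
    if top <= 0.0:
        return float(lambdas[0])  # degenerate flat curve
    # Smallest lambda whose divergence reaches 90% of the maximum.
    return float(lambdas[np.argmax(divs / top >= 0.9)])


lams = np.array([0.0, 0.25, 0.5, 0.75, 1.0])
divs = np.array([0.0, 0.5, 0.8, 0.95, 1.0])
assert saturation_lambda(lams, divs) == 0.75  # 0.95/1.0 >= 0.9
```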
sway/tests/unit/test_probe_adapter_revert.pydeleted@@ -1,170 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.adapter_revert`. | |
| 2 | - | |
| 3 | -We stub out the embedder so these tests don't need sentence-transformers | |
| 4 | -installed. The ``probe.py`` SKIP path for the missing-extra case is | |
| 5 | -covered separately by monkeypatching the importer. | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from __future__ import annotations | |
| 9 | - | |
| 10 | -from typing import Any | |
| 11 | - | |
| 12 | -import numpy as np | |
| 13 | -import pytest | |
| 14 | - | |
| 15 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 16 | -from dlm_sway.core.result import Verdict | |
| 17 | -from dlm_sway.probes.adapter_revert import AdapterRevertProbe | |
| 18 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 19 | - | |
| 20 | - | |
| 21 | -def _backend(*, ft_like_base: bool = False) -> DummyDifferentialBackend: | |
| 22 | - base = DummyResponses( | |
| 23 | - generations={ | |
| 24 | - "pp1": "cats are mammals", | |
| 25 | - "pp2": "cats have fur", | |
| 26 | - } | |
| 27 | - ) | |
| 28 | - if ft_like_base: | |
| 29 | - ft_gens = dict(base.generations) | |
| 30 | - else: | |
| 31 | - ft_gens = { | |
| 32 | - "pp1": "dolphins are mammals", | |
| 33 | - "pp2": "dolphins are smart", | |
| 34 | - } | |
| 35 | - ft = DummyResponses(generations=ft_gens) | |
| 36 | - return DummyDifferentialBackend(base=base, ft=ft) | |
| 37 | - | |
| 38 | - | |
| 39 | -def _stub_embedder(text_to_vec: dict[str, np.ndarray]): # type: ignore[no-untyped-def] | |
| 40 | - def _encode(texts: list[str]): # type: ignore[no-untyped-def] | |
| 41 | - return np.stack([text_to_vec[t] for t in texts]) | |
| 42 | - | |
| 43 | - return _encode | |
| 44 | - | |
| 45 | - | |
| 46 | -@pytest.fixture | |
| 47 | -def monkeyed_embed(monkeypatch: pytest.MonkeyPatch) -> dict[str, np.ndarray]: | |
| 48 | - """Install a stub embedder with a controllable text→vec mapping. | |
| 49 | - | |
| 50 | - Tests populate the dict before calling ``probe.run()``. | |
| 51 | - """ | |
| 52 | - table: dict[str, np.ndarray] = {} | |
| 53 | - monkeypatch.setattr( | |
| 54 | - "dlm_sway.probes.adapter_revert._load_embedder", | |
| 55 | - lambda _model_id: _stub_embedder(table), # type: ignore[arg-type] | |
| 56 | - ) | |
| 57 | - return table | |
| 58 | - | |
| 59 | - | |
| 60 | -class TestAdapterRevert: | |
| 61 | - def test_healthy_adapter_passes(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | |
| 62 | - # gold and ft-outputs cluster together, base outputs cluster elsewhere. | |
| 63 | - monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0]) | |
| 64 | - monkeyed_embed["cats have fur"] = np.array([1.0, 0.0]) | |
| 65 | - monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0]) | |
| 66 | - monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0]) | |
| 67 | - monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0]) # gold | |
| 68 | - | |
| 69 | - probe, spec = build_probe( | |
| 70 | - { | |
| 71 | - "name": "rev", | |
| 72 | - "kind": "adapter_revert", | |
| 73 | - "cases": [ | |
| 74 | - { | |
| 75 | - "prompt": "anything", | |
| 76 | - "gold": "the answer is dolphins", | |
| 77 | - "paraphrases": ["pp1", "pp2"], | |
| 78 | - } | |
| 79 | - ], | |
| 80 | - "assert_revert_rate_lt": 0.25, | |
| 81 | - } | |
| 82 | - ) | |
| 83 | - ctx = RunContext(backend=_backend(ft_like_base=False)) | |
| 84 | - result = probe.run(spec, ctx) | |
| 85 | - assert result.verdict == Verdict.PASS | |
| 86 | - assert result.raw == 0.0 | |
| 87 | - | |
| 88 | - def test_reverting_adapter_fails(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | |
| 89 | - # ft matches base (reverted), diverges from gold. | |
| 90 | - monkeyed_embed["cats are mammals"] = np.array([1.0, 0.0]) | |
| 91 | - monkeyed_embed["cats have fur"] = np.array([1.0, 0.0]) | |
| 92 | - monkeyed_embed["the answer is dolphins"] = np.array([0.0, 1.0]) # gold | |
| 93 | - | |
| 94 | - probe, spec = build_probe( | |
| 95 | - { | |
| 96 | - "name": "rev", | |
| 97 | - "kind": "adapter_revert", | |
| 98 | - "cases": [ | |
| 99 | - { | |
| 100 | - "prompt": "anything", | |
| 101 | - "gold": "the answer is dolphins", | |
| 102 | - "paraphrases": ["pp1", "pp2"], | |
| 103 | - } | |
| 104 | - ], | |
| 105 | - } | |
| 106 | - ) | |
| 107 | - ctx = RunContext(backend=_backend(ft_like_base=True)) | |
| 108 | - result = probe.run(spec, ctx) | |
| 109 | - assert result.verdict == Verdict.FAIL | |
| 110 | - assert result.raw == 1.0 # 100% revert | |
| 111 | - | |
| 112 | - def test_trivially_similar_cases_dropped(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | |
| 113 | - # base and gold are identical → drop. | |
| 114 | - v = np.array([1.0, 0.0]) | |
| 115 | - monkeyed_embed["cats are mammals"] = v | |
| 116 | - monkeyed_embed["cats have fur"] = v | |
| 117 | - monkeyed_embed["dolphins are mammals"] = np.array([0.0, 1.0]) | |
| 118 | - monkeyed_embed["dolphins are smart"] = np.array([0.0, 1.0]) | |
| 119 | - monkeyed_embed["cats are mammals too"] = v # gold — matches base | |
| 120 | - | |
| 121 | - probe, spec = build_probe( | |
| 122 | - { | |
| 123 | - "name": "rev", | |
| 124 | - "kind": "adapter_revert", | |
| 125 | - "cases": [ | |
| 126 | - { | |
| 127 | - "prompt": "anything", | |
| 128 | - "gold": "cats are mammals too", | |
| 129 | - "paraphrases": ["pp1", "pp2"], | |
| 130 | - } | |
| 131 | - ], | |
| 132 | - } | |
| 133 | - ) | |
| 134 | - ctx = RunContext(backend=_backend(ft_like_base=False)) | |
| 135 | - result = probe.run(spec, ctx) | |
| 136 | - # Both paraphrase pairs trivially similar → WARN (no separable signal). | |
| 137 | - assert result.verdict == Verdict.WARN | |
| 138 | - assert result.evidence["dropped_trivial"] == 2 | |
| 139 | - | |
| 140 | - def test_no_cases_errors(self, monkeyed_embed: dict[str, np.ndarray]) -> None: | |
| 141 | - probe, spec = build_probe({"name": "rev", "kind": "adapter_revert", "cases": []}) | |
| 142 | - ctx = RunContext(backend=_backend()) | |
| 143 | - result = probe.run(spec, ctx) | |
| 144 | - assert result.verdict == Verdict.ERROR | |
| 145 | - | |
| 146 | - | |
| 147 | -class TestMissingSemsim: | |
| 148 | - def test_skip_when_sentence_transformers_missing(self, monkeypatch: pytest.MonkeyPatch) -> None: | |
| 149 | - from dlm_sway.core.errors import BackendNotAvailableError | |
| 150 | - | |
| 151 | - def raiser(_model_id: Any) -> Any: # type: ignore[no-untyped-def] | |
| 152 | - raise BackendNotAvailableError( | |
| 153 | - "adapter_revert", | |
| 154 | - extra="semsim", | |
| 155 | - hint="adapter_revert relies on sentence embeddings.", | |
| 156 | - ) | |
| 157 | - | |
| 158 | - monkeypatch.setattr( | |
| 159 | - "dlm_sway.probes.adapter_revert._load_embedder", | |
| 160 | - raiser, # type: ignore[arg-type] | |
| 161 | - ) | |
| 162 | - probe = AdapterRevertProbe() | |
| 163 | - spec = probe.spec_cls( | |
| 164 | - name="rev", | |
| 165 | - cases=[{"prompt": "x", "gold": "y", "paraphrases": ["pp1"]}], # type: ignore[list-item] | |
| 166 | - ) | |
| 167 | - ctx = RunContext(backend=_backend()) | |
| 168 | - result = probe.run(spec, ctx) | |
| 169 | - assert result.verdict == Verdict.SKIP | |
| 170 | - assert "semsim" in result.message | |
sway/tests/unit/test_probe_base.pydeleted@@ -1,69 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.base`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from typing import Literal | |
| 6 | - | |
| 7 | -import pytest | |
| 8 | - | |
| 9 | -from dlm_sway.core.errors import SpecValidationError | |
| 10 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 11 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, build_probe, registry | |
| 12 | - | |
| 13 | - | |
| 14 | -class _DummySpec(ProbeSpec): | |
| 15 | - kind: Literal["__test_dummy"] = "__test_dummy" | |
| 16 | - payload: str = "x" | |
| 17 | - | |
| 18 | - | |
| 19 | -class _DummyProbe(Probe): | |
| 20 | - kind = "__test_dummy" | |
| 21 | - spec_cls = _DummySpec | |
| 22 | - category = "adherence" | |
| 23 | - | |
| 24 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 25 | - assert isinstance(spec, _DummySpec) | |
| 26 | - return ProbeResult( | |
| 27 | - name=spec.name, | |
| 28 | - kind=spec.kind, | |
| 29 | - verdict=Verdict.PASS, | |
| 30 | - score=1.0, | |
| 31 | - message=spec.payload, | |
| 32 | - ) | |
| 33 | - | |
| 34 | - | |
| 35 | -class TestRegistry: | |
| 36 | - def test_autoregister(self) -> None: | |
| 37 | - assert "__test_dummy" in registry() | |
| 38 | - assert registry()["__test_dummy"] is _DummyProbe | |
| 39 | - | |
| 40 | - def test_duplicate_kind_rejected(self) -> None: | |
| 41 | - with pytest.raises(ValueError, match="duplicate probe kind"): | |
| 42 | - | |
| 43 | - class _Clash(Probe): | |
| 44 | - kind = "__test_dummy" | |
| 45 | - spec_cls = _DummySpec | |
| 46 | - | |
| 47 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 48 | - raise NotImplementedError | |
| 49 | - | |
| 50 | - | |
| 51 | -class TestBuildProbe: | |
| 52 | - def test_valid_entry(self) -> None: | |
| 53 | - probe, spec = build_probe({"name": "t", "kind": "__test_dummy", "payload": "hi"}) | |
| 54 | - assert isinstance(probe, _DummyProbe) | |
| 55 | - assert isinstance(spec, _DummySpec) | |
| 56 | - assert spec.payload == "hi" | |
| 57 | - | |
| 58 | - def test_unknown_kind(self) -> None: | |
| 59 | - with pytest.raises(SpecValidationError, match="unknown probe kind"): | |
| 60 | - build_probe({"name": "t", "kind": "no_such_kind"}) | |
| 61 | - | |
| 62 | - def test_missing_kind(self) -> None: | |
| 63 | - with pytest.raises(SpecValidationError, match="missing string 'kind'"): | |
| 64 | - build_probe({"name": "t"}) | |
| 65 | - | |
| 66 | - def test_extra_field_forbidden(self) -> None: | |
| 67 | - with pytest.raises(SpecValidationError) as exc_info: | |
| 68 | - build_probe({"name": "t", "kind": "__test_dummy", "bogus": "y"}) | |
| 69 | - assert "bogus" in str(exc_info.value).lower() | |
sway/tests/unit/test_probe_calibration_drift.pydeleted@@ -1,57 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.calibration_drift`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 6 | -from dlm_sway.core.result import Verdict | |
| 7 | -from dlm_sway.probes._calibration_pack import BUILT_IN_PACK | |
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 9 | - | |
| 10 | - | |
| 11 | -def _backend(delta_per_token: float) -> DummyDifferentialBackend: | |
| 12 | - """Apply a uniform per-token logprob delta across every item.""" | |
| 13 | - base_lp: dict[tuple[str, str], float] = {} | |
| 14 | - ft_lp: dict[tuple[str, str], float] = {} | |
| 15 | - for prompt, gold in BUILT_IN_PACK: | |
| 16 | - base_lp[(prompt, gold)] = -5.0 * max(len(gold) // 4, 1) | |
| 17 | - ft_lp[(prompt, gold)] = base_lp[(prompt, gold)] + delta_per_token * max(len(gold) // 4, 1) | |
| 18 | - return DummyDifferentialBackend( | |
| 19 | - base=DummyResponses(logprobs=base_lp), | |
| 20 | - ft=DummyResponses(logprobs=ft_lp), | |
| 21 | - ) | |
| 22 | - | |
| 23 | - | |
| 24 | -class TestCalibrationDrift: | |
| 25 | - def test_healthy_when_no_regression(self) -> None: | |
| 26 | - backend = _backend(delta_per_token=0.0) # no drift | |
| 27 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | |
| 28 | - ctx = RunContext(backend=backend) | |
| 29 | - result = probe.run(spec, ctx) | |
| 30 | - assert result.verdict == Verdict.PASS | |
| 31 | - assert result.raw == 0.0 # zero fraction regressed | |
| 32 | - | |
| 33 | - def test_fail_on_uniform_large_regression(self) -> None: | |
| 34 | - backend = _backend(delta_per_token=-2.0) # every item regresses | |
| 35 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | |
| 36 | - ctx = RunContext(backend=backend) | |
| 37 | - result = probe.run(spec, ctx) | |
| 38 | - assert result.verdict == Verdict.FAIL | |
| 39 | - assert result.raw == 1.0 | |
| 40 | - | |
| 41 | - def test_respects_items_limit(self) -> None: | |
| 42 | - backend = _backend(delta_per_token=0.0) | |
| 43 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift", "items_limit": 5}) | |
| 44 | - ctx = RunContext(backend=backend) | |
| 45 | - result = probe.run(spec, ctx) | |
| 46 | - assert result.evidence["total_items"] == 5 | |
| 47 | - | |
| 48 | - def test_worst_offenders_reported(self) -> None: | |
| 49 | - backend = _backend(delta_per_token=-2.0) | |
| 50 | - probe, spec = build_probe({"name": "c2", "kind": "calibration_drift"}) | |
| 51 | - ctx = RunContext(backend=backend) | |
| 52 | - result = probe.run(spec, ctx) | |
| 53 | - worst = result.evidence["worst_offenders"] | |
| 54 | - assert len(worst) <= 5 | |
| 55 | - # Each worst-offender record carries prompt/gold/delta fields. | |
| 56 | - if worst: | |
| 57 | - assert {"prompt", "gold", "delta"} <= set(worst[0].keys()) | |
sway/tests/unit/test_probe_delta_kl.pydeleted@@ -1,124 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.delta_kl`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -import numpy as np | |
| 6 | - | |
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 8 | -from dlm_sway.core.result import Verdict | |
| 9 | -from dlm_sway.core.scoring import TokenDist | |
| 10 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 11 | - | |
| 12 | - | |
| 13 | -def _diverging_backend() -> DummyDifferentialBackend: | |
| 14 | - """Base peaks tightly on token 1; ft is broad uniform. Real divergence.""" | |
| 15 | - base = DummyResponses( | |
| 16 | - token_dists={ | |
| 17 | - "q1": TokenDist( | |
| 18 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 19 | - logprobs=np.log(np.array([0.9, 0.05, 0.05], dtype=np.float32)), | |
| 20 | - vocab_size=100, | |
| 21 | - ), | |
| 22 | - "q2": TokenDist( | |
| 23 | - token_ids=np.array([5, 6], dtype=np.int64), | |
| 24 | - logprobs=np.log(np.array([0.8, 0.2], dtype=np.float32)), | |
| 25 | - vocab_size=100, | |
| 26 | - ), | |
| 27 | - } | |
| 28 | - ) | |
| 29 | - ft = DummyResponses( | |
| 30 | - token_dists={ | |
| 31 | - "q1": TokenDist( | |
| 32 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 33 | - logprobs=np.log(np.array([0.3, 0.35, 0.35], dtype=np.float32)), | |
| 34 | - vocab_size=100, | |
| 35 | - ), | |
| 36 | - "q2": TokenDist( | |
| 37 | - token_ids=np.array([5, 6], dtype=np.int64), | |
| 38 | - logprobs=np.log(np.array([0.4, 0.6], dtype=np.float32)), | |
| 39 | - vocab_size=100, | |
| 40 | - ), | |
| 41 | - } | |
| 42 | - ) | |
| 43 | - return DummyDifferentialBackend(base=base, ft=ft) | |
| 44 | - | |
| 45 | - | |
| 46 | -def _identical_backend() -> DummyDifferentialBackend: | |
| 47 | - dist = TokenDist( | |
| 48 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 49 | - logprobs=np.log(np.array([0.5, 0.3, 0.2], dtype=np.float32)), | |
| 50 | - vocab_size=100, | |
| 51 | - ) | |
| 52 | - base = DummyResponses(token_dists={"q1": dist}) | |
| 53 | - ft = DummyResponses(token_dists={"q1": dist}) | |
| 54 | - return DummyDifferentialBackend(base=base, ft=ft) | |
| 55 | - | |
| 56 | - | |
| 57 | -class TestDeltaKL: | |
| 58 | - def test_passes_when_distributions_diverge(self) -> None: | |
| 59 | - probe, spec = build_probe( | |
| 60 | - { | |
| 61 | - "name": "dk", | |
| 62 | - "kind": "delta_kl", | |
| 63 | - "prompts": ["q1", "q2"], | |
| 64 | - "assert_mean_gte": 0.01, | |
| 65 | - } | |
| 66 | - ) | |
| 67 | - ctx = RunContext(backend=_diverging_backend()) | |
| 68 | - result = probe.run(spec, ctx) | |
| 69 | - assert result.verdict == Verdict.PASS | |
| 70 | - assert result.raw is not None | |
| 71 | - assert result.raw > 0.01 | |
| 72 | - assert result.evidence["num_prompts"] == 2 | |
| 73 | - assert len(result.evidence["per_prompt"]) == 2 | |
| 74 | - | |
| 75 | - def test_fails_when_distributions_identical(self) -> None: | |
| 76 | - probe, spec = build_probe( | |
| 77 | - { | |
| 78 | - "name": "dk", | |
| 79 | - "kind": "delta_kl", | |
| 80 | - "prompts": ["q1"], | |
| 81 | - "assert_mean_gte": 0.01, | |
| 82 | - } | |
| 83 | - ) | |
| 84 | - ctx = RunContext(backend=_identical_backend()) | |
| 85 | - result = probe.run(spec, ctx) | |
| 86 | - assert result.verdict == Verdict.FAIL | |
| 87 | - assert result.raw == 0.0 | |
| 88 | - | |
| 89 | - def test_z_score_path_when_null_stats_present(self) -> None: | |
| 90 | - probe, spec = build_probe( | |
| 91 | - { | |
| 92 | - "name": "dk", | |
| 93 | - "kind": "delta_kl", | |
| 94 | - "prompts": ["q1"], | |
| 95 | - "assert_z_gte": 2.0, | |
| 96 | - } | |
| 97 | - ) | |
| 98 | - null_stats = {"delta_kl": {"mean": 0.01, "std": 0.01, "n": 3.0}} | |
| 99 | - ctx = RunContext(backend=_diverging_backend(), null_stats=null_stats) | |
| 100 | - result = probe.run(spec, ctx) | |
| 101 | - assert result.z_score is not None | |
| 102 | - # Our synthetic ft diverges ~0.1+, far above μ=0.01, σ=0.01 → huge z. | |
| 103 | - assert result.z_score > 2.0 | |
| 104 | - assert result.verdict == Verdict.PASS | |
| 105 | - | |
| 106 | - def test_error_on_empty_prompts(self) -> None: | |
| 107 | - probe, spec = build_probe({"name": "dk", "kind": "delta_kl", "prompts": []}) | |
| 108 | - ctx = RunContext(backend=_identical_backend()) | |
| 109 | - result = probe.run(spec, ctx) | |
| 110 | - assert result.verdict == Verdict.ERROR | |
| 111 | - | |
| 112 | - def test_kl_kind_available(self) -> None: | |
| 113 | - probe, spec = build_probe( | |
| 114 | - { | |
| 115 | - "name": "dk", | |
| 116 | - "kind": "delta_kl", | |
| 117 | - "prompts": ["q1"], | |
| 118 | - "divergence": "kl", | |
| 119 | - "assert_mean_gte": 0.0, | |
| 120 | - } | |
| 121 | - ) | |
| 122 | - ctx = RunContext(backend=_diverging_backend()) | |
| 123 | - result = probe.run(spec, ctx) | |
| 124 | - assert result.evidence["divergence_kind"] == "kl" | |
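The z-score branch tested above is plain standardization of the observed mean divergence against the null baseline carried in ``ctx.null_stats``:

```python
def z_score(observed: float, null_mean: float, null_std: float) -> float:
    # Standardize against the null-adapter noise floor; the zero-variance
    # guard is an assumption about how the probe avoids division by zero.
    if null_std <= 0.0:
        raise ValueError("null baseline has zero variance")
    return (observed - null_mean) / null_std


# With the test's null_stats (mean=0.01, std=0.01) and an observed
# divergence near 0.1, z lands around 9 -- far above the 2.0 gate.
assert z_score(0.1, 0.01, 0.01) > 2.0
```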
sway/tests/unit/test_probe_leakage.pydeleted@@ -1,109 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.leakage`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 6 | -from dlm_sway.core.result import Verdict | |
| 7 | -from dlm_sway.core.sections import Section | |
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 9 | -from dlm_sway.probes.leakage import _fragility, _lcs_ratio, _perturb | |
| 10 | - | |
| 11 | - | |
| 12 | -class TestLCS: | |
| 13 | - def test_identical_returns_one(self) -> None: | |
| 14 | - assert _lcs_ratio("abcdef", "abcdef") == 1.0 | |
| 15 | - | |
| 16 | - def test_disjoint_returns_low(self) -> None: | |
| 17 | - assert _lcs_ratio("abc", "xyz") < 0.3 | |
| 18 | - | |
| 19 | - def test_empty_returns_zero(self) -> None: | |
| 20 | - assert _lcs_ratio("", "abc") == 0.0 | |
| 21 | - | |
| 22 | - | |
| 23 | -class TestPerturb: | |
| 24 | - def test_typo_swaps_first_two(self) -> None: | |
| 25 | - assert _perturb("hello", "typo") == "ehllo" | |
| 26 | - | |
| 27 | - def test_case_flip_inverts_first_alpha(self) -> None: | |
| 28 | - assert _perturb("abc", "case_flip") == "Abc" | |
| 29 | - assert _perturb("ABC", "case_flip") == "aBC" | |
| 30 | - | |
| 31 | - def test_drop_punct_removes_punct(self) -> None: | |
| 32 | - assert _perturb("a, b. c!", "drop_punct") == "a b c" | |
| 33 | - | |
| 34 | - | |
| 35 | -class TestFragility: | |
| 36 | - def test_zero_when_clean_zero(self) -> None: | |
| 37 | - assert _fragility(0.0, 0.0) == 0.0 | |
| 38 | - | |
| 39 | - def test_expected_when_perturbed_dropped(self) -> None: | |
| 40 | - import pytest | |
| 41 | - | |
| 42 | - assert _fragility(0.8, 0.2) == pytest.approx(0.75) | |
| 43 | - | |
| 44 | - | |
| 45 | -def _prose_section(sid: str, content: str) -> Section: | |
| 46 | - return Section(id=sid, kind="prose", content=content) | |
| 47 | - | |
| 48 | - | |
| 49 | -def _backend(*, ft_recall: float, ft_perturbed_recall: float) -> tuple[DummyDifferentialBackend, str]: | |
| 50 | - """Build a backend whose ft generate() returns a controlled prefix of ``target``. | |
| 51 | - | |
| 52 | - The target is a fixed 256-char slice of the repeated prose content, | |
| 53 | - so we can measure LCS ratio against it deterministically. | |
| 54 | - """ | |
| 55 | - content = ("The capital of France is Paris. " * 30).strip() | |
| 56 | - # Generate a fraction of the target to hit the desired recall. | |
| 57 | - target = content[128 : 128 + 256] | |
| 58 | - ft_full = target[: int(ft_recall * len(target))] | |
| 59 | - ft_pert = target[: int(ft_perturbed_recall * len(target))] | |
| 60 | - | |
| 61 | - base = DummyResponses() | |
| 62 | - ft = DummyResponses( | |
| 63 | - generations={ | |
| 64 | - content[:128]: ft_full, | |
| 65 | - # perturbations of the first 128 chars hit these three: | |
| 66 | - **{_perturb(content[:128], p): ft_pert for p in ("typo", "case_flip", "drop_punct")}, | |
| 67 | - } | |
| 68 | - ) | |
| 69 | - return DummyDifferentialBackend(base=base, ft=ft), content | |
| 70 | - | |
| 71 | - | |
| 72 | -class TestProbe: | |
| 73 | - def test_skip_without_sections(self) -> None: | |
| 74 | - backend, _ = _backend(ft_recall=0.0, ft_perturbed_recall=0.0) | |
| 75 | - probe, spec = build_probe({"name": "c3", "kind": "leakage"}) | |
| 76 | - ctx = RunContext(backend=backend) | |
| 77 | - result = probe.run(spec, ctx) | |
| 78 | - assert result.verdict == Verdict.SKIP | |
| 79 | - | |
| 80 | - def test_pass_when_no_leak(self) -> None: | |
| 81 | - backend, content = _backend(ft_recall=0.0, ft_perturbed_recall=0.0) | |
| 82 | - probe, spec = build_probe( | |
| 83 | - { | |
| 84 | - "name": "c3", | |
| 85 | - "kind": "leakage", | |
| 86 | - "prefix_chars": 128, | |
| 87 | - "continuation_chars": 256, | |
| 88 | - } | |
| 89 | - ) | |
| 90 | - ctx = RunContext(backend=backend, sections=(_prose_section("a", content),)) | |
| 91 | - result = probe.run(spec, ctx) | |
| 92 | - assert result.verdict == Verdict.PASS | |
| 93 | - | |
| 94 | - def test_fail_when_strong_low_fragility_leak(self) -> None: | |
| 95 | - backend, content = _backend(ft_recall=0.95, ft_perturbed_recall=0.9) | |
| 96 | - probe, spec = build_probe( | |
| 97 | - { | |
| 98 | - "name": "c3", | |
| 99 | - "kind": "leakage", | |
| 100 | - "prefix_chars": 128, | |
| 101 | - "continuation_chars": 256, | |
| 102 | - "assert_recall_lt": 0.5, | |
| 103 | - "min_fragility": 0.3, | |
| 104 | - } | |
| 105 | - ) | |
| 106 | - ctx = RunContext(backend=backend, sections=(_prose_section("a", content),)) | |
| 107 | - result = probe.run(spec, ctx) | |
| 108 | - # High recall + low fragility → fail. | |
| 109 | - assert result.verdict == Verdict.FAIL | |
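Two of the leakage metrics are pinned down numerically by the unit tests: fragility is the relative recall drop under perturbation, and ``_lcs_ratio`` behaves like a longest-common-substring recall. A sketch using ``difflib`` for the string match; the shipped greedy LCS may differ in detail.

```python
from difflib import SequenceMatcher


def lcs_ratio(generated: str, target: str) -> float:
    # Longest contiguous common block, as a fraction of the target.
    if not generated or not target:
        return 0.0
    m = SequenceMatcher(None, generated, target)
    match = m.find_longest_match(0, len(generated), 0, len(target))
    return match.size / len(target)


def fragility(clean: float, perturbed: float) -> float:
    # Relative recall drop once the prefix is perturbed.
    if clean <= 0.0:
        return 0.0
    return (clean - perturbed) / clean


assert lcs_ratio("abcdef", "abcdef") == 1.0
assert abs(fragility(0.8, 0.2) - 0.75) < 1e-9  # matches TestFragility
```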
sway/tests/unit/test_probe_paraphrase_invariance.pydeleted@@ -1,91 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.paraphrase_invariance`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 6 | -from dlm_sway.core.result import Verdict | |
| 7 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 8 | - | |
| 9 | - | |
| 10 | -def _backend(*, par_lift_fraction: float, verb_lift: float = 10.0) -> DummyDifferentialBackend: | |
| 11 | - """Return a backend with tunable verbatim/paraphrase lifts. | |
| 12 | - | |
| 13 | - The ft view adds ``verb_lift`` nats to the verbatim (Q,A) logprob | |
| 14 | - and ``par_lift_fraction * verb_lift`` to paraphrase logprobs. | |
| 15 | - """ | |
| 16 | - base = DummyResponses( | |
| 17 | - logprobs={ | |
| 18 | - ("Q", "A"): -20.0, | |
| 19 | - ("Q_par1", "A"): -20.0, | |
| 20 | - ("Q_par2", "A"): -20.0, | |
| 21 | - } | |
| 22 | - ) | |
| 23 | - ft = DummyResponses( | |
| 24 | - logprobs={ | |
| 25 | - ("Q", "A"): -20.0 + verb_lift, | |
| 26 | - ("Q_par1", "A"): -20.0 + par_lift_fraction * verb_lift, | |
| 27 | - ("Q_par2", "A"): -20.0 + par_lift_fraction * verb_lift, | |
| 28 | - } | |
| 29 | - ) | |
| 30 | - return DummyDifferentialBackend(base=base, ft=ft) | |
| 31 | - | |
| 32 | - | |
| 33 | -def test_pass_when_generalizing() -> None: | |
| 34 | - # High paraphrase lift + high verbatim → healthy generalization. | |
| 35 | - backend = _backend(par_lift_fraction=0.9) | |
| 36 | - probe, spec = build_probe( | |
| 37 | - { | |
| 38 | - "name": "pi", | |
| 39 | - "kind": "paraphrase_invariance", | |
| 40 | - "intent": "generalize", | |
| 41 | - "min_verbatim_lift": 0.05, | |
| 42 | - "min_generalization_ratio": 0.5, | |
| 43 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1", "Q_par2"]}], | |
| 44 | - } | |
| 45 | - ) | |
| 46 | - ctx = RunContext(backend=backend) | |
| 47 | - result = probe.run(spec, ctx) | |
| 48 | - assert result.verdict == Verdict.PASS | |
| 49 | - assert result.raw is not None | |
| 50 | - assert result.raw >= 0.5 | |
| 51 | - | |
| 52 | - | |
| 53 | -def test_fails_when_only_memorized_but_intent_generalize() -> None: | |
| 54 | - backend = _backend(par_lift_fraction=0.0) | |
| 55 | - probe, spec = build_probe( | |
| 56 | - { | |
| 57 | - "name": "pi", | |
| 58 | - "kind": "paraphrase_invariance", | |
| 59 | - "intent": "generalize", | |
| 60 | - "min_verbatim_lift": 0.05, | |
| 61 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}], | |
| 62 | - } | |
| 63 | - ) | |
| 64 | - ctx = RunContext(backend=backend) | |
| 65 | - result = probe.run(spec, ctx) | |
| 66 | - assert result.verdict == Verdict.FAIL | |
| 67 | - | |
| 68 | - | |
| 69 | -def test_passes_memorize_intent_when_only_memorized() -> None: | |
| 70 | - backend = _backend(par_lift_fraction=0.0) | |
| 71 | - probe, spec = build_probe( | |
| 72 | - { | |
| 73 | - "name": "pi", | |
| 74 | - "kind": "paraphrase_invariance", | |
| 75 | - "intent": "memorize", | |
| 76 | - "min_verbatim_lift": 0.05, | |
| 77 | - "max_generalization_ratio_if_memorize": 0.3, | |
| 78 | - "cases": [{"prompt": "Q", "gold": "A", "paraphrases": ["Q_par1"]}], | |
| 79 | - } | |
| 80 | - ) | |
| 81 | - ctx = RunContext(backend=backend) | |
| 82 | - result = probe.run(spec, ctx) | |
| 83 | - assert result.verdict == Verdict.PASS | |
| 84 | - | |
| 85 | - | |
| 86 | -def test_error_on_empty_cases() -> None: | |
| 87 | - probe, spec = build_probe({"name": "pi", "kind": "paraphrase_invariance", "cases": []}) | |
| 88 | - backend = _backend(par_lift_fraction=0.9) | |
| 89 | - ctx = RunContext(backend=backend) | |
| 90 | - result = probe.run(spec, ctx) | |
| 91 | - assert result.verdict == Verdict.ERROR | |
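The memorize/generalize verdict above reduces to two quantities: the verbatim lift (fine-tuned minus base logprob of the gold answer on the original prompt) and the generalization ratio (mean paraphrase lift over verbatim lift). A sketch of that ratio, mirroring the fixture's arithmetic; the exact aggregation across multiple cases is an assumption.

```python
def generalization_ratio(verbatim_lift: float, paraphrase_lifts: list[float]) -> float:
    if verbatim_lift <= 0.0 or not paraphrase_lifts:
        return 0.0
    mean_par = sum(paraphrase_lifts) / len(paraphrase_lifts)
    return mean_par / verbatim_lift


# par_lift_fraction=0.9, verb_lift=10 -> ratio 0.9 >= 0.5 -> PASS.
assert generalization_ratio(10.0, [9.0, 9.0]) == 0.9
# par_lift_fraction=0.0 -> ratio 0.0: FAIL under intent="generalize",
# PASS under intent="memorize".
assert generalization_ratio(10.0, [0.0]) == 0.0
```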
sway/tests/unit/test_probe_preference_flip.pydeleted@@ -1,161 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.preference_flip`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 6 | -from dlm_sway.core.result import Verdict | |
| 7 | -from dlm_sway.core.sections import Section, SectionPreference | |
| 8 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 9 | - | |
| 10 | - | |
| 11 | -def _backend(pairs: list[tuple[str, str, str, float, float]]) -> DummyDifferentialBackend: | |
| 12 | - """``pairs`` = list of (prompt, chosen, rejected, base_margin, ft_margin). | |
| 13 | - | |
| 14 | - We distribute the margin half to the chosen and half (negative) to | |
| 15 | - the rejected, which is enough to make logprob_of(chosen)-logprob_of(rejected) | |
| 16 | - equal the requested margin. | |
| 17 | - """ | |
| 18 | - base_lp: dict[tuple[str, str], float] = {} | |
| 19 | - ft_lp: dict[tuple[str, str], float] = {} | |
| 20 | - for prompt, chosen, rejected, base_m, ft_m in pairs: | |
| 21 | - base_lp[(prompt, chosen)] = base_m / 2 | |
| 22 | - base_lp[(prompt, rejected)] = -base_m / 2 | |
| 23 | - ft_lp[(prompt, chosen)] = ft_m / 2 | |
| 24 | - ft_lp[(prompt, rejected)] = -ft_m / 2 | |
| 25 | - return DummyDifferentialBackend( | |
| 26 | - base=DummyResponses(logprobs=base_lp), | |
| 27 | - ft=DummyResponses(logprobs=ft_lp), | |
| 28 | - ) | |
| 29 | - | |
| 30 | - | |
| 31 | -def test_pass_when_base_wrong_flipped() -> None: | |
| 32 | - backend = _backend( | |
| 33 | - [ | |
| 34 | - ("p1", "good1", "bad1", -2.0, 2.0), # base wrong, ft flips | |
| 35 | - ("p2", "good2", "bad2", -1.5, 1.0), # base wrong, ft flips | |
| 36 | - ("p3", "good3", "bad3", -0.5, 0.8), # base wrong, ft flips | |
| 37 | - ("p4", "good4", "bad4", 1.0, 2.0), # base already right (no contribution) | |
| 38 | - ] | |
| 39 | - ) | |
| 40 | - triples = [ | |
| 41 | - {"prompt": p, "chosen": c, "rejected": r} | |
| 42 | - for p, c, r in [ | |
| 43 | - ("p1", "good1", "bad1"), | |
| 44 | - ("p2", "good2", "bad2"), | |
| 45 | - ("p3", "good3", "bad3"), | |
| 46 | - ("p4", "good4", "bad4"), | |
| 47 | - ] | |
| 48 | - ] | |
| 49 | - probe, spec = build_probe( | |
| 50 | - { | |
| 51 | - "name": "pf", | |
| 52 | - "kind": "preference_flip", | |
| 53 | - "triples": triples, | |
| 54 | - "assert_flip_rate_gte": 0.7, | |
| 55 | - "min_triples_for_decision": 3, | |
| 56 | - } | |
| 57 | - ) | |
| 58 | - ctx = RunContext(backend=backend) | |
| 59 | - result = probe.run(spec, ctx) | |
| 60 | - assert result.verdict == Verdict.PASS | |
| 61 | - assert result.raw == 1.0 # 3/3 flipped | |
| 62 | - | |
| 63 | - | |
| 64 | -def test_fail_when_base_wrong_not_flipped() -> None: | |
| 65 | - backend = _backend( | |
| 66 | - [ | |
| 67 | - ("p1", "good1", "bad1", -2.0, -1.5), # base wrong, ft still wrong | |
| 68 | - ("p2", "good2", "bad2", -1.5, -1.0), # base wrong, ft still wrong | |
| 69 | - ("p3", "good3", "bad3", -0.5, 0.8), # base wrong, ft flips | |
| 70 | - ] | |
| 71 | - ) | |
| 72 | - triples = [ | |
| 73 | - {"prompt": p, "chosen": c, "rejected": r} | |
| 74 | - for p, c, r in [ | |
| 75 | - ("p1", "good1", "bad1"), | |
| 76 | - ("p2", "good2", "bad2"), | |
| 77 | - ("p3", "good3", "bad3"), | |
| 78 | - ] | |
| 79 | - ] | |
| 80 | - probe, spec = build_probe( | |
| 81 | - { | |
| 82 | - "name": "pf", | |
| 83 | - "kind": "preference_flip", | |
| 84 | - "triples": triples, | |
| 85 | - "assert_flip_rate_gte": 0.7, | |
| 86 | - "min_triples_for_decision": 3, | |
| 87 | - } | |
| 88 | - ) | |
| 89 | - ctx = RunContext(backend=backend) | |
| 90 | - result = probe.run(spec, ctx) | |
| 91 | - assert result.verdict == Verdict.FAIL | |
| 92 | - assert result.raw is not None | |
| 93 | - assert result.raw < 0.7 | |
| 94 | - | |
| 95 | - | |
| 96 | -def test_skip_when_no_triples_anywhere() -> None: | |
| 97 | - probe, spec = build_probe({"name": "pf", "kind": "preference_flip"}) | |
| 98 | - backend = _backend([]) | |
| 99 | - ctx = RunContext(backend=backend) | |
| 100 | - result = probe.run(spec, ctx) | |
| 101 | - assert result.verdict == Verdict.SKIP | |
| 102 | - | |
| 103 | - | |
| 104 | -def test_warn_when_too_few_base_wrong() -> None: | |
| 105 | - backend = _backend( | |
| 106 | - [ | |
| 107 | - ("p1", "good1", "bad1", 1.0, 2.0), # base right | |
| 108 | - ("p2", "good2", "bad2", 0.5, 1.0), # base right | |
| 109 | - ("p3", "good3", "bad3", -0.5, 0.5), # base wrong | |
| 110 | - ] | |
| 111 | - ) | |
| 112 | - triples = [ | |
| 113 | - {"prompt": p, "chosen": c, "rejected": r} | |
| 114 | - for p, c, r in [ | |
| 115 | - ("p1", "good1", "bad1"), | |
| 116 | - ("p2", "good2", "bad2"), | |
| 117 | - ("p3", "good3", "bad3"), | |
| 118 | - ] | |
| 119 | - ] | |
| 120 | - probe, spec = build_probe( | |
| 121 | - { | |
| 122 | - "name": "pf", | |
| 123 | - "kind": "preference_flip", | |
| 124 | - "triples": triples, | |
| 125 | - "min_triples_for_decision": 3, | |
| 126 | - } | |
| 127 | - ) | |
| 128 | - ctx = RunContext(backend=backend) | |
| 129 | - result = probe.run(spec, ctx) | |
| 130 | - assert result.verdict == Verdict.WARN | |
| 131 | - | |
| 132 | - | |
| 133 | -def test_triples_pulled_from_sections() -> None: | |
| 134 | - pref_section = Section( | |
| 135 | - id="p1", | |
| 136 | - kind="preference", | |
| 137 | - content="...", | |
| 138 | - preferences=( | |
| 139 | - SectionPreference(prompt="q1", chosen="good", rejected="bad"), | |
| 140 | - SectionPreference(prompt="q2", chosen="good2", rejected="bad2"), | |
| 141 | - SectionPreference(prompt="q3", chosen="good3", rejected="bad3"), | |
| 142 | - ), | |
| 143 | - ) | |
| 144 | - backend = _backend( | |
| 145 | - [ | |
| 146 | - ("q1", "good", "bad", -1.0, 1.0), | |
| 147 | - ("q2", "good2", "bad2", -1.0, 1.0), | |
| 148 | - ("q3", "good3", "bad3", -1.0, 1.0), | |
| 149 | - ] | |
| 150 | - ) | |
| 151 | - probe, spec = build_probe( | |
| 152 | - { | |
| 153 | - "name": "pf", | |
| 154 | - "kind": "preference_flip", | |
| 155 | - "assert_flip_rate_gte": 0.7, | |
| 156 | - "min_triples_for_decision": 3, | |
| 157 | - } | |
| 158 | - ) | |
| 159 | - ctx = RunContext(backend=backend, sections=(pref_section,)) | |
| 160 | - result = probe.run(spec, ctx) | |
| 161 | - assert result.verdict == Verdict.PASS | |
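The margin split in `_backend` is worth sanity-checking once by hand; this is plain arithmetic restating its docstring, not dlm_sway code:

```python
# For a requested margin m, chosen gets +m/2 and rejected gets -m/2,
# so logprob(chosen) - logprob(rejected) recovers m exactly.
for m in (-2.0, -1.5, 1.0):
    chosen_lp, rejected_lp = m / 2, -m / 2
    assert chosen_lp - rejected_lp == m
```

A "flip" in these tests is then simply a pair whose base margin is negative while its fine-tuned margin is positive, which is why only p1..p3 count toward the rate in the first test.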
sway/tests/unit/test_probe_prompt_collapse.pydeleted@@ -1,137 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.prompt_collapse`. | |
| 2 | - | |
| 3 | -Uses a programmable dummy backend that serves different token dists | |
| 4 | -depending on how much stuffing prefix the prompt carries. That's the | |
| 5 | -cleanest way to simulate "divergence decays with context length" without | |
| 6 | -a real model. | |
| 7 | -""" | |
| 8 | - | |
| 9 | -from __future__ import annotations | |
| 10 | - | |
| 11 | -import numpy as np | |
| 12 | - | |
| 13 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 14 | -from dlm_sway.core.result import Verdict | |
| 15 | -from dlm_sway.core.scoring import TokenDist | |
| 16 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 17 | -from dlm_sway.probes.prompt_collapse import _fit_half_life | |
| 18 | - | |
| 19 | - | |
| 20 | -class TestFitHalfLife: | |
| 21 | - def test_exponential_recovered(self) -> None: | |
| 22 | - lengths = np.array([0.0, 100.0, 200.0, 300.0]) | |
| 23 | - # y = 1.0 * exp(-x / 100) | |
| 24 | - y = np.exp(-lengths / 100.0) | |
| 25 | - h = _fit_half_life(lengths, y) | |
| 26 | - assert h is not None | |
| 27 | - import math | |
| 28 | - | |
| 29 | - # True half-life = ln(2) * 100 ≈ 69.3 | |
| 30 | - assert abs(h - math.log(2.0) * 100.0) < 1e-6 | |
| 31 | - | |
| 32 | - def test_none_or_large_for_flat(self) -> None: | |
| 33 | - lengths = np.array([0.0, 100.0, 200.0]) | |
| 34 | - y = np.array([1e-10, 1e-10, 1e-10]) | |
| 35 | - h = _fit_half_life(lengths, y) | |
| 36 | - assert h is None or h > 0.0  # None or a huge half-life: both acceptable for flat input. | |
| 37 | - | |
| 38 | - def test_returns_none_for_increasing(self) -> None: | |
| 39 | - lengths = np.array([0.0, 100.0, 200.0]) | |
| 40 | - y = np.array([0.1, 0.3, 0.5]) | |
| 41 | - assert _fit_half_life(lengths, y) is None | |
| 42 | - | |
| 43 | - | |
| 44 | -def _programmed_backend(stuffing_sensitivity: float) -> DummyDifferentialBackend: | |
| 45 | - """Return a backend whose divergence decays with prompt length. | |
| 46 | - | |
| 47 | - ``stuffing_sensitivity`` controls how quickly the ft distribution | |
| 48 | - snaps back to base as prompt length grows; lower = healthier adapter. | |
| 49 | - """ | |
| 50 | - # ``np`` is already imported at module level; no local import needed. | |
| 51 | - | |
| 52 | - base_probs = np.array([0.5, 0.3, 0.2], dtype=np.float32) | |
| 53 | - | |
| 54 | - # A ``DummyResponses`` subclass that overrides the view's lookup | |
| 55 | - # path and switches dists on whether the stuffing prefix is present | |
| 56 | - # would also work, but it pushes probe-specific knowledge into the | |
| 57 | - # dummy backend and makes test failures harder to read. | |
| 58 | - | |
| 59 | - # Simpler: use explicit prompts at each expected length to seed the | |
| 60 | - # dicts. The probe prefixes stuffing, so the dummy sees the exact | |
| 61 | - # final prompt, and we pre-build dists for each prompt we expect | |
| 62 | - # to see. | |
| 63 | - | |
| 64 | - base = DummyResponses() | |
| 65 | - ft = DummyResponses() | |
| 66 | - | |
| 67 | - # Pre-generate the prompts the probe will query: each context length | |
| 68 | - # these tests use (0/256/512/1024), prefixed via ``_stuffing`` (~4 chars/tok). | |
| 69 | - from dlm_sway.probes.prompt_collapse import _stuffing | |
| 70 | - | |
| 71 | - for ctx_len in (0, 256, 512, 1024): | |
| 72 | - prefix = _stuffing(ctx_len) | |
| 73 | - for prompt in ("q1",): | |
| 74 | - key = prefix + prompt | |
| 75 | - # Base: always tight on token 1. | |
| 76 | - base.token_dists[key] = TokenDist( | |
| 77 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 78 | - logprobs=np.log(base_probs), | |
| 79 | - vocab_size=100, | |
| 80 | - ) | |
| 81 | - # FT: diverges at ctx=0, decays toward base with length. | |
| 82 | - decay = np.exp(-ctx_len * stuffing_sensitivity) | |
| 83 | - ft_probs = base_probs * (1.0 - decay) + np.array([0.1, 0.45, 0.45]) * decay | |
| 84 | - ft_probs = ft_probs / ft_probs.sum() | |
| 85 | - ft.token_dists[key] = TokenDist( | |
| 86 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 87 | - logprobs=np.log(ft_probs.astype(np.float32)), | |
| 88 | - vocab_size=100, | |
| 89 | - ) | |
| 90 | - return DummyDifferentialBackend(base=base, ft=ft) | |
| 91 | - | |
| 92 | - | |
| 93 | -class TestPromptCollapse: | |
| 94 | - def test_healthy_adapter_passes(self) -> None: | |
| 95 | - probe, spec = build_probe( | |
| 96 | - { | |
| 97 | - "name": "pc", | |
| 98 | - "kind": "prompt_collapse", | |
| 99 | - "prompts": ["q1"], | |
| 100 | - "context_lengths": [0, 256, 512, 1024], | |
| 101 | - "assert_half_life_tokens": 100, | |
| 102 | - } | |
| 103 | - ) | |
| 104 | - ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.001)) | |
| 105 | - result = probe.run(spec, ctx) | |
| 106 | - # Half-life should be well above 100 with slow decay. | |
| 107 | - assert result.verdict == Verdict.PASS | |
| 108 | - assert result.raw is not None | |
| 109 | - assert result.raw > 100 | |
| 110 | - | |
| 111 | - def test_collapsing_adapter_fails(self) -> None: | |
| 112 | - probe, spec = build_probe( | |
| 113 | - { | |
| 114 | - "name": "pc", | |
| 115 | - "kind": "prompt_collapse", | |
| 116 | - "prompts": ["q1"], | |
| 117 | - "context_lengths": [0, 256, 512, 1024], | |
| 118 | - "assert_half_life_tokens": 500, | |
| 119 | - } | |
| 120 | - ) | |
| 121 | - ctx = RunContext(backend=_programmed_backend(stuffing_sensitivity=0.02)) | |
| 122 | - result = probe.run(spec, ctx) | |
| 123 | - # Fast decay → short half-life → fail against 500-token threshold. | |
| 124 | - assert result.verdict == Verdict.FAIL | |
| 125 | - | |
| 126 | - def test_error_on_empty_prompts(self) -> None: | |
| 127 | - probe, spec = build_probe( | |
| 128 | - { | |
| 129 | - "name": "pc", | |
| 130 | - "kind": "prompt_collapse", | |
| 131 | - "prompts": [], | |
| 132 | - "context_lengths": [0, 256], | |
| 133 | - } | |
| 134 | - ) | |
| 135 | - ctx = RunContext(backend=_programmed_backend(0.001)) | |
| 136 | - result = probe.run(spec, ctx) | |
| 137 | - assert result.verdict == Verdict.ERROR | |
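For reference, the math behind `TestFitHalfLife.test_exponential_recovered`: if divergence follows y = a·exp(-x/τ), the half-life is τ·ln 2, and a log-linear least-squares fit recovers it. A sketch of that recovery (illustrative; `_fit_half_life`'s actual fitting strategy may differ):

```python
import math

import numpy as np

# y = exp(-x / tau) halves every tau * ln(2) units of x.
tau = 100.0
lengths = np.array([0.0, 100.0, 200.0, 300.0])
y = np.exp(-lengths / tau)

# Fit log y = -x / tau + c by least squares; the slope hands tau back.
slope, _intercept = np.polyfit(lengths, np.log(y), 1)
half_life = math.log(2.0) / -slope
assert abs(half_life - tau * math.log(2.0)) < 1e-6  # ~69.3, as in the test
```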
sway/tests/unit/test_probe_section_internalization.pydeleted@@ -1,94 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.section_internalization` (the flagship B1).""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -import numpy as np | |
| 6 | - | |
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 8 | -from dlm_sway.core.result import Verdict | |
| 9 | -from dlm_sway.core.scoring import RollingLogprob | |
| 10 | -from dlm_sway.core.sections import Section, SectionProbe | |
| 11 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 12 | - | |
| 13 | - | |
| 14 | -def _rolling(mean_lp: float, n: int = 10) -> RollingLogprob: | |
| 15 | - lp = np.full(n - 1, mean_lp, dtype=np.float32) | |
| 16 | - return RollingLogprob( | |
| 17 | - token_ids=np.arange(n, dtype=np.int64), | |
| 18 | - logprobs=lp, | |
| 19 | - num_tokens=n, | |
| 20 | - total_logprob=float(lp.sum()), | |
| 21 | - ) | |
| 22 | - | |
| 23 | - | |
| 24 | -def _section(sid: str, kind: str = "prose", content: str = "content", probes=()) -> Section: | |
| 25 | - return Section(id=sid, kind=kind, content=content, probes=tuple(probes)) # type: ignore[arg-type] | |
| 26 | - | |
| 27 | - | |
| 28 | -def test_skip_without_sections() -> None: | |
| 29 | - probe, spec = build_probe({"name": "sis", "kind": "section_internalization"}) | |
| 30 | - backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | |
| 31 | - ctx = RunContext(backend=backend) | |
| 32 | - result = probe.run(spec, ctx) | |
| 33 | - assert result.verdict == Verdict.SKIP | |
| 34 | - | |
| 35 | - | |
| 36 | -def test_skip_with_single_section() -> None: | |
| 37 | - probe, spec = build_probe({"name": "sis", "kind": "section_internalization"}) | |
| 38 | - backend = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | |
| 39 | - ctx = RunContext(backend=backend, sections=(_section("a"),)) | |
| 40 | - result = probe.run(spec, ctx) | |
| 41 | - assert result.verdict == Verdict.SKIP | |
| 42 | - | |
| 43 | - | |
| 44 | -def test_reports_per_section_evidence_for_distinct_lifts() -> None: | |
| 45 | - # Build a dummy backend where the ft is lower-PPL than base on both | |
| 46 | - # sections' content, with section "a" lifted far more than "b"; the | |
| 47 | - # leak check then yields per-section evidence we can assert on. | |
| 48 | - content_a = "aaa " * 10 | |
| 49 | - content_b = "bbb " * 10 | |
| 50 | - | |
| 51 | - base = DummyResponses(rolling={content_a: _rolling(-3.0), content_b: _rolling(-3.0)}) | |
| 52 | - ft = DummyResponses(rolling={content_a: _rolling(-1.0), content_b: _rolling(-2.5)}) | |
| 53 | - backend = DummyDifferentialBackend(base=base, ft=ft) | |
| 54 | - | |
| 55 | - sections = ( | |
| 56 | - _section("a", content=content_a), | |
| 57 | - _section("b", content=content_b), | |
| 58 | - ) | |
| 59 | - probe, spec = build_probe( | |
| 60 | - { | |
| 61 | - "name": "sis", | |
| 62 | - "kind": "section_internalization", | |
| 63 | - "per_section_threshold": 0.05, | |
| 64 | - } | |
| 65 | - ) | |
| 66 | - ctx = RunContext(backend=backend, sections=sections) | |
| 67 | - result = probe.run(spec, ctx) | |
| 68 | - assert result.verdict in (Verdict.PASS, Verdict.FAIL) | |
| 69 | - assert "per_section" in result.evidence | |
| 70 | - assert len(result.evidence["per_section"]) == 2 | |
| 71 | - | |
| 72 | - | |
| 73 | -def test_instruction_uses_logprob_of() -> None: | |
| 74 | - # Instruction sections contribute their probe Q/A pairs; feed | |
| 75 | - # logprobs so the ft view comes out cheaper than base. | |
| 76 | - probes_a = (SectionProbe(prompt="Qa", gold="Aa"),) | |
| 77 | - probes_b = (SectionProbe(prompt="Qb", gold="Ab"),) | |
| 78 | - base = DummyResponses(logprobs={("Qa", "Aa"): -10.0, ("Qb", "Ab"): -10.0}) | |
| 79 | - ft = DummyResponses(logprobs={("Qa", "Aa"): -3.0, ("Qb", "Ab"): -8.0}) | |
| 80 | - backend = DummyDifferentialBackend(base=base, ft=ft) | |
| 81 | - | |
| 82 | - sections = ( | |
| 83 | - _section("a", kind="instruction", content="...", probes=probes_a), | |
| 84 | - _section("b", kind="instruction", content="...", probes=probes_b), | |
| 85 | - ) | |
| 86 | - probe, spec = build_probe( | |
| 87 | - {"name": "sis", "kind": "section_internalization", "per_section_threshold": 0.05} | |
| 88 | - ) | |
| 89 | - ctx = RunContext(backend=backend, sections=sections) | |
| 90 | - result = probe.run(spec, ctx) | |
| 91 | - per = result.evidence["per_section"] | |
| 92 | - # Section A got much more lift than B, so effective_sis(a) > effective_sis(b). | |
| 93 | - sis_by_id = {row["section_id"]: row["effective_sis"] for row in per} | |
| 94 | - assert sis_by_id["a"] > sis_by_id["b"] | |
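A worked version of the per-section bookkeeping these tests assert on. The relative-NLL form of `own_lift` below is an assumption (it happens to match the evidence rows exercised by the visualization tests later in this change); the shipped probe may compute lifts differently:

```python
# Hypothetical per-section arithmetic, consistent with the evidence keys
# (base_nll, ft_nll, own_lift, leak_lift, effective_sis, passed).
base_nll, ft_nll = 3.0, 2.5
own_lift = (base_nll - ft_nll) / base_nll  # ~0.167: ft is cheaper on its own section
leak_lift = 0.02                           # lift observed on *other* sections' content
effective_sis = own_lift - leak_lift       # ~0.147
passed = effective_sis >= 0.05             # per_section_threshold from the spec
assert passed
```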
sway/tests/unit/test_probe_style_fingerprint.pydeleted@@ -1,115 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.probes.style_fingerprint`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -import numpy as np | |
| 6 | - | |
| 7 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 8 | -from dlm_sway.core.result import Verdict | |
| 9 | -from dlm_sway.probes.base import RunContext, build_probe | |
| 10 | -from dlm_sway.probes.style_fingerprint import fingerprint | |
| 11 | - | |
| 12 | - | |
| 13 | -class TestFingerprint: | |
| 14 | - def test_zero_vector_for_empty(self) -> None: | |
| 15 | - fp = fingerprint("") | |
| 16 | - assert fp.shape == (6,) | |
| 17 | - assert np.allclose(fp, 0.0) | |
| 18 | - | |
| 19 | - def test_non_zero_for_normal_text(self) -> None: | |
| 20 | - fp = fingerprint("This is a sentence. This is another one. A third.") | |
| 21 | - assert fp.shape == (6,) | |
| 22 | - assert fp[0] > 0 # mean sentence length | |
| 23 | - assert fp[2] > 0 # TTR | |
| 24 | - assert fp[3] > 0 # avg word length | |
| 25 | - | |
| 26 | - def test_distinct_styles_distinct_fingerprints(self) -> None: | |
| 27 | - terse = "Go. Now. Quick." | |
| 28 | - verbose = ( | |
| 29 | - "We must, with all deliberate speed and measured consideration, " | |
| 30 | - "proceed expeditiously towards the elaborated and carefully " | |
| 31 | - "constructed resolution of the foregoing matter." | |
| 32 | - ) | |
| 33 | - assert not np.allclose(fingerprint(terse), fingerprint(verbose)) | |
| 34 | - | |
| 35 | - | |
| 36 | -def _backend_with_samples(base: list[str], ft: list[str]) -> DummyDifferentialBackend: | |
| 37 | - return DummyDifferentialBackend( | |
| 38 | - base=DummyResponses(generations={f"p{i}": s for i, s in enumerate(base)}), | |
| 39 | - ft=DummyResponses(generations={f"p{i}": s for i, s in enumerate(ft)}), | |
| 40 | - ) | |
| 41 | - | |
| 42 | - | |
| 43 | -class TestProbe: | |
| 44 | - def test_pass_when_ft_drifts_toward_doc(self) -> None: | |
| 45 | - base_samples = ["Short. Plain. Words."] * 2 | |
| 46 | - ft_samples = [ | |
| 47 | - "Wherein many clauses conjoin themselves, through extended " | |
| 48 | - "ruminations, unto a meandering whole of considerable length." | |
| 49 | - ] * 2 | |
| 50 | - doc = ( | |
| 51 | - "Wherein many clauses conjoin themselves, through extended " | |
| 52 | - "ruminations, unto a meandering whole of considerable length. " | |
| 53 | - "Further elaboration, no less copious, follows apace." | |
| 54 | - ) | |
| 55 | - backend = _backend_with_samples(base_samples, ft_samples) | |
| 56 | - probe, spec = build_probe( | |
| 57 | - { | |
| 58 | - "name": "c1", | |
| 59 | - "kind": "style_fingerprint", | |
| 60 | - "prompts": ["p0", "p1"], | |
| 61 | - "doc_reference": doc, | |
| 62 | - "max_new_tokens": 32, | |
| 63 | - "assert_shift_gte": 0.2, | |
| 64 | - } | |
| 65 | - ) | |
| 66 | - ctx = RunContext(backend=backend) | |
| 67 | - result = probe.run(spec, ctx) | |
| 68 | - assert result.verdict == Verdict.PASS | |
| 69 | - assert result.raw is not None | |
| 70 | - assert result.raw > 0.2 | |
| 71 | - | |
| 72 | - def test_fail_when_no_stylistic_shift(self) -> None: | |
| 73 | - base_samples = ["Short. Plain. Words."] * 2 | |
| 74 | - ft_samples = ["Short. Plain. Words."] * 2 | |
| 75 | - doc = "Wherein clauses conjoin into meandering wholes of length." | |
| 76 | - backend = _backend_with_samples(base_samples, ft_samples) | |
| 77 | - probe, spec = build_probe( | |
| 78 | - { | |
| 79 | - "name": "c1", | |
| 80 | - "kind": "style_fingerprint", | |
| 81 | - "prompts": ["p0", "p1"], | |
| 82 | - "doc_reference": doc, | |
| 83 | - "assert_shift_gte": 0.25, | |
| 84 | - } | |
| 85 | - ) | |
| 86 | - ctx = RunContext(backend=backend) | |
| 87 | - result = probe.run(spec, ctx) | |
| 88 | - assert result.verdict == Verdict.FAIL | |
| 89 | - | |
| 90 | - def test_skip_without_doc_reference(self) -> None: | |
| 91 | - backend = _backend_with_samples(["x"], ["y"]) | |
| 92 | - probe, spec = build_probe( | |
| 93 | - { | |
| 94 | - "name": "c1", | |
| 95 | - "kind": "style_fingerprint", | |
| 96 | - "prompts": ["p0"], | |
| 97 | - } | |
| 98 | - ) | |
| 99 | - ctx = RunContext(backend=backend) | |
| 100 | - result = probe.run(spec, ctx) | |
| 101 | - assert result.verdict == Verdict.SKIP | |
| 102 | - | |
| 103 | - def test_error_on_empty_prompts(self) -> None: | |
| 104 | - backend = _backend_with_samples([], []) | |
| 105 | - probe, spec = build_probe( | |
| 106 | - { | |
| 107 | - "name": "c1", | |
| 108 | - "kind": "style_fingerprint", | |
| 109 | - "prompts": [], | |
| 110 | - "doc_reference": "doc", | |
| 111 | - } | |
| 112 | - ) | |
| 113 | - ctx = RunContext(backend=backend) | |
| 114 | - result = probe.run(spec, ctx) | |
| 115 | - assert result.verdict == Verdict.ERROR | |
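The fingerprint tests constrain three of the six dimensions: index 0 behaves like mean sentence length, index 2 like a type-token ratio, and index 3 like mean word length. A toy stand-in that satisfies those constraints (everything here, including the sentence splitting, is an assumption rather than the shipped `fingerprint`):

```python
import numpy as np


def toy_fingerprint(text: str) -> np.ndarray:
    """Toy 6-dim stylistic vector; zeros for empty input, as the tests require."""
    fp = np.zeros(6, dtype=np.float64)
    words = text.split()
    if not words:
        return fp
    sentences = [s for s in text.replace("!", ".").replace("?", ".").split(".") if s.strip()]
    fp[0] = len(words) / max(len(sentences), 1)           # mean sentence length
    fp[2] = len({w.lower() for w in words}) / len(words)  # type-token ratio
    fp[3] = sum(len(w) for w in words) / len(words)       # mean word length
    return fp


assert np.allclose(toy_fingerprint(""), 0.0)
assert toy_fingerprint("Go. Now. Quick.")[0] > 0
```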
sway/tests/unit/test_result.pydeleted@@ -1,82 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.core.result`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from dataclasses import FrozenInstanceError | |
| 6 | - | |
| 7 | -import pytest | |
| 8 | - | |
| 9 | -from dlm_sway.core.result import ( | |
| 10 | - DEFAULT_COMPONENT_WEIGHTS, | |
| 11 | - ProbeResult, | |
| 12 | - SuiteResult, | |
| 13 | - SwayScore, | |
| 14 | - Verdict, | |
| 15 | - utcnow, | |
| 16 | -) | |
| 17 | - | |
| 18 | - | |
| 19 | -class TestVerdict: | |
| 20 | - def test_is_str_enum(self) -> None: | |
| 21 | - assert Verdict.PASS.value == "pass" | |
| 22 | - assert str(Verdict.WARN.value) == "warn" | |
| 23 | - | |
| 24 | - def test_all_expected_members(self) -> None: | |
| 25 | - assert {v.value for v in Verdict} == { | |
| 26 | - "pass", | |
| 27 | - "fail", | |
| 28 | - "warn", | |
| 29 | - "skip", | |
| 30 | - "error", | |
| 31 | - } | |
| 32 | - | |
| 33 | - | |
| 34 | -class TestProbeResult: | |
| 35 | - def test_minimum_construction(self) -> None: | |
| 36 | - r = ProbeResult(name="t", kind="delta_kl", verdict=Verdict.PASS, score=0.82) | |
| 37 | - assert r.raw is None | |
| 38 | - assert r.evidence == {} | |
| 39 | - assert r.message == "" | |
| 40 | - assert r.duration_s == 0.0 | |
| 41 | - | |
| 42 | - def test_frozen(self) -> None: | |
| 43 | - r = ProbeResult(name="t", kind="t", verdict=Verdict.PASS, score=0.5) | |
| 44 | - with pytest.raises(FrozenInstanceError): | |
| 45 | - r.score = 0.6 # type: ignore[misc] | |
| 46 | - | |
| 47 | - | |
| 48 | -class TestSuiteResult: | |
| 49 | - def test_wall_seconds(self) -> None: | |
| 50 | - from datetime import timedelta | |
| 51 | - | |
| 52 | - started = utcnow() | |
| 53 | - finished = started + timedelta(seconds=2, milliseconds=500) | |
| 54 | - result = SuiteResult( | |
| 55 | - spec_path="sway.yaml", | |
| 56 | - started_at=started, | |
| 57 | - finished_at=finished, | |
| 58 | - base_model_id="b", | |
| 59 | - adapter_id="a", | |
| 60 | - sway_version="0.1.0.dev0", | |
| 61 | - ) | |
| 62 | - assert result.wall_seconds == pytest.approx(2.5, abs=1e-6) | |
| 63 | - | |
| 64 | - | |
| 65 | -class TestSwayScore: | |
| 66 | - def test_default_weights_sum_to_one(self) -> None: | |
| 67 | - assert abs(sum(DEFAULT_COMPONENT_WEIGHTS.values()) - 1.0) < 1e-9 | |
| 68 | - | |
| 69 | - def test_band_boundaries(self) -> None: | |
| 70 | - assert SwayScore.band_for(0.0) == "noise" | |
| 71 | - assert SwayScore.band_for(0.29) == "noise" | |
| 72 | - assert SwayScore.band_for(0.30) == "partial" | |
| 73 | - assert SwayScore.band_for(0.59) == "partial" | |
| 74 | - assert SwayScore.band_for(0.60) == "healthy" | |
| 75 | - assert SwayScore.band_for(0.85) == "healthy" | |
| 76 | - assert SwayScore.band_for(0.851) == "suspicious" | |
| 77 | - assert SwayScore.band_for(0.99) == "suspicious" | |
| 78 | - | |
| 79 | - | |
| 80 | -def test_utcnow_is_tz_aware() -> None: | |
| 81 | - now = utcnow() | |
| 82 | - assert now.tzinfo is not None | |
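The eight boundary asserts pin down where each band starts and ends. One comparison layout that satisfies all of them (inferred from the tests, not necessarily `SwayScore.band_for`'s code):

```python
def band_for(score: float) -> str:
    # Bands inferred from the boundary tests: [0, 0.30) noise,
    # [0.30, 0.60) partial, [0.60, 0.85] healthy, (0.85, 1] suspicious.
    if score < 0.30:
        return "noise"
    if score < 0.60:
        return "partial"
    if score <= 0.85:
        return "healthy"
    return "suspicious"


assert band_for(0.85) == "healthy"
assert band_for(0.851) == "suspicious"
```

Note that the top band is "suspicious" rather than a superlative: a near-perfect composite is itself treated as a red flag.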
sway/tests/unit/test_scoring.pydeleted@@ -1,84 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.core.scoring`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -import math | |
| 6 | - | |
| 7 | -import numpy as np | |
| 8 | - | |
| 9 | -from dlm_sway.core.scoring import ( | |
| 10 | - DifferentialBackend, | |
| 11 | - RollingLogprob, | |
| 12 | - ScoringBackend, | |
| 13 | - TokenDist, | |
| 14 | -) | |
| 15 | - | |
| 16 | - | |
| 17 | -class TestRollingLogprob: | |
| 18 | - def test_empty_sequence(self) -> None: | |
| 19 | - r = RollingLogprob( | |
| 20 | - token_ids=np.array([42], dtype=np.int64), | |
| 21 | - logprobs=np.array([], dtype=np.float32), | |
| 22 | - num_tokens=1, | |
| 23 | - total_logprob=0.0, | |
| 24 | - ) | |
| 25 | - assert r.mean_logprob == 0.0 | |
| 26 | - assert r.perplexity == 1.0 | |
| 27 | - | |
| 28 | - def test_mean_and_perplexity(self) -> None: | |
| 29 | - # Three tokens, two transition logprobs summing to -4.0 → mean -2.0. | |
| 30 | - r = RollingLogprob( | |
| 31 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 32 | - logprobs=np.array([-1.5, -2.5], dtype=np.float32), | |
| 33 | - num_tokens=3, | |
| 34 | - total_logprob=-4.0, | |
| 35 | - ) | |
| 36 | - assert math.isclose(r.mean_logprob, -2.0, rel_tol=1e-6) | |
| 37 | - assert math.isclose(r.perplexity, math.exp(2.0), rel_tol=1e-6) | |
| 38 | - | |
| 39 | - | |
| 40 | -class TestTokenDist: | |
| 41 | - def test_construction_and_defaults(self) -> None: | |
| 42 | - dist = TokenDist( | |
| 43 | - token_ids=np.array([1, 2, 3], dtype=np.int64), | |
| 44 | - logprobs=np.array([-0.1, -1.0, -3.0], dtype=np.float32), | |
| 45 | - vocab_size=50_257, | |
| 46 | - ) | |
| 47 | - assert dist.tail_logprob == 0.0 | |
| 48 | - assert dist.token_ids.shape == (3,) | |
| 49 | - | |
| 50 | - | |
| 51 | -class TestProtocols: | |
| 52 | - def test_scoring_backend_runtime_checkable(self) -> None: | |
| 53 | - class FakeScoring: | |
| 54 | - def logprob_of(self, prompt: str, completion: str) -> float: | |
| 55 | - return 0.0 | |
| 56 | - | |
| 57 | - def rolling_logprob(self, text: str) -> RollingLogprob: | |
| 58 | - return RollingLogprob( | |
| 59 | - token_ids=np.array([0], dtype=np.int64), | |
| 60 | - logprobs=np.array([], dtype=np.float32), | |
| 61 | - num_tokens=1, | |
| 62 | - total_logprob=0.0, | |
| 63 | - ) | |
| 64 | - | |
| 65 | - def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist: | |
| 66 | - return TokenDist( | |
| 67 | - token_ids=np.array([0], dtype=np.int64), | |
| 68 | - logprobs=np.array([0.0], dtype=np.float32), | |
| 69 | - vocab_size=1, | |
| 70 | - ) | |
| 71 | - | |
| 72 | - assert isinstance(FakeScoring(), ScoringBackend) | |
| 73 | - | |
| 74 | - def test_differential_backend_runtime_checkable(self) -> None: | |
| 75 | - from contextlib import nullcontext | |
| 76 | - | |
| 77 | - class FakeDiff: | |
| 78 | - def as_base(self): # type: ignore[no-untyped-def] | |
| 79 | - return nullcontext(object()) | |
| 80 | - | |
| 81 | - def as_finetuned(self): # type: ignore[no-untyped-def] | |
| 82 | - return nullcontext(object()) | |
| 83 | - | |
| 84 | - assert isinstance(FakeDiff(), DifferentialBackend) | |
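The invariant behind `test_mean_and_perplexity`, restated standalone: N tokens yield N-1 transition logprobs, the mean averages those, and perplexity is the exponential of the negated mean.

```python
import math

# Three tokens -> two transition logprobs.
logprobs = [-1.5, -2.5]
mean_lp = sum(logprobs) / len(logprobs)  # -2.0
perplexity = math.exp(-mean_lp)          # e^2, roughly 7.389
assert math.isclose(perplexity, math.exp(2.0))
```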
sway/tests/unit/test_sections.pydeleted@@ -1,35 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.core.sections`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from dlm_sway.core.sections import ( | |
| 6 | - Section, | |
| 7 | - SectionPreference, | |
| 8 | - SectionProbe, | |
| 9 | - filter_kinds, | |
| 10 | -) | |
| 11 | - | |
| 12 | - | |
| 13 | -def test_default_field_types() -> None: | |
| 14 | - s = Section(id="abc", kind="prose", content="hello world") | |
| 15 | - assert s.probes == () | |
| 16 | - assert s.preferences == () | |
| 17 | - assert s.tag is None | |
| 18 | - | |
| 19 | - | |
| 20 | -def test_filter_kinds() -> None: | |
| 21 | - sections = ( | |
| 22 | - Section(id="a", kind="prose", content="x"), | |
| 23 | - Section(id="b", kind="instruction", content="y"), | |
| 24 | - Section(id="c", kind="preference", content="z"), | |
| 25 | - ) | |
| 26 | - only_prose = filter_kinds(sections, ("prose",)) | |
| 27 | - assert len(only_prose) == 1 | |
| 28 | - assert only_prose[0].id == "a" | |
| 29 | - | |
| 30 | - | |
| 31 | -def test_section_probe_and_preference() -> None: | |
| 32 | - p = SectionProbe(prompt="Q", gold="A") | |
| 33 | - assert p.prompt == "Q" | |
| 34 | - pref = SectionPreference(prompt="P", chosen="good", rejected="bad") | |
| 35 | - assert pref.chosen == "good" | |
sway/tests/unit/test_suite_runner.pydeleted@@ -1,134 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.suite.runner`. | |
| 2 | - | |
| 3 | -Uses the dummy backend + ad-hoc probe classes so nothing real is loaded. | |
| 4 | -""" | |
| 5 | - | |
| 6 | -from __future__ import annotations | |
| 7 | - | |
| 8 | -from typing import Literal | |
| 9 | - | |
| 10 | -import pytest | |
| 11 | - | |
| 12 | -from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses | |
| 13 | -from dlm_sway.core.errors import ProbeError | |
| 14 | -from dlm_sway.core.result import ProbeResult, Verdict | |
| 15 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 16 | -from dlm_sway.suite.runner import run | |
| 17 | -from dlm_sway.suite.spec import SwaySpec | |
| 18 | - | |
| 19 | - | |
| 20 | -class _PassSpec(ProbeSpec): | |
| 21 | - kind: Literal["__runner_pass"] = "__runner_pass" | |
| 22 | - | |
| 23 | - | |
| 24 | -class _PassProbe(Probe): | |
| 25 | - kind = "__runner_pass" | |
| 26 | - spec_cls = _PassSpec | |
| 27 | - category = "adherence" | |
| 28 | - | |
| 29 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 30 | - return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.PASS, score=0.9) | |
| 31 | - | |
| 32 | - | |
| 33 | -class _FailSpec(ProbeSpec): | |
| 34 | - kind: Literal["__runner_fail"] = "__runner_fail" | |
| 35 | - | |
| 36 | - | |
| 37 | -class _FailProbe(Probe): | |
| 38 | - kind = "__runner_fail" | |
| 39 | - spec_cls = _FailSpec | |
| 40 | - category = "attribution" | |
| 41 | - | |
| 42 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 43 | - return ProbeResult(name=spec.name, kind=spec.kind, verdict=Verdict.FAIL, score=0.1) | |
| 44 | - | |
| 45 | - | |
| 46 | -class _RaiseSpec(ProbeSpec): | |
| 47 | - kind: Literal["__runner_raise"] = "__runner_raise" | |
| 48 | - | |
| 49 | - | |
| 50 | -class _RaiseProbe(Probe): | |
| 51 | - kind = "__runner_raise" | |
| 52 | - spec_cls = _RaiseSpec | |
| 53 | - | |
| 54 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 55 | - raise ProbeError(spec.kind, "kaboom") | |
| 56 | - | |
| 57 | - | |
| 58 | -class _UnexpectedSpec(ProbeSpec): | |
| 59 | - kind: Literal["__runner_unexpected"] = "__runner_unexpected" | |
| 60 | - | |
| 61 | - | |
| 62 | -class _UnexpectedProbe(Probe): | |
| 63 | - kind = "__runner_unexpected" | |
| 64 | - spec_cls = _UnexpectedSpec | |
| 65 | - | |
| 66 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 67 | - raise ValueError("surprise") | |
| 68 | - | |
| 69 | - | |
| 70 | -@pytest.fixture | |
| 71 | -def backend() -> DummyDifferentialBackend: | |
| 72 | - return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses()) | |
| 73 | - | |
| 74 | - | |
| 75 | -def _spec(*entries: dict) -> SwaySpec: | |
| 76 | - return SwaySpec.model_validate( | |
| 77 | - { | |
| 78 | - "version": 1, | |
| 79 | - "models": { | |
| 80 | - "base": {"base": "b"}, | |
| 81 | - "ft": {"base": "b", "adapter": "/tmp/a"}, | |
| 82 | - }, | |
| 83 | - "suite": list(entries), | |
| 84 | - } | |
| 85 | - ) | |
| 86 | - | |
| 87 | - | |
| 88 | -class TestRunner: | |
| 89 | - def test_runs_each_probe_in_order(self, backend: DummyDifferentialBackend) -> None: | |
| 90 | - spec = _spec( | |
| 91 | - {"name": "p1", "kind": "__runner_pass"}, | |
| 92 | - {"name": "p2", "kind": "__runner_fail"}, | |
| 93 | - ) | |
| 94 | - result = run(spec, backend) | |
| 95 | - assert [r.name for r in result.probes] == ["p1", "p2"] | |
| 96 | - assert result.probes[0].verdict == Verdict.PASS | |
| 97 | - assert result.probes[1].verdict == Verdict.FAIL | |
| 98 | - | |
| 99 | - def test_disabled_probe_records_skip(self, backend: DummyDifferentialBackend) -> None: | |
| 100 | - spec = _spec({"name": "p1", "kind": "__runner_pass", "enabled": False}) | |
| 101 | - result = run(spec, backend) | |
| 102 | - assert result.probes[0].verdict == Verdict.SKIP | |
| 103 | - assert "disabled" in result.probes[0].message | |
| 104 | - | |
| 105 | - def test_probeerror_becomes_error_verdict(self, backend: DummyDifferentialBackend) -> None: | |
| 106 | - spec = _spec({"name": "oops", "kind": "__runner_raise"}) | |
| 107 | - result = run(spec, backend) | |
| 108 | - assert result.probes[0].verdict == Verdict.ERROR | |
| 109 | - assert "kaboom" in result.probes[0].message | |
| 110 | - | |
| 111 | - def test_unexpected_exception_becomes_error_verdict( | |
| 112 | - self, backend: DummyDifferentialBackend | |
| 113 | - ) -> None: | |
| 114 | - spec = _spec({"name": "oops", "kind": "__runner_unexpected"}) | |
| 115 | - result = run(spec, backend) | |
| 116 | - assert result.probes[0].verdict == Verdict.ERROR | |
| 117 | - assert "ValueError" in result.probes[0].message | |
| 118 | - | |
| 119 | - def test_wall_seconds_populated(self, backend: DummyDifferentialBackend) -> None: | |
| 120 | - spec = _spec({"name": "p1", "kind": "__runner_pass"}) | |
| 121 | - result = run(spec, backend) | |
| 122 | - assert result.wall_seconds >= 0 | |
| 123 | - assert result.probes[0].duration_s >= 0 | |
| 124 | - | |
| 125 | - def test_null_adapter_passes_on_null_calibrated_backend( | |
| 126 | - self, backend: DummyDifferentialBackend | |
| 127 | - ) -> None: | |
| 128 | - # Dummy backend implements NullCalibratedBackend, so calibration runs. | |
| 129 | - spec = _spec({"name": "null", "kind": "null_adapter", "runs": 2, "prompts": ["q1"]}) | |
| 130 | - result = run(spec, backend) | |
| 131 | - assert result.probes[0].kind == "null_adapter" | |
| 132 | - assert result.probes[0].verdict == Verdict.PASS | |
| 133 | - # And the suite's null_stats bubbles up onto the result. | |
| 134 | - assert "delta_kl" in result.null_stats | |
sway/tests/unit/test_suite_score_report.pydeleted@@ -1,217 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.suite.score` + :mod:`dlm_sway.suite.report`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -import json | |
| 6 | -from datetime import timedelta | |
| 7 | -from typing import Literal | |
| 8 | - | |
| 9 | -import pytest | |
| 10 | - | |
| 11 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | |
| 12 | -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext | |
| 13 | -from dlm_sway.suite import report, score | |
| 14 | -from dlm_sway.suite.spec import SwaySpec | |
| 15 | - | |
| 16 | - | |
| 17 | -class _AdherenceSpec(ProbeSpec): | |
| 18 | - kind: Literal["__score_adherence"] = "__score_adherence" | |
| 19 | - | |
| 20 | - | |
| 21 | -class _AdherenceProbe(Probe): | |
| 22 | - kind = "__score_adherence" | |
| 23 | - spec_cls = _AdherenceSpec | |
| 24 | - category = "adherence" | |
| 25 | - | |
| 26 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 27 | - raise NotImplementedError # never executed; registered for category lookup | |
| 28 | - | |
| 29 | - | |
| 30 | -class _AttributionSpec(ProbeSpec): | |
| 31 | - kind: Literal["__score_attribution"] = "__score_attribution" | |
| 32 | - | |
| 33 | - | |
| 34 | -class _AttributionProbe(Probe): | |
| 35 | - kind = "__score_attribution" | |
| 36 | - spec_cls = _AttributionSpec | |
| 37 | - category = "attribution" | |
| 38 | - | |
| 39 | - def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: | |
| 40 | - raise NotImplementedError | |
| 41 | - | |
| 42 | - | |
| 43 | -def _synth_suite(*probes: ProbeResult) -> SuiteResult: | |
| 44 | - started = utcnow() | |
| 45 | - return SuiteResult( | |
| 46 | - spec_path="sway.yaml", | |
| 47 | - started_at=started, | |
| 48 | - finished_at=started + timedelta(seconds=1), | |
| 49 | - base_model_id="base", | |
| 50 | - adapter_id="adapter", | |
| 51 | - sway_version="0.1.0.dev0", | |
| 52 | - probes=probes, | |
| 53 | - ) | |
| 54 | - | |
| 55 | - | |
| 56 | -class TestCompute: | |
| 57 | - def test_single_passing_probe(self) -> None: | |
| 58 | - suite = _synth_suite( | |
| 59 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.8) | |
| 60 | - ) | |
| 61 | - s = score.compute(suite) | |
| 62 | - assert s.overall == pytest.approx(0.8) | |
| 63 | - assert s.components["adherence"] == pytest.approx(0.8) | |
| 64 | - assert s.band == "healthy" | |
| 65 | - | |
| 66 | - def test_mixed_categories_weighted(self) -> None: | |
| 67 | - suite = _synth_suite( | |
| 68 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9), | |
| 69 | - ProbeResult(name="b", kind="__score_attribution", verdict=Verdict.PASS, score=0.3), | |
| 70 | - ) | |
| 71 | - s = score.compute(suite) | |
| 72 | - # Active categories: adherence (0.30) + attribution (0.35). Normalized. | |
| 73 | - expected = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35) | |
| 74 | - assert s.overall == pytest.approx(expected) | |
| 75 | - | |
| 76 | - def test_errors_and_skips_excluded(self) -> None: | |
| 77 | - suite = _synth_suite( | |
| 78 | - ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9), | |
| 79 | - ProbeResult(name="b", kind="__score_adherence", verdict=Verdict.SKIP, score=None), | |
| 80 | - ProbeResult(name="c", kind="__score_adherence", verdict=Verdict.ERROR, score=None), | |
| 81 | - ) | |
| 82 | - s = score.compute(suite) | |
| 83 | - assert s.components["adherence"] == pytest.approx(0.9) | |
| 84 | - | |
| 85 | - def test_per_probe_weights_override_uniform(self) -> None: | |
| 86 | - suite = _synth_suite( | |
| 87 | - ProbeResult( | |
| 88 | - name="a", | |
| 89 | - kind="__score_adherence", | |
| 90 | - verdict=Verdict.PASS, | |
| 91 | - score=1.0, | |
| 92 | - evidence={"weight": 3.0}, | |
| 93 | - ), | |
| 94 | - ProbeResult( | |
| 95 | - name="b", | |
| 96 | - kind="__score_adherence", | |
| 97 | - verdict=Verdict.PASS, | |
| 98 | - score=0.0, | |
| 99 | - evidence={"weight": 1.0}, | |
| 100 | - ), | |
| 101 | - ) | |
| 102 | - s = score.compute(suite) | |
| 103 | - # Weighted mean: (3·1 + 1·0) / 4 = 0.75 | |
| 104 | - assert s.components["adherence"] == pytest.approx(0.75) | |
| 105 | - | |
| 106 | - def test_failed_probe_surfaces_in_findings(self) -> None: | |
| 107 | - suite = _synth_suite( | |
| 108 | - ProbeResult( | |
| 109 | - name="bad", | |
| 110 | - kind="__score_adherence", | |
| 111 | - verdict=Verdict.FAIL, | |
| 112 | - score=0.1, | |
| 113 | - message="nope", | |
| 114 | - ) | |
| 115 | - ) | |
| 116 | - s = score.compute(suite) | |
| 117 | - assert any("bad" in f for f in s.findings) | |
| 118 | - | |
| 119 | - | |
| 120 | -class TestJsonReport: | |
| 121 | - def test_schema_fields(self) -> None: | |
| 122 | - suite = _synth_suite( | |
| 123 | - ProbeResult( | |
| 124 | - name="p1", | |
| 125 | - kind="__score_adherence", | |
| 126 | - verdict=Verdict.PASS, | |
| 127 | - score=0.75, | |
| 128 | - raw=0.12, | |
| 129 | - z_score=3.1, | |
| 130 | - ) | |
| 131 | - ) | |
| 132 | - s = score.compute(suite) | |
| 133 | - out = json.loads(report.to_json(suite, s)) | |
| 134 | - assert out["schema_version"] == 1 | |
| 135 | - assert out["score"]["overall"] == pytest.approx(0.75) | |
| 136 | - assert out["probes"][0]["verdict"] == "pass" | |
| 137 | - assert out["probes"][0]["z_score"] == pytest.approx(3.1) | |
| 138 | - | |
| 139 | - | |
| 140 | -class TestJunit: | |
| 141 | - def test_counts_populated(self) -> None: | |
| 142 | - suite = _synth_suite( | |
| 143 | - ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=1.0), | |
| 144 | - ProbeResult(name="p2", kind="__score_adherence", verdict=Verdict.FAIL, score=0.0), | |
| 145 | - ProbeResult( | |
| 146 | - name="p3", | |
| 147 | - kind="__score_adherence", | |
| 148 | - verdict=Verdict.ERROR, | |
| 149 | - score=None, | |
| 150 | - ), | |
| 151 | - ) | |
| 152 | - s = score.compute(suite) | |
| 153 | - xml = report.to_junit(suite, s) | |
| 154 | - assert 'tests="3"' in xml | |
| 155 | - assert 'failures="1"' in xml | |
| 156 | - assert 'errors="1"' in xml | |
| 157 | - assert "<failure" in xml | |
| 158 | - assert "<error" in xml | |
| 159 | - | |
| 160 | - | |
| 161 | -class TestMarkdown: | |
| 162 | - def test_contains_probe_table(self) -> None: | |
| 163 | - suite = _synth_suite( | |
| 164 | - ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=0.8) | |
| 165 | - ) | |
| 166 | - s = score.compute(suite) | |
| 167 | - md = report.to_markdown(suite, s) | |
| 168 | - assert "dlm-sway report" in md | |
| 169 | - assert "| p1 | `__score_adherence`" in md | |
| 170 | - | |
| 171 | - | |
| 172 | -class TestTerminal: | |
| 173 | - def test_renders_without_error(self) -> None: | |
| 174 | - import io | |
| 175 | - | |
| 176 | - from rich.console import Console | |
| 177 | - | |
| 178 | - suite = _synth_suite( | |
| 179 | - ProbeResult( | |
| 180 | - name="p1", | |
| 181 | - kind="__score_adherence", | |
| 182 | - verdict=Verdict.PASS, | |
| 183 | - score=0.8, | |
| 184 | - raw=0.12, | |
| 185 | - z_score=3.1, | |
| 186 | - message="looks fine", | |
| 187 | - ), | |
| 188 | - ProbeResult( | |
| 189 | - name="p2", | |
| 190 | - kind="__score_attribution", | |
| 191 | - verdict=Verdict.FAIL, | |
| 192 | - score=0.1, | |
| 193 | - message="a very long message that will be truncated — " * 5, | |
| 194 | - ), | |
| 195 | - ProbeResult( | |
| 196 | - name="p3", | |
| 197 | - kind="__score_adherence", | |
| 198 | - verdict=Verdict.SKIP, | |
| 199 | - score=None, | |
| 200 | - ), | |
| 201 | - ) | |
| 202 | - s = score.compute(suite) | |
| 203 | - buf = io.StringIO() | |
| 204 | - console = Console(file=buf, force_terminal=False, width=120) | |
| 205 | - report.to_terminal(suite, s, console=console) | |
| 206 | - out = buf.getvalue() | |
| 207 | - assert "dlm-sway report" in out | |
| 208 | - assert "overall:" in out | |
| 209 | - assert "p1" in out | |
| 210 | - assert "p2" in out | |
| 211 | - # Top findings section kicks in because p2 failed. | |
| 212 | - assert "top findings" in out | |
| 213 | - | |
| 214 | - | |
| 215 | -# Reference the SwaySpec import so linters don't flag it as unused; | |
| 216 | -# the eventual CLI path calls into both modules with it. | |
| 217 | -assert SwaySpec is not None | |
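The renormalization that `test_mixed_categories_weighted` spells out, as a standalone check (0.30 and 0.35 are the adherence/attribution weights quoted in that test's comment; nothing else here is dlm_sway code):

```python
# Only categories that produced scored probes participate, and their
# weights are renormalized so the active set sums to 1.
weights = {"adherence": 0.30, "attribution": 0.35}
scores = {"adherence": 0.9, "attribution": 0.3}
active = sum(weights.values())  # 0.65
overall = sum(weights[c] * scores[c] for c in scores) / active
assert abs(overall - (0.30 * 0.9 + 0.35 * 0.3) / 0.65) < 1e-12
```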
sway/tests/unit/test_suite_spec.pydeleted@@ -1,85 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.suite.spec` + :mod:`dlm_sway.suite.loader`.""" | |
| 2 | - | |
| 3 | -from __future__ import annotations | |
| 4 | - | |
| 5 | -from pathlib import Path | |
| 6 | - | |
| 7 | -import pytest | |
| 8 | - | |
| 9 | -from dlm_sway.core.errors import SpecValidationError | |
| 10 | -from dlm_sway.suite.loader import from_dict, load_spec | |
| 11 | -from dlm_sway.suite.spec import SwaySpec | |
| 12 | - | |
| 13 | - | |
| 14 | -def _minimum_valid() -> dict: | |
| 15 | - return { | |
| 16 | - "version": 1, | |
| 17 | - "models": { | |
| 18 | - "base": {"kind": "hf", "base": "HuggingFaceTB/SmolLM2-135M-Instruct"}, | |
| 19 | - "ft": { | |
| 20 | - "kind": "hf", | |
| 21 | - "base": "HuggingFaceTB/SmolLM2-135M-Instruct", | |
| 22 | - "adapter": "/tmp/adapter", | |
| 23 | - }, | |
| 24 | - }, | |
| 25 | - "suite": [], | |
| 26 | - } | |
| 27 | - | |
| 28 | - | |
| 29 | -class TestSwaySpec: | |
| 30 | - def test_minimum_valid(self) -> None: | |
| 31 | - spec = from_dict(_minimum_valid()) | |
| 32 | - assert isinstance(spec, SwaySpec) | |
| 33 | - assert spec.version == 1 | |
| 34 | - assert spec.defaults.seed == 0 | |
| 35 | - assert spec.defaults.differential is True | |
| 36 | - assert spec.suite == [] | |
| 37 | - | |
| 38 | - def test_rejects_unknown_top_level_keys(self) -> None: | |
| 39 | - data = _minimum_valid() | |
| 40 | - data["bogus"] = True | |
| 41 | - with pytest.raises(SpecValidationError) as exc_info: | |
| 42 | - from_dict(data) | |
| 43 | - assert "bogus" in str(exc_info.value).lower() | |
| 44 | - | |
| 45 | - def test_rejects_future_version(self) -> None: | |
| 46 | - data = _minimum_valid() | |
| 47 | - data["version"] = 9 | |
| 48 | - with pytest.raises(SpecValidationError, match="unsupported sway spec version"): | |
| 49 | - from_dict(data) | |
| 50 | - | |
| 51 | - def test_defaults_frozen(self) -> None: | |
| 52 | - spec = from_dict(_minimum_valid()) | |
| 53 | - from pydantic import ValidationError | |
| 54 | - | |
| 55 | - with pytest.raises(ValidationError): | |
| 56 | - spec.defaults.seed = 99 # type: ignore[misc] | |
| 57 | - | |
| 58 | - | |
| 59 | -class TestLoader: | |
| 60 | - def test_missing_file(self, tmp_path: Path) -> None: | |
| 61 | - missing = tmp_path / "nope.yaml" | |
| 62 | - with pytest.raises(SpecValidationError, match="not found"): | |
| 63 | - load_spec(missing) | |
| 64 | - | |
| 65 | - def test_invalid_yaml(self, tmp_path: Path) -> None: | |
| 66 | - bad = tmp_path / "bad.yaml" | |
| 67 | - # An unmatched "{" makes PyYAML raise a scan/parse error; a | |
| 68 | - # structurally ambiguous indent parses as a string, not a YAML error. | |
| 69 | - bad.write_text("{ unmatched: [", encoding="utf-8") | |
| 70 | - with pytest.raises(SpecValidationError, match="invalid YAML"): | |
| 71 | - load_spec(bad) | |
| 72 | - | |
| 73 | - def test_non_mapping_top_level(self, tmp_path: Path) -> None: | |
| 74 | - bad = tmp_path / "list.yaml" | |
| 75 | - bad.write_text("- 1\n- 2\n", encoding="utf-8") | |
| 76 | - with pytest.raises(SpecValidationError, match="must be a mapping"): | |
| 77 | - load_spec(bad) | |
| 78 | - | |
| 79 | - def test_roundtrip_via_yaml(self, tmp_path: Path) -> None: | |
| 80 | - import yaml | |
| 81 | - | |
| 82 | - path = tmp_path / "sway.yaml" | |
| 83 | - path.write_text(yaml.safe_dump(_minimum_valid()), encoding="utf-8") | |
| 84 | - spec = load_spec(path) | |
| 85 | - assert spec.models.ft.adapter == Path("/tmp/adapter") | |
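The distinction drawn in `test_invalid_yaml`'s comment, demonstrated with PyYAML directly (assumes `pyyaml` is installed, as the roundtrip test already does):

```python
import yaml

# An unclosed flow collection dies inside PyYAML itself ...
try:
    yaml.safe_load("{ unmatched: [")
except yaml.YAMLError:
    pass
else:
    raise AssertionError("expected a YAML parse error")

# ... while an oddly indented scalar parses fine, just as a string value,
# so it has to be caught by schema validation rather than the YAML layer.
assert yaml.safe_load("a:\n  b\n") == {"a": "b"}
```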
sway/tests/unit/test_visualize.pydeleted@@ -1,202 +0,0 @@ | ||
| 1 | -"""Tests for :mod:`dlm_sway.visualize`. | |
| 2 | - | |
| 3 | -Exercises the error path (matplotlib missing) and the happy path when | |
| 4 | -the module is present by stubbing ``matplotlib.pyplot`` via sys.modules. | |
| 5 | -""" | |
| 6 | - | |
| 7 | -from __future__ import annotations | |
| 8 | - | |
| 9 | -import sys | |
| 10 | -import types | |
| 11 | -from datetime import timedelta | |
| 12 | - | |
| 13 | -import pytest | |
| 14 | - | |
| 15 | -from dlm_sway.core.errors import BackendNotAvailableError | |
| 16 | -from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow | |
| 17 | - | |
| 18 | - | |
| 19 | -def _suite_with(*probes: ProbeResult) -> SuiteResult: | |
| 20 | - started = utcnow() | |
| 21 | - return SuiteResult( | |
| 22 | - spec_path="sway.yaml", | |
| 23 | - started_at=started, | |
| 24 | - finished_at=started + timedelta(seconds=1), | |
| 25 | - base_model_id="b", | |
| 26 | - adapter_id="a", | |
| 27 | - sway_version="0.1.0.dev0", | |
| 28 | - probes=probes, | |
| 29 | - ) | |
| 30 | - | |
| 31 | - | |
| 32 | -class _FakeFig: | |
| 33 | - def tight_layout(self) -> None: # pragma: no cover — trivial | |
| 34 | - return None | |
| 35 | - | |
| 36 | - | |
| 37 | -class _FakeAx: | |
| 38 | - def __init__(self) -> None: | |
| 39 | - self.calls: list[str] = [] | |
| 40 | - | |
| 41 | - def bar(self, *a, **k): # type: ignore[no-untyped-def] | |
| 42 | - self.calls.append("bar") | |
| 43 | - | |
| 44 | - def plot(self, *a, **k): # type: ignore[no-untyped-def] | |
| 45 | - self.calls.append("plot") | |
| 46 | - | |
| 47 | - def hist(self, *a, **k): # type: ignore[no-untyped-def] | |
| 48 | - self.calls.append("hist") | |
| 49 | - | |
| 50 | - def axhline(self, *a, **k): # type: ignore[no-untyped-def] | |
| 51 | - return None | |
| 52 | - | |
| 53 | - def axvline(self, *a, **k): # type: ignore[no-untyped-def] | |
| 54 | - return None | |
| 55 | - | |
| 56 | - def set_xticks(self, *a, **k): # type: ignore[no-untyped-def] | |
| 57 | - return None | |
| 58 | - | |
| 59 | - def set_xticklabels(self, *a, **k): # type: ignore[no-untyped-def] | |
| 60 | - return None | |
| 61 | - | |
| 62 | - def set_xlabel(self, *a, **k): # type: ignore[no-untyped-def] | |
| 63 | - return None | |
| 64 | - | |
| 65 | - def set_ylabel(self, *a, **k): # type: ignore[no-untyped-def] | |
| 66 | - return None | |
| 67 | - | |
| 68 | - def set_title(self, *a, **k): # type: ignore[no-untyped-def] | |
| 69 | - return None | |
| 70 | - | |
| 71 | - def legend(self, *a, **k): # type: ignore[no-untyped-def] | |
| 72 | - return None | |
| 73 | - | |
| 74 | - | |
| 75 | -@pytest.fixture | |
| 76 | -def fake_mpl(monkeypatch: pytest.MonkeyPatch) -> _FakeAx: | |
| 77 | - ax = _FakeAx() | |
| 78 | - | |
| 79 | - def _subplots(*a, **k): # type: ignore[no-untyped-def] | |
| 80 | - return _FakeFig(), ax | |
| 81 | - | |
| 82 | - plt = types.ModuleType("matplotlib.pyplot") | |
| 83 | - plt.subplots = _subplots # type: ignore[attr-defined] | |
| 84 | - mpl_pkg = types.ModuleType("matplotlib") | |
| 85 | - monkeypatch.setitem(sys.modules, "matplotlib", mpl_pkg) | |
| 86 | - monkeypatch.setitem(sys.modules, "matplotlib.pyplot", plt) | |
| 87 | - return ax | |
| 88 | - | |
| 89 | - | |
| 90 | -def test_section_sis_plot_uses_per_section_evidence(fake_mpl: _FakeAx) -> None: | |
| 91 | - from dlm_sway.visualize import plot_section_sis | |
| 92 | - | |
| 93 | - suite = _suite_with( | |
| 94 | - ProbeResult( | |
| 95 | - name="sis", | |
| 96 | - kind="section_internalization", | |
| 97 | - verdict=Verdict.PASS, | |
| 98 | - score=0.75, | |
| 99 | - raw=0.1, | |
| 100 | - evidence={ | |
| 101 | - "per_section": [ | |
| 102 | - { | |
| 103 | - "section_id": "a", | |
| 104 | - "kind": "prose", | |
| 105 | - "tag": None, | |
| 106 | - "base_nll": 3.0, | |
| 107 | - "ft_nll": 2.5, | |
| 108 | - "own_lift": 0.17, | |
| 109 | - "leak_lift": 0.02, | |
| 110 | - "effective_sis": 0.15, | |
| 111 | - "passed": True, | |
| 112 | - }, | |
| 113 | - { | |
| 114 | - "section_id": "b", | |
| 115 | - "kind": "instruction", | |
| 116 | - "tag": "intro", | |
| 117 | - "base_nll": 4.0, | |
| 118 | - "ft_nll": 3.9, | |
| 119 | - "own_lift": 0.025, | |
| 120 | - "leak_lift": 0.03, | |
| 121 | - "effective_sis": -0.005, | |
| 122 | - "passed": False, | |
| 123 | - }, | |
| 124 | - ], | |
| 125 | - "per_section_threshold": 0.05, | |
| 126 | - }, | |
| 127 | - ) | |
| 128 | - ) | |
| 129 | - plot_section_sis(suite) | |
| 130 | - assert "bar" in fake_mpl.calls | |
| 131 | - | |
| 132 | - | |
| 133 | -def test_adapter_ablation_plot(fake_mpl: _FakeAx) -> None: | |
| 134 | - from dlm_sway.visualize import plot_adapter_ablation | |
| 135 | - | |
| 136 | - suite = _suite_with( | |
| 137 | - ProbeResult( | |
| 138 | - name="abl", | |
| 139 | - kind="adapter_ablation", | |
| 140 | - verdict=Verdict.PASS, | |
| 141 | - score=0.8, | |
| 142 | - raw=0.9, | |
| 143 | - evidence={ | |
| 144 | - "lambdas": [0.0, 0.5, 1.0, 1.25], | |
| 145 | - "mean_divergence_per_lambda": [0.0, 0.5, 1.0, 1.1], | |
| 146 | - "linearity": 0.91, | |
| 147 | - "saturation_lambda": 0.75, | |
| 148 | - "overshoot": 1.1, | |
| 149 | - }, | |
| 150 | - ) | |
| 151 | - ) | |
| 152 | - plot_adapter_ablation(suite) | |
| 153 | - assert "plot" in fake_mpl.calls | |
| 154 | - | |
| 155 | - | |
| 156 | -def test_kl_histogram_plot(fake_mpl: _FakeAx) -> None: | |
| 157 | - from dlm_sway.visualize import plot_kl_histogram | |
| 158 | - | |
| 159 | - suite = _suite_with( | |
| 160 | - ProbeResult( | |
| 161 | - name="dk", | |
| 162 | - kind="delta_kl", | |
| 163 | - verdict=Verdict.PASS, | |
| 164 | - score=0.7, | |
| 165 | - raw=0.1, | |
| 166 | - evidence={"per_prompt": [0.05, 0.1, 0.12, 0.09, 0.15], "divergence_kind": "js"}, | |
| 167 | - ) | |
| 168 | - ) | |
| 169 | - plot_kl_histogram(suite) | |
| 170 | - assert "hist" in fake_mpl.calls | |
| 171 | - | |
| 172 | - | |
| 173 | -def test_raises_when_matplotlib_missing(monkeypatch: pytest.MonkeyPatch) -> None: | |
| 174 | - # Purge matplotlib modules and block imports. | |
| 175 | - for mod in list(sys.modules): | |
| 176 | - if mod == "matplotlib" or mod.startswith("matplotlib."): | |
| 177 | - monkeypatch.delitem(sys.modules, mod, raising=False) | |
| 178 | - | |
| 179 | - import builtins | |
| 180 | - | |
| 181 | - real_import = builtins.__import__ | |
| 182 | - | |
| 183 | - def fake_import(name: str, *a, **k): # type: ignore[no-untyped-def] | |
| 184 | - if name == "matplotlib" or name.startswith("matplotlib."): | |
| 185 | - raise ImportError("matplotlib missing in this venv") | |
| 186 | - return real_import(name, *a, **k) | |
| 187 | - | |
| 188 | - monkeypatch.setattr(builtins, "__import__", fake_import) | |
| 189 | - | |
| 190 | - from dlm_sway.visualize import plot_section_sis | |
| 191 | - | |
| 192 | - suite = _suite_with() | |
| 193 | - with pytest.raises(BackendNotAvailableError): | |
| 194 | - plot_section_sis(suite) | |
| 195 | - | |
| 196 | - | |
| 197 | -def test_raises_when_no_matching_probe(fake_mpl: _FakeAx) -> None: | |
| 198 | - from dlm_sway.visualize import plot_section_sis | |
| 199 | - | |
| 200 | - suite = _suite_with() # empty — no section_internalization probe | |
| 201 | - with pytest.raises(ValueError, match="section_internalization"): | |
| 202 | - plot_section_sis(suite) | |