sway: scaffold standalone subproject (pyproject, LICENSE, README)
- SHA: 3732e0b0b7a7c8764be3e80c92afacfb5be47391 (3732e0b)
- Tree: bd74e8a

| Status | File | + | - |
|---|---|---|---|
| A | LICENSE | 21 | 0 |
| A | README.md | 101 | 0 |
| A | pyproject.toml | 203 | 0 |
| A | src/dlm_sway/backends/__init__.py | 1 | 0 |
| A | src/dlm_sway/cli/__init__.py | 1 | 0 |
| A | src/dlm_sway/core/__init__.py | 1 | 0 |
| A | src/dlm_sway/integrations/__init__.py | 1 | 0 |
| A | src/dlm_sway/integrations/dlm/__init__.py | 1 | 0 |
| A | src/dlm_sway/probes/__init__.py | 1 | 0 |
| A | src/dlm_sway/py.typed | 0 | 0 |
| A | src/dlm_sway/suite/__init__.py | 1 | 0 |
| A | tests/__init__.py | 0 | 0 |
| A | tests/conftest.py | 24 | 0 |
| A | tests/fixtures/__init__.py | 0 | 0 |
| A | tests/integration/__init__.py | 0 | 0 |
| A | tests/unit/__init__.py | 0 | 0 |

LICENSE added @@ -0,0 +1,21 @@ | |||
| 1 | +MIT License | ||
| 2 | + | ||
| 3 | +Copyright (c) 2026 Matt Wolffe | ||
| 4 | + | ||
| 5 | +Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 6 | +of this software and associated documentation files (the "Software"), to deal | ||
| 7 | +in the Software without restriction, including without limitation the rights | ||
| 8 | +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 9 | +copies of the Software, and to permit persons to whom the Software is | ||
| 10 | +furnished to do so, subject to the following conditions: | ||
| 11 | + | ||
| 12 | +The above copyright notice and this permission notice shall be included in all | ||
| 13 | +copies or substantial portions of the Software. | ||
| 14 | + | ||
| 15 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 16 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 17 | +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 18 | +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 19 | +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 20 | +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 21 | +SOFTWARE. | ||
README.md added @@ -0,0 +1,101 @@ | |||
| 1 | +# dlm-sway | ||
| 2 | + | ||
| 3 | +Differential testing for fine-tuned causal language models. | ||
| 4 | + | ||
| 5 | +**One question:** *did LoRA/QLoRA training actually change model behavior | ||
| 6 | +in a meaningful way, or is the model just defaulting to the pretrained | ||
| 7 | +base?* | ||
| 8 | + | ||
| 9 | +`dlm-sway` gives you a trustworthy, reproducible answer with eleven | ||
| 10 | +purpose-built primitives, each z-scored against a null-adapter baseline. | ||
| 11 | +No LLM judges. No external APIs. Deterministic on CPU where possible. | ||
| 12 | + | ||
| 13 | +## Install | ||
| 14 | + | ||
| 15 | +```bash | ||
| 16 | +pip install "dlm-sway[hf]" # HuggingFace + PEFT backend | ||
| 17 | +pip install "dlm-sway[hf,style,semsim]" # full primitive battery | ||
| 18 | +pip install "dlm-sway[all]" # every backend and primitive extra plus optional viz (the dlm integration is a separate extra) | ||
| 19 | +pip install "dlm-sway[dlm]" # auto-generate tests from a .dlm file | ||
| 20 | +``` | ||
| 21 | + | ||
| 22 | +## 90-second smoke test | ||
| 23 | + | ||
| 24 | +```bash | ||
| 25 | +dlm-sway check path/to/adapter --base HuggingFaceTB/SmolLM2-135M-Instruct | ||
| 26 | +``` | ||
| 27 | + | ||
| 28 | +Outputs a verdict in under a minute on CPU for small models: *your | ||
| 29 | +adapter is 4.2σ above noise* ✅ or *indistinguishable from a null | ||
| 30 | +adapter* ❌. | ||
| 31 | + | ||
| 32 | +## Full suite | ||
| 33 | + | ||
| 34 | +```yaml | ||
| 35 | +# sway.yaml | ||
| 36 | +version: 1 | ||
| 37 | +models: | ||
| 38 | + base: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct"} | ||
| 39 | + ft: {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct", | ||
| 40 | + adapter: "./runs/adapter/v0003"} | ||
| 41 | +suite: | ||
| 42 | + - {name: knows_concept, kind: dir, | ||
| 43 | + prompt: "The Dunning-Kruger effect describes", | ||
| 44 | + target: " a cognitive bias where", | ||
| 45 | + distractor: " a programming language"} | ||
| 46 | + - {name: no_reversion, kind: adapter_revert, paraphrases: 4} | ||
| 47 | + - {name: section_attribution, kind: section_internalization} | ||
| 48 | +``` | ||
| 49 | + | ||
| 50 | +```bash | ||
| 51 | +dlm-sway run sway.yaml # full report to terminal + JSON | ||
| 52 | +dlm-sway gate sway.yaml --junit # CI-friendly; non-zero on fail | ||
| 53 | +``` | ||
| 54 | + | ||
| 55 | +## Why it exists | ||
| 56 | + | ||
| 57 | +Standard benchmarks (MMLU, HellaSwag) ask *"how good is this model?"* | ||
| 58 | +That's the wrong question after a targeted LoRA fine-tune on a small | ||
| 59 | +user-authored document. The right question is *"did the adapter actually | ||
| 60 | +move the model toward what I wrote?"* — and existing tools answer this | ||
| 61 | +poorly. | ||
| 62 | + | ||
| 63 | +`dlm-sway` answers it directly via eleven primitives across four | ||
| 64 | +categories: | ||
| 65 | + | ||
| 66 | +| Category | Primitives | | ||
| 67 | +|---------------|-------------------------------------------------------| | ||
| 68 | +| Adherence | `delta_kl`, `adapter_revert`, `prompt_collapse` | | ||
| 69 | +| Attribution | `section_internalization`, `paraphrase_invariance`, `preference_flip` | | ||
| 70 | +| Calibration | `style_fingerprint`, `calibration_drift`, `leakage` | | ||
| 71 | +| Ablation | `adapter_ablation` ← the signature primitive | | ||
| 72 | + | ||
| 73 | +**The signature primitive.** `adapter_ablation` scales the LoRA additive | ||
| 74 | +term by λ ∈ {0, 0.25, 0.5, 0.75, 1.0, 1.25} and measures the divergence | ||
| 75 | +curve. A healthy fine-tune shows a smooth, monotonic, non-saturated | ||
| 76 | +response. A degenerate one shows a step function or an overshoot-then- | ||
| 77 | +crash. Nobody else does this because nobody else gets this close to the | ||
| 78 | +adapter math. | ||
| 79 | + | ||
| 80 | +## The `.dlm` integration | ||
| 81 | + | ||
| 82 | +If you trained your adapter via the [DocumentLanguageModel | ||
| 83 | +project](https://github.com/tenseleyFlow/DocumentLanguageModel), sway | ||
| 84 | +can auto-generate a test suite from your document's sections: | ||
| 85 | + | ||
| 86 | +```bash | ||
| 87 | +pip install "dlm-sway[hf,dlm]" | ||
| 88 | +dlm-sway autogen path/to/doc.dlm -o sway.yaml | ||
| 89 | +dlm-sway run sway.yaml | ||
| 90 | +``` | ||
| 91 | + | ||
| 92 | +Per-section attribution tells you *which* parts of your document | ||
| 93 | +actually moved the model — a kind of signal no other tool provides. | ||
| 94 | + | ||
| 95 | +## Status | ||
| 96 | + | ||
| 97 | +Pre-alpha. API will break. Version `0.1.0` is the first tag. | ||
| 98 | + | ||
| 99 | +## License | ||
| 100 | + | ||
| 101 | +MIT | ||
pyproject.toml added @@ -0,0 +1,203 @@ | |||
| 1 | +[project] | ||
| 2 | +name = "dlm-sway" | ||
| 3 | +version = "0.1.0.dev0" | ||
| 4 | +description = "Differential testing for fine-tuned causal LMs: did LoRA/QLoRA training actually change behavior, or is the model defaulting to the pretrained base?" | ||
| 5 | +readme = "README.md" | ||
| 6 | +requires-python = ">=3.11" | ||
| 7 | +license = { text = "MIT" } | ||
| 8 | +authors = [{ name = "Matt Wolffe", email = "mfwolffe@outlook.com" }] | ||
| 9 | +keywords = [ | ||
| 10 | + "lora", | ||
| 11 | + "qlora", | ||
| 12 | + "peft", | ||
| 13 | + "fine-tuning", | ||
| 14 | + "evaluation", | ||
| 15 | + "llm", | ||
| 16 | + "differential-testing", | ||
| 17 | +] | ||
| 18 | +classifiers = [ | ||
| 19 | + "Development Status :: 3 - Alpha", | ||
| 20 | + "Intended Audience :: Developers", | ||
| 21 | + "Intended Audience :: Science/Research", | ||
| 22 | + "License :: OSI Approved :: MIT License", | ||
| 23 | + "Programming Language :: Python :: 3", | ||
| 24 | + "Programming Language :: Python :: 3.11", | ||
| 25 | + "Programming Language :: Python :: 3.12", | ||
| 26 | + "Topic :: Scientific/Engineering :: Artificial Intelligence", | ||
| 27 | +] | ||
| 28 | + | ||
| 29 | +# Core deps: spec loading, orchestration, reporting. No torch — a user | ||
| 30 | +# who only defines specs or writes a custom backend shouldn't pull 3 GB | ||
| 31 | +# of CUDA wheels. | ||
| 32 | +dependencies = [ | ||
| 33 | + "pydantic>=2.9", | ||
| 34 | + "pyyaml>=6.0", | ||
| 35 | + "typer>=0.12", | ||
| 36 | + "rich>=13.7", | ||
| 37 | + "numpy>=1.26", | ||
| 38 | + "packaging>=24.0", | ||
| 39 | +] | ||
| 40 | + | ||
| 41 | +[project.optional-dependencies] | ||
| 42 | +# HuggingFace + PEFT scoring backend. The canonical path. | ||
| 43 | +hf = [ | ||
| 44 | + "torch>=2.4", | ||
| 45 | + "transformers>=4.45", | ||
| 46 | + "peft>=0.13", | ||
| 47 | + "safetensors>=0.4", | ||
| 48 | +] | ||
| 49 | +# Apple Silicon inference. Env markers keep `uv sync --extra mlx` a no-op | ||
| 50 | +# on non-Apple hosts so Linux/CUDA contributors' wheel resolution stays | ||
| 51 | +# sane. | ||
| 52 | +mlx = [ | ||
| 53 | + "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 54 | + "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 55 | +] | ||
| 56 | +# Stylistic fingerprinting (C1). spaCy models pull at runtime via | ||
| 57 | +# `python -m spacy download`. | ||
| 58 | +style = [ | ||
| 59 | + "spacy>=3.7", | ||
| 60 | + "textstat>=0.7", | ||
| 61 | + "nlpaug>=1.1", | ||
| 62 | +] | ||
| 63 | +# Semantic similarity (A2). MiniLM ~80 MB, CPU-friendly. | ||
| 64 | +semsim = [ | ||
| 65 | + "sentence-transformers>=3.0", | ||
| 66 | +] | ||
| 67 | +# Optional .dlm integration. Only imported inside dlm_sway.integrations.dlm. | ||
| 68 | +dlm = [ | ||
| 69 | + "dlm>=0.9", | ||
| 70 | +] | ||
| 71 | +# Visualization (P9). | ||
| 72 | +viz = [ | ||
| 73 | + "matplotlib>=3.8", | ||
| 74 | +] | ||
| 75 | +all = [ | ||
| 76 | + "torch>=2.4", | ||
| 77 | + "transformers>=4.45", | ||
| 78 | + "peft>=0.13", | ||
| 79 | + "safetensors>=0.4", | ||
| 80 | + "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 81 | + "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'", | ||
| 82 | + "spacy>=3.7", | ||
| 83 | + "textstat>=0.7", | ||
| 84 | + "nlpaug>=1.1", | ||
| 85 | + "sentence-transformers>=3.0", | ||
| 86 | + "matplotlib>=3.8", | ||
| 87 | +] | ||
| 88 | + | ||
| 89 | +[project.scripts] | ||
| 90 | +dlm-sway = "dlm_sway.cli.app:main" | ||
| 91 | + | ||
| 92 | +[project.urls] | ||
| 93 | +Homepage = "https://github.com/tenseleyFlow/DocumentLanguageModel" | ||
| 94 | +Issues = "https://github.com/tenseleyFlow/DocumentLanguageModel/issues" | ||
| 95 | + | ||
| 96 | +[dependency-groups] | ||
| 97 | +dev = [ | ||
| 98 | + "pytest>=8.0", | ||
| 99 | + "pytest-cov>=5.0", | ||
| 100 | + "mypy>=1.11", | ||
| 101 | + "ruff>=0.6", | ||
| 102 | + "types-pyyaml>=6.0", | ||
| 103 | + "hypothesis>=6.152.1", | ||
| 104 | +] | ||
| 105 | + | ||
| 106 | +[build-system] | ||
| 107 | +requires = ["hatchling"] | ||
| 108 | +build-backend = "hatchling.build" | ||
| 109 | + | ||
| 110 | +[tool.hatch.build.targets.wheel] | ||
| 111 | +packages = ["src/dlm_sway"] | ||
| 112 | + | ||
| 113 | +# -------- ruff -------- | ||
| 114 | +[tool.ruff] | ||
| 115 | +line-length = 100 | ||
| 116 | +target-version = "py311" | ||
| 117 | +src = ["src", "tests"] | ||
| 118 | + | ||
| 119 | +[tool.ruff.lint] | ||
| 120 | +select = [ | ||
| 121 | + "E", # pycodestyle errors | ||
| 122 | + "F", # pyflakes | ||
| 123 | + "W", # pycodestyle warnings | ||
| 124 | + "I", # isort | ||
| 125 | + "UP", # pyupgrade | ||
| 126 | + "B", # bugbear | ||
| 127 | + "N", # pep8-naming | ||
| 128 | + "C4", # comprehensions | ||
| 129 | + "SIM", # simplify | ||
| 130 | + "PT", # pytest | ||
| 131 | + "RET", # return | ||
| 132 | + "ARG", # unused args | ||
| 133 | + "PTH", # use pathlib | ||
| 134 | + "TID", # tidy imports | ||
| 135 | +] | ||
| 136 | +ignore = [ | ||
| 137 | + "E501", # handled by formatter | ||
| 138 | +] | ||
| 139 | + | ||
| 140 | +[tool.ruff.lint.per-file-ignores] | ||
| 141 | +"tests/**/*.py" = ["ARG", "PT011", "SIM117"] | ||
| 142 | + | ||
| 143 | +[tool.ruff.lint.flake8-tidy-imports.banned-api] | ||
| 144 | +# Hard architectural boundary: the `dlm` package is only importable | ||
| 145 | +# from inside the optional integration shim. This keeps dlm-sway | ||
| 146 | +# usable for anyone with just a HuggingFace base + PEFT adapter. | ||
| 147 | +"dlm".msg = "Import `dlm` only from dlm_sway.integrations.dlm (the optional extra)." | ||
| 148 | + | ||
| 149 | +[tool.ruff.format] | ||
| 150 | +quote-style = "double" | ||
| 151 | +indent-style = "space" | ||
| 152 | + | ||
| 153 | +# -------- mypy -------- | ||
| 154 | +[tool.mypy] | ||
| 155 | +strict = true | ||
| 156 | +python_version = "3.11" | ||
| 157 | +packages = ["dlm_sway"] | ||
| 158 | +mypy_path = "src" | ||
| 159 | +warn_return_any = true | ||
| 160 | +warn_unused_ignores = true | ||
| 161 | +warn_redundant_casts = true | ||
| 162 | +no_implicit_optional = true | ||
| 163 | +disallow_untyped_decorators = true | ||
| 164 | +plugins = ["pydantic.mypy"] | ||
| 165 | + | ||
| 166 | +[tool.pydantic-mypy] | ||
| 167 | +init_forbid_extra = true | ||
| 168 | +init_typed = true | ||
| 169 | +warn_required_dynamic_aliases = true | ||
| 170 | + | ||
| 171 | +# Stubless ML ecosystem packages. Narrow boundaries in backends/* import | ||
| 172 | +# them explicitly; the rest of the codebase stays strict. | ||
| 173 | +[[tool.mypy.overrides]] | ||
| 174 | +module = [ | ||
| 175 | + "torch", | ||
| 176 | + "torch.*", | ||
| 177 | + "transformers.*", | ||
| 178 | + "peft.*", | ||
| 179 | + "safetensors.*", | ||
| 180 | + "mlx.*", | ||
| 181 | + "mlx_lm.*", | ||
| 182 | + "sentence_transformers.*", | ||
| 183 | + "spacy.*", | ||
| 184 | + "textstat.*", | ||
| 185 | + "nlpaug.*", | ||
| 186 | + "huggingface_hub.*", | ||
| 187 | + "dlm.*", | ||
| 188 | +] | ||
| 189 | +ignore_missing_imports = true | ||
| 190 | +disable_error_code = ["no-untyped-call"] | ||
| 191 | + | ||
| 192 | +# -------- pytest -------- | ||
| 193 | +[tool.pytest.ini_options] | ||
| 194 | +testpaths = ["tests"] | ||
| 195 | +addopts = [ | ||
| 196 | + "-ra", | ||
| 197 | + "-m", "not slow and not gpu and not online", | ||
| 198 | +] | ||
| 199 | +markers = [ | ||
| 200 | + "slow: expensive; deselected by default", | ||
| 201 | + "gpu: requires CUDA; skipped on CPU/MPS runners", | ||
| 202 | + "online: touches the network; skipped in offline CI", | ||
| 203 | +] | ||
src/dlm_sway/backends/__init__.py added @@ -0,0 +1,1 @@ | |||
| 1 | +"""Scoring backends: HuggingFace (``hf``), MLX (``mlx``), dummy, custom.""" | ||
src/dlm_sway/cli/__init__.py added @@ -0,0 +1,1 @@ | |||
| 1 | +"""Command-line interface (entry point: ``dlm-sway``).""" | ||
src/dlm_sway/core/__init__.py added @@ -0,0 +1,1 @@ | |||
| 1 | +"""Core abstractions: protocols, results, errors, determinism.""" | ||
src/dlm_sway/integrations/__init__.py added @@ -0,0 +1,1 @@ | |||
| 1 | +"""Optional integrations with upstream fine-tuning tools.""" | ||
src/dlm_sway/integrations/dlm/__init__.py added @@ -0,0 +1,1 @@ | |||
| 1 | +"""DLM project integration. Imports the ``dlm`` package; requires ``dlm-sway[dlm]``.""" | ||
src/dlm_sway/probes/__init__.py added @@ -0,0 +1,1 @@ | |||
| 1 | +"""Probe primitives. Each module in this package implements one primitive.""" | ||
src/dlm_sway/py.typed added
src/dlm_sway/suite/__init__.py added @@ -0,0 +1,1 @@ | |||
| 1 | +"""Suite plumbing: spec models, loader, runner, report, composite score.""" | ||
tests/__init__.py added
tests/conftest.py added @@ -0,0 +1,24 @@ | |||
| 1 | +"""Shared test fixtures. | ||
| 2 | + | ||
| 3 | +Keep the default fast-test environment offline and deterministic so unit | ||
| 4 | +tests stay below ~1 s per file. Integration tests override these via | ||
| 5 | +their own ``conftest`` when they need network access. | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +from __future__ import annotations | ||
| 9 | + | ||
| 10 | +import pytest | ||
| 11 | + | ||
| 12 | + | ||
| 13 | +@pytest.fixture(autouse=True) | ||
| 14 | +def _offline_and_no_telemetry(monkeypatch: pytest.MonkeyPatch) -> None: | ||
| 15 | + """Unit tests never touch the network. | ||
| 16 | + | ||
| 17 | + Any backend test that needs HF should be marked ``@pytest.mark.online`` | ||
| 18 | + and clear these vars explicitly. | ||
| 19 | + """ | ||
| 20 | + monkeypatch.setenv("HF_HUB_OFFLINE", "1") | ||
| 21 | + monkeypatch.setenv("TRANSFORMERS_OFFLINE", "1") | ||
| 22 | + monkeypatch.setenv("HF_DATASETS_OFFLINE", "1") | ||
| 23 | + monkeypatch.setenv("HF_HUB_DISABLE_TELEMETRY", "1") | ||
| 24 | + monkeypatch.setenv("DO_NOT_TRACK", "1") | ||
tests/fixtures/__init__.py added
tests/integration/__init__.py added
tests/unit/__init__.py added