tenseleyflow/sway / 98ad941

Browse files

rename CLI + source references to sway; keep dlm-sway as the PyPI wheel name

Authored by espadonne
SHA
98ad9417c94e1bbeb97cf5e553878d7953513f69
Parents
3890e1e
Tree
02c7f68

13 changed files

StatusFile+-
A .gitignore 28 0
M README.md 28 16
M pyproject.toml 4 3
M src/dlm_sway/__init__.py 5 1
M src/dlm_sway/backends/hf.py 4 3
M src/dlm_sway/cli/__init__.py 1 1
M src/dlm_sway/cli/app.py 4 4
M src/dlm_sway/cli/commands.py 5 5
M src/dlm_sway/core/errors.py 2 2
M src/dlm_sway/suite/report.py 4 4
M src/dlm_sway/suite/spec.py 2 2
M tests/unit/test_cli.py 4 4
M tests/unit/test_suite_score_report.py 2 2
.gitignoreadded
@@ -0,0 +1,28 @@
1
+# Python
2
+__pycache__/
3
+*.py[cod]
4
+*$py.class
5
+*.so
6
+
7
+# Virtual envs
8
+.venv/
9
+venv/
10
+.env
11
+
12
+# Tooling caches
13
+.mypy_cache/
14
+.pytest_cache/
15
+.ruff_cache/
16
+.coverage
17
+htmlcov/
18
+
19
+# Build artifacts
20
+build/
21
+dist/
22
+*.egg-info/
23
+
24
+# Editor
25
+.idea/
26
+.vscode/
27
+*.swp
28
+.DS_Store
README.mdmodified
@@ -1,4 +1,4 @@
1
-# dlm-sway
1
+# sway
2
 
2
 
3
 Differential testing for fine-tuned causal language models.
3
 Differential testing for fine-tuned causal language models.
4
 
4
 
@@ -6,10 +6,16 @@ Differential testing for fine-tuned causal language models.
6
 in a meaningful way, or is the model just defaulting to the pretrained
6
 in a meaningful way, or is the model just defaulting to the pretrained
7
 base?*
7
 base?*
8
 
8
 
9
-`dlm-sway` gives you a trustworthy, reproducible answer with eleven
9
+`sway` gives you a trustworthy, reproducible answer with eleven
10
 purpose-built primitives, each z-scored against a null-adapter baseline.
10
 purpose-built primitives, each z-scored against a null-adapter baseline.
11
 No LLM judges. No external APIs. Deterministic on CPU where possible.
11
 No LLM judges. No external APIs. Deterministic on CPU where possible.
12
 
12
 
13
+> **Naming note.** The source repo and CLI entry point are both `sway`.
14
+> The PyPI wheel is named `dlm-sway` because `sway` is already taken on
15
+> PyPI by an unrelated project. `pip install dlm-sway` installs the
16
+> `sway` command — a wheel name that differs from the command or import
17
+> name is common on PyPI (compare `pyyaml` → `import yaml`).
18
+
13
 ## Install
19
 ## Install
14
 
20
 
15
 ```bash
21
 ```bash
@@ -22,7 +28,7 @@ pip install "dlm-sway[dlm]" # auto-generate tests from a .dlm file
22
 ## 90-second smoke test
28
 ## 90-second smoke test
23
 
29
 
24
 ```bash
30
 ```bash
25
-dlm-sway check path/to/adapter --base HuggingFaceTB/SmolLM2-135M-Instruct
31
+sway check path/to/adapter --base HuggingFaceTB/SmolLM2-135M-Instruct
26
 ```
32
 ```
27
 
33
 
28
 Outputs a verdict in under a minute on CPU for small models: *your
34
 Outputs a verdict in under a minute on CPU for small models: *your
@@ -39,17 +45,18 @@ models:
39
   ft:   {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct",
45
   ft:   {kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct",
40
          adapter: "./runs/adapter/v0003"}
46
          adapter: "./runs/adapter/v0003"}
41
 suite:
47
 suite:
42
-  - {name: knows_concept, kind: dir,
48
+  - {name: null_baseline,       kind: null_adapter, runs: 3}
43
-     prompt: "The Dunning-Kruger effect describes",
49
+  - {name: doc_divergence,      kind: delta_kl,
44
-     target: " a cognitive bias where",
50
+     prompts: ["The key insight is", "An important rule"]}
45
-     distractor: " a programming language"}
46
-  - {name: no_reversion, kind: adapter_revert, paraphrases: 4}
47
   - {name: section_attribution, kind: section_internalization}
51
   - {name: section_attribution, kind: section_internalization}
52
+  - {name: no_leakage,          kind: leakage}
53
+  - {name: ablation_shape,      kind: adapter_ablation,
54
+     prompts: ["Tell me more about"]}
48
 ```
55
 ```
49
 
56
 
50
 ```bash
57
 ```bash
51
-dlm-sway run sway.yaml              # full report to terminal + JSON
58
+sway run sway.yaml              # full report to terminal + JSON
52
-dlm-sway gate sway.yaml --junit     # CI-friendly; non-zero on fail
59
+sway gate sway.yaml --junit     # CI-friendly; non-zero on fail
53
 ```
60
 ```
54
 
61
 
55
 ## Why it exists
62
 ## Why it exists
@@ -60,8 +67,7 @@ user-authored document. The right question is *"did the adapter actually
60
 move the model toward what I wrote?"* — and existing tools answer this
67
 move the model toward what I wrote?"* — and existing tools answer this
61
 poorly.
68
 poorly.
62
 
69
 
63
-`dlm-sway` answers it directly via eleven primitives across four
70
+`sway` answers it directly via eleven primitives across four categories:
64
-categories:
65
 
71
 
66
 | Category      | Primitives                                            |
72
 | Category      | Primitives                                            |
67
 |---------------|-------------------------------------------------------|
73
 |---------------|-------------------------------------------------------|
@@ -77,16 +83,22 @@ response. A degenerate one shows a step function or an overshoot-then-
77
 crash. Nobody else does this because nobody else gets this close to the
83
 crash. Nobody else does this because nobody else gets this close to the
78
 adapter math.
84
 adapter math.
79
 
85
 
86
+**The calibration.** Every numeric probe z-scores its raw metric against
87
+a null-adapter baseline — a same-structure LoRA with random-init weights.
88
+"Your adapter's KL is 4.2σ above noise" is a far stronger claim than
89
+"it passed a fixed threshold". The null-adapter calibration requires a backend that
90
+implements `NullCalibratedBackend` (the HF backend does).
91
+
80
 ## The `.dlm` integration
92
 ## The `.dlm` integration
81
 
93
 
82
 If you trained your adapter via the [DocumentLanguageModel
94
 If you trained your adapter via the [DocumentLanguageModel
83
-project](https://github.com/tenseleyFlow/DocumentLanguageModel), sway
95
+project](https://github.com/tenseleyFlow/DocumentLanguageModel), `sway`
84
-can auto-generate a test suite from your document's sections:
96
+auto-generates a test suite from your document's sections:
85
 
97
 
86
 ```bash
98
 ```bash
87
 pip install "dlm-sway[hf,dlm]"
99
 pip install "dlm-sway[hf,dlm]"
88
-dlm-sway autogen path/to/doc.dlm -o sway.yaml
100
+sway autogen path/to/doc.dlm -o sway.yaml
89
-dlm-sway run sway.yaml
101
+sway run sway.yaml
90
 ```
102
 ```
91
 
103
 
92
 Per-section attribution tells you *which* parts of your document
104
 Per-section attribution tells you *which* parts of your document
pyproject.tomlmodified
@@ -87,11 +87,12 @@ all = [
87
 ]
87
 ]
88
 
88
 
89
 [project.scripts]
89
 [project.scripts]
90
-dlm-sway = "dlm_sway.cli.app:main"
90
+sway = "dlm_sway.cli.app:main"
91
 
91
 
92
 [project.urls]
92
 [project.urls]
93
-Homepage = "https://github.com/tenseleyFlow/DocumentLanguageModel"
93
+Homepage = "https://github.com/tenseleyFlow/sway"
94
-Issues = "https://github.com/tenseleyFlow/DocumentLanguageModel/issues"
94
+Issues = "https://github.com/tenseleyFlow/sway/issues"
95
+"Related project" = "https://github.com/tenseleyFlow/DocumentLanguageModel"
95
 
96
 
96
 [dependency-groups]
97
 [dependency-groups]
97
 dev = [
98
 dev = [
src/dlm_sway/__init__.pymodified
@@ -1,4 +1,8 @@
1
-"""dlm-sway — differential testing for fine-tuned causal language models."""
1
+"""sway — differential testing for fine-tuned causal language models.
2
+
3
+Published on PyPI as ``dlm-sway`` (the short name is taken); the CLI
4
+entry point and source repo are ``sway``.
5
+"""
2
 
6
 
3
 from __future__ import annotations
7
 from __future__ import annotations
4
 
8
 
src/dlm_sway/backends/hf.pymodified
@@ -256,9 +256,10 @@ class HuggingFaceDifferentialBackend:
256
     def as_base(self) -> Iterator[_HFView]:
256
     def as_base(self) -> Iterator[_HFView]:
257
         self._enter("base")
257
         self._enter("base")
258
         try:
258
         try:
259
-            # peft.PeftModel.disable_adapter is a context manager; mypy
259
+            # peft.PeftModel.disable_adapter is a context manager; newer
260
-            # mis-reads it as a Tensor on this transformers version.
260
+            # transformers builds ship stubs that mis-type it as a Tensor,
261
-            with self._peft_model.disable_adapter():  # type: ignore[operator]
261
+            # so we warn-only there (see hf backend mypy overrides).
262
+            with self._peft_model.disable_adapter():
262
                 yield self._make_view("base")
263
                 yield self._make_view("base")
263
         finally:
264
         finally:
264
             self._exit()
265
             self._exit()
src/dlm_sway/cli/__init__.pymodified
@@ -1,1 +1,1 @@
1
-"""Command-line interface (entry point: ``dlm-sway``)."""
1
+"""Command-line interface (entry point: ``sway``)."""
src/dlm_sway/cli/app.pymodified
@@ -1,7 +1,7 @@
1
-"""dlm-sway CLI entry point.
1
+"""sway CLI entry point.
2
 
2
 
3
 ``pip install dlm-sway`` installs this module's :func:`main` as the
3
 ``pip install dlm-sway`` installs this module's :func:`main` as the
4
-``dlm-sway`` console script. Every subcommand is a thin wrapper around a
4
+``sway`` console script. Every subcommand is a thin wrapper around a
5
 library-level function so the CLI surface mirrors what programmatic
5
 library-level function so the CLI surface mirrors what programmatic
6
 callers get.
6
 callers get.
7
 """
7
 """
@@ -14,7 +14,7 @@ from dlm_sway import __version__
14
 from dlm_sway.cli import commands
14
 from dlm_sway.cli import commands
15
 
15
 
16
 app = typer.Typer(
16
 app = typer.Typer(
17
-    name="dlm-sway",
17
+    name="sway",
18
     no_args_is_help=True,
18
     no_args_is_help=True,
19
     add_completion=False,
19
     add_completion=False,
20
     help="Differential testing for fine-tuned causal language models.",
20
     help="Differential testing for fine-tuned causal language models.",
@@ -23,7 +23,7 @@ app = typer.Typer(
23
 
23
 
24
 def _version_callback(value: bool) -> None:
24
 def _version_callback(value: bool) -> None:
25
     if value:
25
     if value:
26
-        typer.echo(f"dlm-sway {__version__}")
26
+        typer.echo(f"sway {__version__}")
27
         raise typer.Exit()
27
         raise typer.Exit()
28
 
28
 
29
 
29
 
src/dlm_sway/cli/commands.pymodified
@@ -1,4 +1,4 @@
1
-"""Command implementations for the ``dlm-sway`` CLI.
1
+"""Command implementations for the ``sway`` CLI.
2
 
2
 
3
 Each function here is wired to a subcommand in :mod:`dlm_sway.cli.app`.
3
 Each function here is wired to a subcommand in :mod:`dlm_sway.cli.app`.
4
 Commands deliberately do as little as possible themselves — the real
4
 Commands deliberately do as little as possible themselves — the real
@@ -212,7 +212,7 @@ def autogen_cmd(
212
         typer.Option("--out", "-o", help="Where to write the generated sway.yaml."),
212
         typer.Option("--out", "-o", help="Where to write the generated sway.yaml."),
213
     ] = Path("sway.yaml"),
213
     ] = Path("sway.yaml"),
214
 ) -> None:
214
 ) -> None:
215
-    """Generate a sway.yaml from a .dlm file (requires dlm-sway[dlm])."""
215
+    """Generate a sway.yaml from a .dlm file (requires the ``dlm-sway[dlm]`` extra)."""
216
     import importlib
216
     import importlib
217
 
217
 
218
     try:
218
     try:
@@ -237,7 +237,7 @@ def autogen_cmd(
237
 def doctor_cmd() -> None:
237
 def doctor_cmd() -> None:
238
     """Print backend availability and version info."""
238
     """Print backend availability and version info."""
239
     console = Console()
239
     console = Console()
240
-    console.print(f"[bold]dlm-sway[/bold] {__version__}")
240
+    console.print(f"[bold]sway[/bold] {__version__}")
241
     console.print(f"  python:    {sys.version.split()[0]}")
241
     console.print(f"  python:    {sys.version.split()[0]}")
242
     console.print(f"  platform:  {sys.platform}")
242
     console.print(f"  platform:  {sys.platform}")
243
     console.print()
243
     console.print()
@@ -360,7 +360,7 @@ def _probe_import(name: str) -> str:
360
 def _render_markdown_from_json(raw: dict[str, Any]) -> str:
360
 def _render_markdown_from_json(raw: dict[str, Any]) -> str:
361
     score: dict[str, Any] = raw.get("score", {})
361
     score: dict[str, Any] = raw.get("score", {})
362
     lines: list[str] = [
362
     lines: list[str] = [
363
-        "# dlm-sway report",
363
+        "# sway report",
364
         "",
364
         "",
365
         f"**Overall:** {score.get('overall', 0.0):.2f} (`{score.get('band', '?')}`)  ",
365
         f"**Overall:** {score.get('overall', 0.0):.2f} (`{score.get('band', '?')}`)  ",
366
         f"**Base:** `{raw.get('base_model_id', '?')}`  ",
366
         f"**Base:** `{raw.get('base_model_id', '?')}`  ",
@@ -384,7 +384,7 @@ def _render_junit_from_json(raw: dict[str, Any]) -> str:
384
     import xml.etree.ElementTree as ET
384
     import xml.etree.ElementTree as ET
385
 
385
 
386
     probes: list[dict[str, Any]] = raw.get("probes", [])
386
     probes: list[dict[str, Any]] = raw.get("probes", [])
387
-    testsuite = ET.Element("testsuite", {"name": "dlm-sway", "tests": str(len(probes))})
387
+    testsuite = ET.Element("testsuite", {"name": "sway", "tests": str(len(probes))})
388
     for p in probes:
388
     for p in probes:
389
         tc = ET.SubElement(testsuite, "testcase", {"classname": p["kind"], "name": p["name"]})
389
         tc = ET.SubElement(testsuite, "testcase", {"classname": p["kind"], "name": p["name"]})
390
         if p["verdict"] == "fail":
390
         if p["verdict"] == "fail":
src/dlm_sway/core/errors.pymodified
@@ -1,4 +1,4 @@
1
-"""Exception hierarchy for dlm-sway.
1
+"""Exception hierarchy for sway.
2
 
2
 
3
 Every error sway raises inherits from :class:`SwayError` so callers can
3
 Every error sway raises inherits from :class:`SwayError` so callers can
4
 catch the whole family with a single ``except``. Subclasses carry enough
4
 catch the whole family with a single ``except``. Subclasses carry enough
@@ -11,7 +11,7 @@ from __future__ import annotations
11
 
11
 
12
 
12
 
13
 class SwayError(Exception):
13
 class SwayError(Exception):
14
-    """Root of the dlm-sway exception hierarchy."""
14
+    """Root of the sway exception hierarchy."""
15
 
15
 
16
 
16
 
17
 class SpecValidationError(SwayError):
17
 class SpecValidationError(SwayError):
src/dlm_sway/suite/report.pymodified
@@ -8,7 +8,7 @@ JSON is the machine-readable source of truth — same fields as the
8
 :class:`SuiteResult` dataclass but flattened for easy downstream parsing
8
 :class:`SuiteResult` dataclass but flattened for easy downstream parsing
9
 (dashboards, diff tools, history tracking).
9
 (dashboards, diff tools, history tracking).
10
 
10
 
11
-JUnit XML exists to drop into CI pipelines so ``dlm-sway gate``
11
+JUnit XML exists to drop into CI pipelines so ``sway gate``
12
 integrates with existing test dashboards with no extra glue.
12
 integrates with existing test dashboards with no extra glue.
13
 """
13
 """
14
 
14
 
@@ -40,7 +40,7 @@ def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None
40
     c = console or Console()
40
     c = console or Console()
41
 
41
 
42
     header = Text.assemble(
42
     header = Text.assemble(
43
-        ("dlm-sway report — ", "bold"),
43
+        ("sway report — ", "bold"),
44
         (suite.base_model_id, "cyan"),
44
         (suite.base_model_id, "cyan"),
45
         ("  vs  ", "dim"),
45
         ("  vs  ", "dim"),
46
         (_adapter_label(suite.adapter_id), "cyan"),
46
         (_adapter_label(suite.adapter_id), "cyan"),
@@ -153,7 +153,7 @@ def to_junit(suite: SuiteResult, score: SwayScore) -> str:
153
     testsuite = ET.Element(
153
     testsuite = ET.Element(
154
         "testsuite",
154
         "testsuite",
155
         {
155
         {
156
-            "name": "dlm-sway",
156
+            "name": "sway",
157
             "tests": str(len(suite.probes)),
157
             "tests": str(len(suite.probes)),
158
             "failures": str(sum(1 for p in suite.probes if p.verdict == Verdict.FAIL)),
158
             "failures": str(sum(1 for p in suite.probes if p.verdict == Verdict.FAIL)),
159
             "errors": str(sum(1 for p in suite.probes if p.verdict == Verdict.ERROR)),
159
             "errors": str(sum(1 for p in suite.probes if p.verdict == Verdict.ERROR)),
@@ -187,7 +187,7 @@ def to_junit(suite: SuiteResult, score: SwayScore) -> str:
187
 def to_markdown(suite: SuiteResult, score: SwayScore) -> str:
187
 def to_markdown(suite: SuiteResult, score: SwayScore) -> str:
188
     """A portable, CI-friendly markdown report."""
188
     """A portable, CI-friendly markdown report."""
189
     buf = StringIO()
189
     buf = StringIO()
190
-    buf.write("# dlm-sway report\n\n")
190
+    buf.write("# sway report\n\n")
191
     buf.write(f"**Overall:** {score.overall:.2f} (`{score.band}`)  \n")
191
     buf.write(f"**Overall:** {score.overall:.2f} (`{score.band}`)  \n")
192
     buf.write(f"**Base:** `{suite.base_model_id}`  \n")
192
     buf.write(f"**Base:** `{suite.base_model_id}`  \n")
193
     buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}`  \n")
193
     buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}`  \n")
src/dlm_sway/suite/spec.pymodified
@@ -38,7 +38,7 @@ class SuiteDefaults(BaseModel):
38
     instead of toggling on one. More memory-heavy; only useful when a
38
     instead of toggling on one. More memory-heavy; only useful when a
39
     backend can't do in-place toggling."""
39
     backend can't do in-place toggling."""
40
     coverage_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6
40
     coverage_threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.6
41
-    """Minimum composite score for ``dlm-sway gate`` to pass."""
41
+    """Minimum composite score for ``sway gate`` to pass."""
42
 
42
 
43
 
43
 
44
 class SwaySpec(BaseModel):
44
 class SwaySpec(BaseModel):
@@ -58,7 +58,7 @@ class SwaySpec(BaseModel):
58
     """Optional path to a ``.dlm`` file. When present, the runner asks
58
     """Optional path to a ``.dlm`` file. When present, the runner asks
59
     :mod:`dlm_sway.integrations.dlm.resolver` for typed sections and
59
     :mod:`dlm_sway.integrations.dlm.resolver` for typed sections and
60
     hands them to probes via :attr:`RunContext.sections`. Auto-populated
60
     hands them to probes via :attr:`RunContext.sections`. Auto-populated
61
-    by ``dlm-sway autogen``."""
61
+    by ``sway autogen``."""
62
 
62
 
63
     def check_version(self) -> None:
63
     def check_version(self) -> None:
64
         """Raise ``ValueError`` if the spec version is unsupported.
64
         """Raise ``ValueError`` if the spec version is unsupported.
tests/unit/test_cli.pymodified
@@ -1,4 +1,4 @@
1
-"""Smoke tests for the dlm-sway CLI.
1
+"""Smoke tests for the sway CLI.
2
 
2
 
3
 We avoid exercising backends (they need real models) and instead test
3
 We avoid exercising backends (they need real models) and instead test
4
 arg parsing, error paths, and the read-only commands (``doctor``,
4
 arg parsing, error paths, and the read-only commands (``doctor``,
@@ -18,7 +18,7 @@ from dlm_sway.cli.app import app
18
 def test_version_exits_zero() -> None:
18
 def test_version_exits_zero() -> None:
19
     result = CliRunner().invoke(app, ["--version"])
19
     result = CliRunner().invoke(app, ["--version"])
20
     assert result.exit_code == 0
20
     assert result.exit_code == 0
21
-    assert "dlm-sway" in result.stdout
21
+    assert "sway" in result.stdout
22
 
22
 
23
 
23
 
24
 def test_help_lists_all_commands() -> None:
24
 def test_help_lists_all_commands() -> None:
@@ -32,7 +32,7 @@ def test_doctor_runs(capsys) -> None: # type: ignore[no-untyped-def]
32
     result = CliRunner().invoke(app, ["doctor"])
32
     result = CliRunner().invoke(app, ["doctor"])
33
     assert result.exit_code == 0
33
     assert result.exit_code == 0
34
     # Rich applies color codes by default; assert the bare product name appears.
34
     # Rich applies color codes by default; assert the bare product name appears.
35
-    assert "dlm-sway" in result.stdout
35
+    assert "sway" in result.stdout
36
     assert "backends" in result.stdout
36
     assert "backends" in result.stdout
37
 
37
 
38
 
38
 
@@ -69,7 +69,7 @@ def test_report_from_json(tmp_path: Path) -> None:
69
 
69
 
70
     md = CliRunner().invoke(app, ["report", str(path), "--format", "md"])
70
     md = CliRunner().invoke(app, ["report", str(path), "--format", "md"])
71
     assert md.exit_code == 0
71
     assert md.exit_code == 0
72
-    assert "dlm-sway report" in md.stdout
72
+    assert "sway report" in md.stdout
73
 
73
 
74
     junit = CliRunner().invoke(app, ["report", str(path), "--format", "junit"])
74
     junit = CliRunner().invoke(app, ["report", str(path), "--format", "junit"])
75
     assert junit.exit_code == 0
75
     assert junit.exit_code == 0
tests/unit/test_suite_score_report.pymodified
@@ -165,7 +165,7 @@ class TestMarkdown:
165
         )
165
         )
166
         s = score.compute(suite)
166
         s = score.compute(suite)
167
         md = report.to_markdown(suite, s)
167
         md = report.to_markdown(suite, s)
168
-        assert "dlm-sway report" in md
168
+        assert "sway report" in md
169
         assert "| p1 | `__score_adherence`" in md
169
         assert "| p1 | `__score_adherence`" in md
170
 
170
 
171
 
171
 
@@ -204,7 +204,7 @@ class TestTerminal:
204
         console = Console(file=buf, force_terminal=False, width=120)
204
         console = Console(file=buf, force_terminal=False, width=120)
205
         report.to_terminal(suite, s, console=console)
205
         report.to_terminal(suite, s, console=console)
206
         out = buf.getvalue()
206
         out = buf.getvalue()
207
-        assert "dlm-sway report" in out
207
+        assert "sway report" in out
208
         assert "overall:" in out
208
         assert "overall:" in out
209
         assert "p1" in out
209
         assert "p1" in out
210
         assert "p2" in out
210
         assert "p2" in out