tenseleyflow/documentlanguagemodel / a19a302

Browse files

Slow integration test: dlm train succeeds without prior dlm init

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
a19a302defe916efbea9d8d426ca5710ca8fd722
Parents
f6aba6c
Tree
f09a5e5

1 changed file

Status | File | + | -
A tests/integration/train/test_fresh_train_without_init.py 89 0
tests/integration/train/test_fresh_train_without_init.py (added)
@@ -0,0 +1,89 @@
1
+"""B12.1 regression: `dlm train` on a hand-authored `.dlm` (no prior `dlm init`).
2
+
3
+The original bug surfaced via Audit 12 E2E-1: an authored `.dlm` with a
4
+fresh ULID frontmatter passed straight to `dlm train` crashes with
5
+`manifest is corrupt: read failed: No such file or directory` after the
6
+trainer creates `<store>/{adapter,logs}/` but before any code writes
7
+the manifest.
8
+
9
+The fix in `src/dlm/cli/commands.py:train_cmd` bootstraps a manifest
10
+whenever the store layout exists but `manifest.json` does not (covers
11
+both the auto-scaffold path and this hand-authored path).
12
+
13
+This test reproduces the original failure mode end-to-end via
14
+`CliRunner` so the bootstrap can't silently regress.
15
+"""
16
+
17
+from __future__ import annotations
18
+
19
+import os
20
+from pathlib import Path
21
+
22
+import pytest
23
+from typer.testing import CliRunner
24
+
25
+pytestmark = [pytest.mark.slow, pytest.mark.online]
26
+
27
+
28
def test_fresh_train_without_init_writes_manifest_and_advances(
    tmp_path: Path,
    tiny_model_dir: Path,  # noqa: ARG001 — fixture only warms the session model cache
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """B12.1 guard: `dlm train` on a hand-authored `.dlm` with no prior `dlm init`.

    Writes a minimal document with fresh ULID frontmatter, runs one training
    step through the real CLI, and checks that the manifest was bootstrapped
    and a v0001 adapter landed in the store.
    """
    # Isolate the store under a throwaway home so nothing leaks between runs.
    home = str(tmp_path / "dlm-home")
    monkeypatch.setenv("DLM_HOME", home)

    # Hand-authored document: frontmatter carries a never-initialized ULID,
    # which is exactly the shape that originally crashed the trainer.
    frontmatter = (
        "---\n"
        "dlm_id: 01KQB000FRESHB12B12B12B12B\n"
        "dlm_version: 14\n"
        "base_model: smollm2-135m\n"
        "training:\n"
        "  adapter: lora\n"
        "  lora_r: 4\n"
        "  sequence_len: 256\n"
        "  micro_batch_size: 1\n"
        "  grad_accum: 1\n"
        "  num_epochs: 1\n"
        "---\n"
    )
    body = (
        "# Fresh\n"
        "\n"
        "::instruction::\n"
        "### Q\n"
        "What is two plus two?\n"
        "\n"
        "### A\n"
        "Four.\n"
        "\n"
        "::instruction::\n"
        "### Q\n"
        "What is the capital of France?\n"
        "\n"
        "### A\n"
        "Paris.\n"
    )
    dlm_file = tmp_path / "fresh.dlm"
    dlm_file.write_text(frontmatter + body, encoding="utf-8")

    # Imported late so DLM_HOME is already set when the app module loads.
    from dlm.cli.app import app
    from dlm.store.paths import for_dlm

    cli = CliRunner()
    res = cli.invoke(
        app,
        ["train", str(dlm_file), "--max-steps", "1", "--fresh"],
        # env= belt-and-braces: CliRunner may spawn with an explicit environ.
        env={**os.environ, "DLM_HOME": home},
        catch_exceptions=False,
    )

    assert res.exit_code == 0, f"train failed:\n{res.output}"

    store = for_dlm("01KQB000FRESHB12B12B12B12B")
    assert store.manifest.exists(), (
        "B12.1 regression: manifest.json was not bootstrapped on first train"
    )

    versions = store.adapter / "versions"
    assert versions.exists(), "adapter/versions dir missing"
    version_names = sorted(child.name for child in versions.iterdir() if child.is_dir())
    assert "v0001" in version_names, (
        f"expected v0001 adapter after first train, got {version_names}"
    )
+    )