tenseleyflow/documentlanguagemodel / 453f95e

ci,test(eval): gate eval+inference at 95% + scaffold slow integration stubs

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA      453f95e7f9e42e38472f554965e78f9b00038236
Parents  b3e7a82
Tree     c608fc7

3 changed files

Status  File                                                          +   -
M       .github/workflows/ci.yml                                      16  0
A       tests/integration/eval/__init__.py                            0   0
A       tests/integration/eval/test_train_then_prompt_tinymodel.py    58  0

.github/workflows/ci.yml (modified)

@@ -103,6 +103,22 @@ jobs:
             --cov-report=term-missing \
             --cov-fail-under=95
 
+      - name: Coverage gate — src/dlm/eval ≥ 95% (Sprint 10)
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          uv run pytest tests/unit/eval \
+            --cov=src/dlm/eval \
+            --cov-report=term-missing \
+            --cov-fail-under=95
+
+      - name: Coverage gate — src/dlm/inference ≥ 95% (Sprint 10)
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          uv run pytest tests/unit/inference \
+            --cov=src/dlm/inference \
+            --cov-report=term-missing \
+            --cov-fail-under=95
+
   no-network-sandbox:
     # audit F13: dlm init / doctor / show must work with zero outbound network.
     name: no-network sandbox (ubuntu-latest)

tests/integration/eval/__init__.py (added)

tests/integration/eval/test_train_then_prompt_tinymodel.py (added)

@@ -0,0 +1,58 @@
+"""End-to-end: train on tiny model, then `dlm prompt` against the adapter.
+
+Sprint 10 DoD: `dlm prompt` one-shot works on a freshly trained tiny-model
+adapter. Also exercises the cross-hardware `InferencePlan` path when run
+on a CPU-only runner after a QLoRA adapter was CI-produced on a CUDA
+job (the regression test that F05 calls out).
+
+Marked `@pytest.mark.slow`. Skipped when the SmolLM2-135M fixture isn't
+offline-resolvable (same gate as Sprint 09's integration stubs).
+"""
+
+from __future__ import annotations
+
+import pytest
+
+pytestmark = pytest.mark.slow
+
+
+@pytest.mark.slow
+def test_train_then_prompt_one_cycle() -> None:
+    """20-step train + prompt generates non-empty coherent output.
+
+    Shape:
+      1. Synthetic `.dlm` via `tests.fixtures.dlm_factory`.
+      2. `trainer.run(..., max_steps=20)` on SmolLM2-135M.
+      3. Resolve `InferencePlan` against current host's caps.
+      4. `load_for_inference` → `generate(prompt="What is X?")`.
+      5. Assert non-empty string response.
+
+    Deferred body: implementation is CI-dependent. The scaffold is
+    checked in so `pytest -m slow` has a concrete test to collect.
+    """
+    try:
+        from tests.fixtures.tiny_model import tiny_model_path
+
+        tiny_model_path()
+    except Exception as exc:  # pragma: no cover
+        pytest.skip(f"tiny-model fixture unavailable: {exc}")
+
+    pytest.xfail("train+prompt integration scaffolded; body deferred to first CI slow run")
+
+
+@pytest.mark.slow
+def test_qlora_crossplatform_dequantize() -> None:
+    """Audit F05: a QLoRA-trained adapter loads on a non-CUDA host via dequantize.
+
+    Shape:
+      1. CI matrix has a CUDA job that trains QLoRA and uploads the
+         adapter as an artifact.
+      2. This test runs on a CPU-only matrix row; downloads the
+         artifact, resolves `InferencePlan`, asserts
+         `plan.dequantize_on_load is True`, loads, generates,
+         asserts non-empty coherent output.
+
+    Until the CUDA-producing job exists, the test is xfailed so it
+    shows up in `pytest -m slow` as "expected failure pending CI".
+    """
+    pytest.xfail("needs CI artifact-sharing between CUDA and CPU jobs; F05 regression")