1 """End-to-end doctor() + renderer + CLI integration."""
2
3 from __future__ import annotations
4
5 import json
6 import subprocess
7 import sys
8 from dataclasses import replace
9 from unittest.mock import patch
10
11 from typer.testing import CliRunner
12
13 from dlm.cli.app import app
14 from dlm.hardware import DoctorResult, TrainingPlan, doctor, render_text
15 from dlm.hardware.backend import Backend
16 from dlm.hardware.capabilities import Capabilities, probe
17 from tests.fixtures.hardware_mocks import force_cpu, force_cuda, force_mps
18
19
class TestDoctorResult:
    """doctor() return-value contract: capabilities always, plan or plan_error."""

    def test_returns_capabilities_on_any_backend(self) -> None:
        """On MPS with the default config, capabilities are probed and a plan resolves."""
        with force_mps():
            res = doctor()
        assert res.capabilities.backend.value == "mps"
        # MPS + default config → plan resolves
        assert res.plan is not None

    def test_refused_plan_records_error(self) -> None:
        """A QLoRA adapter request on MPS yields no plan and an explanatory error."""
        from dlm.doc.schema import TrainingConfig

        with force_mps():
            res = doctor(TrainingConfig(adapter="qlora"))
        assert res.plan is None
        assert res.plan_error is not None
        assert "MPS detected" in res.plan_error

    def test_large_mps_base_records_force_only_error(self) -> None:
        """An oversized base model on Apple Silicon is refused with a --force hint."""
        with force_mps():
            caps = probe()
        # Pretend the host has 48 GB of unified memory, then ask for a 24B base.
        caps = replace(caps, unified_memory_gb=48.0)
        with patch("dlm.hardware.doctor.probe", return_value=caps):
            res = doctor(base_params=24_000_000_000)
        assert res.plan is None
        assert res.plan_error is not None
        for needle in ("Apple Silicon", "--force"):
            assert needle in res.plan_error
46
47
class TestJsonSerialization:
    """DoctorResult.to_dict() must survive a JSON round trip."""

    def test_to_dict_is_valid_json(self) -> None:
        """Serialized result keeps backend, determinism class, and a non-null plan."""
        with force_cuda(sm=(8, 9), vram_gb=24.0):
            res = doctor()
        serialized = json.dumps(res.to_dict(), default=str)
        payload = json.loads(serialized)
        assert payload["capabilities"]["backend"] == "cuda"
        assert payload["capabilities"]["determinism_class"] == "strong"
        assert payload["plan"] is not None

    def test_determinism_class_surface(self) -> None:
        """Each backend surfaces its own determinism class string."""
        expectations = (
            (force_cpu, "advisory"),
            (force_mps, "best-effort"),
            (force_cuda, "strong"),
        )
        for forcer, expected in expectations:
            with forcer():
                probed = doctor()
            assert probed.capabilities.determinism_class == expected
68
69
class TestRender:
    """render_text() surfaces every section of a DoctorResult."""

    def test_text_output_includes_all_sections(self) -> None:
        """A successful MPS doctor run renders all expected section headers."""
        with force_mps():
            rendered = render_text(doctor())
        expected_markers = [
            "Backend:",
            "Torch:",
            "Determinism:",
            "Telemetry:",
            "Platform:",
            "Suggested plan",
        ]
        for marker in expected_markers:
            assert marker in rendered, f"missing marker {marker!r} in render"

    def test_text_output_on_refused_plan(self) -> None:
        """A refused plan renders the refusal banner and mentions MPS."""
        from dlm.doc.schema import TrainingConfig

        with force_mps():
            refused = doctor(TrainingConfig(adapter="qlora"))
        rendered = render_text(refused)
        assert "Plan refused:" in rendered
        assert "MPS" in rendered

    def test_cuda_render_surfaces_sm_vram_and_cuda_suffix(self) -> None:
        """Hand-built CUDA capabilities render SM, free VRAM, and a CUDA suffix."""
        caps_kwargs = dict(
            backend=Backend.CUDA,
            device_name="RTX 4090",
            sm=(8, 9),
            rocm_arch=None,
            vram_gb=23.9,
            unified_memory_gb=None,
            cpu_cores=16,
            ram_gb=64.0,
            supports_bf16=True,
            supports_fp16=True,
            has_flash_attention=True,
            has_xformers=False,
            has_bitsandbytes=True,
            has_triton=True,
            has_mlx=False,
            torch_version="2.11.0",
            accelerate_version="1.2.0",
            cuda_version="12.4",
            rocm_version=None,
            platform="Linux 6.8",
            determinism_class="strong",
            telemetry_posture={},
        )
        plan_kwargs = dict(
            precision="bf16",
            attn_implementation="flash_attention_2",
            use_qlora=False,
            quant_compute_dtype=None,
            micro_batch_size=2,
            grad_accum=4,
            effective_batch_size=8,
            gradient_checkpointing=False,
            est_peak_vram_gb=7.5,
            est_step_seconds=0.8,
            reason="test",
            world_size=1,
        )
        outcome = DoctorResult(
            capabilities=Capabilities(**caps_kwargs),
            plan=TrainingPlan(**plan_kwargs),
            plan_error=None,
        )
        rendered = render_text(outcome)
        assert "SM 8.9" in rendered
        assert "23.9 GB VRAM free" in rendered
        assert "Torch: 2.11.0 (CUDA 12.4)" in rendered

    def test_rocm_render_surfaces_arch_suffix_and_qlora_summary(self) -> None:
        """Hand-built ROCm capabilities render arch, ROCm suffix, and QLoRA summary."""
        caps_kwargs = dict(
            backend=Backend.ROCM,
            device_name="Radeon 7900 XTX",
            sm=(11, 0),
            rocm_arch="gfx1100",
            vram_gb=15.5,
            unified_memory_gb=None,
            cpu_cores=16,
            ram_gb=64.0,
            supports_bf16=True,
            supports_fp16=True,
            has_flash_attention=False,
            has_xformers=False,
            has_bitsandbytes=False,
            has_triton=True,
            has_mlx=False,
            torch_version="2.11.0",
            accelerate_version=None,
            cuda_version=None,
            rocm_version="6.0",
            platform="Linux 6.8",
            determinism_class="best-effort",
            telemetry_posture={},
        )
        plan_kwargs = dict(
            precision="bf16",
            attn_implementation="sdpa",
            use_qlora=True,
            quant_compute_dtype="bf16",
            micro_batch_size=1,
            grad_accum=8,
            effective_batch_size=8,
            gradient_checkpointing=True,
            est_peak_vram_gb=8.2,
            est_step_seconds=1.2,
            reason="test",
            world_size=1,
        )
        outcome = DoctorResult(
            capabilities=Capabilities(**caps_kwargs),
            plan=TrainingPlan(**plan_kwargs),
            plan_error=None,
        )
        rendered = render_text(outcome)
        assert "arch gfx1100" in rendered
        assert "Torch: 2.11.0 (ROCm 6.0)" in rendered
        assert "adapter: qlora (4-bit NF4, compute bf16)" in rendered
181
182
class TestCliDoctor:
    """CLI-level coverage: in-process Typer runner plus a real subprocess."""

    def test_cli_human_output_works(self) -> None:
        """`dlm doctor` exits 0 and prints either a plan or a refusal."""
        # Exit code + rough output shape; underlying backend is whatever
        # the test host has.
        outcome = CliRunner().invoke(app, ["doctor"])
        assert outcome.exit_code == 0, outcome.output
        assert "Backend:" in outcome.output
        assert "Suggested plan" in outcome.output or "Plan refused" in outcome.output

    def test_cli_json_output_is_valid_json(self) -> None:
        """`dlm doctor --json` emits parseable JSON with the expected keys."""
        outcome = CliRunner().invoke(app, ["doctor", "--json"])
        assert outcome.exit_code == 0, outcome.output
        payload = json.loads(outcome.output)
        for key in ("capabilities", "plan"):
            assert key in payload
        valid_classes = {"strong", "best-effort", "advisory"}
        assert payload["capabilities"]["determinism_class"] in valid_classes

    def test_dlm_doctor_subprocess_exits_zero(self) -> None:
        """Belt-and-braces: `python -m dlm doctor` on the real process."""
        cmd = [sys.executable, "-m", "dlm", "doctor"]
        proc = subprocess.run(
            cmd,
            check=False,
            capture_output=True,
            text=True,
            timeout=30,
        )
        assert proc.returncode == 0, proc.stderr
        assert "Determinism:" in proc.stdout