1 """End-to-end doctor() + renderer + CLI integration."""
2
3 from __future__ import annotations
4
5 import json
6 import subprocess
7 import sys
8 from dataclasses import replace
9 from unittest.mock import patch
10
11 from typer.testing import CliRunner
12
13 from dlm.cli.app import app
14 from dlm.hardware import DoctorResult, TrainingPlan, doctor, render_text
15 from dlm.hardware.backend import Backend
16 from dlm.hardware.capabilities import Capabilities, probe
17 from tests.fixtures.hardware_mocks import force_cpu, force_cuda, force_mps
18
19
class TestDoctorResult:
    """doctor() return-value contract: capabilities always, plan or plan_error."""

    def test_returns_capabilities_on_any_backend(self) -> None:
        """On MPS with the default config, capabilities are probed and a plan resolves."""
        with force_mps():
            res = doctor()
        assert res.capabilities.backend.value == "mps"
        # MPS + default config → plan resolves
        assert res.plan is not None

    def test_refused_plan_records_error(self) -> None:
        """A QLoRA adapter request on MPS yields no plan and an explanatory error."""
        from dlm.doc.schema import TrainingConfig

        with force_mps():
            res = doctor(TrainingConfig(adapter="qlora"))
        assert res.plan is None
        assert res.plan_error is not None
        assert "MPS detected" in res.plan_error

    def test_large_mps_base_records_force_only_error(self) -> None:
        """An oversized base model on Apple Silicon is refused with a --force hint."""
        with force_mps():
            caps = probe()
        # Pretend the host has 48 GB of unified memory, then ask for a 24B base.
        caps = replace(caps, unified_memory_gb=48.0)
        with patch("dlm.hardware.doctor.probe", return_value=caps):
            res = doctor(base_params=24_000_000_000)
        assert res.plan is None
        assert res.plan_error is not None
        for needle in ("Apple Silicon", "--force"):
            assert needle in res.plan_error
46
47
class TestJsonSerialization:
    """DoctorResult.to_dict() must survive a JSON round trip."""

    def test_to_dict_is_valid_json(self) -> None:
        """Serialized result keeps backend, determinism class, and a non-null plan."""
        with force_cuda(sm=(8, 9), vram_gb=24.0):
            res = doctor()
        serialized = json.dumps(res.to_dict(), default=str)
        payload = json.loads(serialized)
        assert payload["capabilities"]["backend"] == "cuda"
        assert payload["capabilities"]["determinism_class"] == "strong"
        assert payload["plan"] is not None

    def test_determinism_class_surface(self) -> None:
        """Each backend surfaces its own determinism class string."""
        expectations = (
            (force_cpu, "advisory"),
            (force_mps, "best-effort"),
            (force_cuda, "strong"),
        )
        for forcer, expected in expectations:
            with forcer():
                probed = doctor()
            assert probed.capabilities.determinism_class == expected
68
69
class TestRender:
    """render_text() surfaces every section of a DoctorResult."""

    def test_text_output_includes_all_sections(self) -> None:
        """A successful MPS doctor run renders all expected section headers."""
        with force_mps():
            rendered = render_text(doctor())
        expected_markers = [
            "Backend:",
            "Torch:",
            "Determinism:",
            "Telemetry:",
            "Platform:",
            "Suggested plan",
        ]
        for marker in expected_markers:
            assert marker in rendered, f"missing marker {marker!r} in render"

    def test_text_output_on_refused_plan(self) -> None:
        """A refused plan renders the refusal banner and mentions MPS."""
        from dlm.doc.schema import TrainingConfig

        with force_mps():
            refused = doctor(TrainingConfig(adapter="qlora"))
        rendered = render_text(refused)
        assert "Plan refused:" in rendered
        assert "MPS" in rendered

    def test_cuda_render_surfaces_sm_vram_and_cuda_suffix(self) -> None:
        """Hand-built CUDA capabilities render SM, free VRAM, and a CUDA suffix."""
        caps_kwargs = dict(
            backend=Backend.CUDA,
            device_name="RTX 4090",
            sm=(8, 9),
            rocm_arch=None,
            vram_gb=23.9,
            unified_memory_gb=None,
            cpu_cores=16,
            ram_gb=64.0,
            supports_bf16=True,
            supports_fp16=True,
            has_flash_attention=True,
            has_xformers=False,
            has_bitsandbytes=True,
            has_triton=True,
            has_mlx=False,
            torch_version="2.11.0",
            accelerate_version="1.2.0",
            cuda_version="12.4",
            rocm_version=None,
            platform="Linux 6.8",
            determinism_class="strong",
            telemetry_posture={},
        )
        plan_kwargs = dict(
            precision="bf16",
            attn_implementation="flash_attention_2",
            use_qlora=False,
            quant_compute_dtype=None,
            micro_batch_size=2,
            grad_accum=4,
            effective_batch_size=8,
            gradient_checkpointing=False,
            est_peak_vram_gb=7.5,
            est_step_seconds=0.8,
            reason="test",
            world_size=1,
        )
        outcome = DoctorResult(
            capabilities=Capabilities(**caps_kwargs),
            plan=TrainingPlan(**plan_kwargs),
            plan_error=None,
        )
        rendered = render_text(outcome)
        assert "SM 8.9" in rendered
        assert "23.9 GB VRAM free" in rendered
        assert "Torch: 2.11.0 (CUDA 12.4)" in rendered

    def test_rocm_render_surfaces_arch_suffix_and_qlora_summary(self) -> None:
        """Hand-built ROCm capabilities render arch, ROCm suffix, and QLoRA summary."""
        caps_kwargs = dict(
            backend=Backend.ROCM,
            device_name="Radeon 7900 XTX",
            sm=(11, 0),
            rocm_arch="gfx1100",
            vram_gb=15.5,
            unified_memory_gb=None,
            cpu_cores=16,
            ram_gb=64.0,
            supports_bf16=True,
            supports_fp16=True,
            has_flash_attention=False,
            has_xformers=False,
            has_bitsandbytes=False,
            has_triton=True,
            has_mlx=False,
            torch_version="2.11.0",
            accelerate_version=None,
            cuda_version=None,
            rocm_version="6.0",
            platform="Linux 6.8",
            determinism_class="best-effort",
            telemetry_posture={},
        )
        plan_kwargs = dict(
            precision="bf16",
            attn_implementation="sdpa",
            use_qlora=True,
            quant_compute_dtype="bf16",
            micro_batch_size=1,
            grad_accum=8,
            effective_batch_size=8,
            gradient_checkpointing=True,
            est_peak_vram_gb=8.2,
            est_step_seconds=1.2,
            reason="test",
            world_size=1,
        )
        outcome = DoctorResult(
            capabilities=Capabilities(**caps_kwargs),
            plan=TrainingPlan(**plan_kwargs),
            plan_error=None,
        )
        rendered = render_text(outcome)
        assert "arch gfx1100" in rendered
        assert "Torch: 2.11.0 (ROCm 6.0)" in rendered
        assert "adapter: qlora (4-bit NF4, compute bf16)" in rendered
181
182
class TestCliDoctor:
    """CLI-level coverage: in-process Typer runner plus a real subprocess."""

    def test_cli_human_output_works(self) -> None:
        """`dlm doctor` exits 0 and prints either a plan or a refusal."""
        # Exit code + rough output shape; underlying backend is whatever
        # the test host has.
        outcome = CliRunner().invoke(app, ["doctor"])
        assert outcome.exit_code == 0, outcome.output
        assert "Backend:" in outcome.output
        assert "Suggested plan" in outcome.output or "Plan refused" in outcome.output

    def test_cli_json_output_is_valid_json(self) -> None:
        """`dlm doctor --json` emits parseable JSON with the expected keys."""
        outcome = CliRunner().invoke(app, ["doctor", "--json"])
        assert outcome.exit_code == 0, outcome.output
        payload = json.loads(outcome.output)
        for key in ("capabilities", "plan"):
            assert key in payload
        valid_classes = {"strong", "best-effort", "advisory"}
        assert payload["capabilities"]["determinism_class"] in valid_classes

    def test_dlm_doctor_subprocess_exits_zero(self) -> None:
        """Belt-and-braces: `python -m dlm doctor` on the real process."""
        cmd = [sys.executable, "-m", "dlm", "doctor"]
        proc = subprocess.run(
            cmd,
            check=False,
            capture_output=True,
            text=True,
            timeout=30,
        )
        assert proc.returncode == 0, proc.stderr
        assert "Determinism:" in proc.stdout