tenseleyflow/documentlanguagemodel / 12ccdcc

Browse files

test(hardware): ROCm LoRA smoke scaffold (opt-in via DLM_ENABLE_ROCM_SMOKE)

Authored by espadonne
SHA
12ccdcc7276ed4aab604f02c3c07fc5a11a83816
Parents
93e884d
Tree
a0e6022

2 changed files

Status | File | + | -
A tests/integration/hardware/__init__.py 0 0
A tests/integration/hardware/test_rocm_train_smoke.py 63 0
tests/integration/hardware/__init__.py — added
tests/integration/hardware/test_rocm_train_smoke.py — added
@@ -0,0 +1,63 @@
1
+"""ROCm training smoke (Sprint 22).
2
+
3
+Verifies the doctor→plan→trainer pipeline actually runs on a ROCm
4
+host without the refusal matrix blocking LoRA. Uses the tiny-model
5
+session fixture and runs for a single step; the smoke is that
6
+`run_training` returns a result rather than raising.
7
+
8
+Skipped unless:
9
+- `torch.version.hip` is truthy at runtime (real ROCm torch build)
10
+- `DLM_ENABLE_ROCM_SMOKE=1` in the environment (opt-in even on a
11
+  ROCm host so local `pytest -m slow` runs stay CPU/CUDA-only)
12
+
13
+CI: no default runner exists; expected to be run on a self-hosted
14
+ROCm box via a scheduled workflow. Documented in
15
+`docs/hardware/rocm.md`.
16
+"""
17
+
18
+from __future__ import annotations
19
+
20
+import os
21
+from typing import TYPE_CHECKING
22
+
23
+import pytest
24
+
25
+if TYPE_CHECKING:
26
+    from tests.fixtures.trained_store import TrainedStoreHandle
27
+
28
+
29
+def _rocm_host() -> bool:
30
+    try:
31
+        import torch
32
+    except ImportError:  # pragma: no cover
33
+        return False
34
+    return bool(getattr(torch.version, "hip", None))
35
+
36
+
37
# Module-wide gating, applied to every test here:
#   slow       — excluded from the default fast suite
#   ROCm gate  — requires a real ROCm (HIP) torch build at collection time
#   opt-in     — requires DLM_ENABLE_ROCM_SMOKE=1 even on a ROCm host
_requires_rocm = pytest.mark.skipif(
    not _rocm_host(), reason="requires a ROCm PyTorch build"
)
_requires_opt_in = pytest.mark.skipif(
    os.environ.get("DLM_ENABLE_ROCM_SMOKE") != "1",
    reason="set DLM_ENABLE_ROCM_SMOKE=1 to opt in to the ROCm smoke on a real host",
)
pytestmark = [pytest.mark.slow, _requires_rocm, _requires_opt_in]
45
+
46
+
47
def test_rocm_lora_smoke_runs(  # pragma: no cover - gpu+rocm path
    trained_store: TrainedStoreHandle,
) -> None:
    """One-step LoRA train on ROCm — no refusal, produces an adapter version bump."""
    # Reaching this body at all is the smoke signal: the session-scoped
    # `trained_store` fixture already performed its one training pass on
    # this ROCm host without the refusal matrix firing. All that is left
    # is to confirm the store committed an adapter version with weights.
    adapter_dir = trained_store.store.resolve_current_adapter()
    assert adapter_dir is not None, (
        "trained_store fixture produced no adapter on ROCm — "
        "LoRA path likely blocked by a refusal that shouldn't fire"
    )
    weights_path = adapter_dir / "adapter_model.safetensors"
    assert weights_path.exists(), (
        "ROCm LoRA wrote the pointer but not the adapter weights"
    )