feat(hardware): world_size-aware resolve + multi-GPU refusal matrix
- SHA: 9614cd5875cd28b6cbdc922dde192e54febf9d34
- Parents: 181e542
- Tree: c8f6b8f

| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/hardware/plan.py | 21 | 2 |
| M | src/dlm/hardware/refusals.py | 54 | 0 |
src/dlm/hardware/plan.py (modified)

```diff
@@ -24,7 +24,7 @@ from dlm.doc.schema import TrainingConfig
 from dlm.hardware.backend import Backend
 from dlm.hardware.capabilities import Capabilities
 from dlm.hardware.memory import estimate_peak_vram_gb, estimate_step_seconds
-from dlm.hardware.refusals import check_refusals
+from dlm.hardware.refusals import check_multi_gpu_refusals, check_refusals
 
 AttnImpl = Literal["flash_attention_2", "sdpa", "eager"]
 Precision = Literal["bf16", "fp16"]
@@ -39,6 +39,10 @@ class TrainingPlan:
     """Resolved training plan for the current host.
 
     Fields mirror the knobs the trainer (Sprint 09) actually consumes.
+    `world_size` (Sprint 23) is the number of data-parallel ranks; 1
+    on single-GPU / single-process paths. `effective_batch_size`
+    already folds `world_size` in, so users reading the plan don't
+    have to multiply themselves.
     """
 
     precision: Precision
@@ -49,6 +53,7 @@ class TrainingPlan:
     grad_accum: int
     effective_batch_size: int
     gradient_checkpointing: bool
+    world_size: int
     est_peak_vram_gb: float
     est_step_seconds: float
     reason: str
@@ -70,6 +75,7 @@ def resolve(
     force: bool = False,
     phase: Phase = "sft",
     num_adapters: int = 1,
+    world_size: int = 1,
 ) -> TrainingPlan:
     """Produce a concrete plan from a frontmatter config + host caps.
 
@@ -82,7 +88,19 @@ def resolve(
     `num_adapters` lets multi-adapter callers surface the count so
     F28 (multi-adapter QLoRA VRAM refusal) can fire before training
     starts. Single-adapter docs keep the default.
+
+    `world_size` (Sprint 23) is the number of data-parallel ranks.
+    It multiplies the reported `effective_batch_size` (each rank
+    processes a micro-batch independently) and scales the per-rank
+    step-time estimate down — more GPUs, less wall-clock time per
+    global step, up to comm overhead. `world_size > 1` triggers the
+    multi-GPU refusal matrix (MPS/CPU refusal, heterogeneous CUDA
+    refusal).
     """
+    if world_size < 1:
+        raise ValueError(f"world_size must be >= 1, got {world_size}")
+    if world_size > 1:
+        check_multi_gpu_refusals(caps, world_size)
     check_refusals(
         training, caps, base_params, force=force, num_adapters=num_adapters
     )
@@ -132,8 +150,9 @@ def resolve(
         quant_compute_dtype=quant_dtype,
         micro_batch_size=micro_batch,
         grad_accum=grad_accum,
-        effective_batch_size=micro_batch * grad_accum,
+        effective_batch_size=micro_batch * grad_accum * world_size,
         gradient_checkpointing=gradient_checkpointing,
+        world_size=world_size,
         est_peak_vram_gb=round(est_peak, 2),
         est_step_seconds=round(est_step, 2),
         reason=reason,
```
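
The batch-size change is pure bookkeeping. A minimal sketch of what the resolved plan reports before and after this commit (the numbers are invented for illustration, not taken from any real config):

```python
# Illustrative values only; nothing here comes from a real config.
micro_batch, grad_accum, world_size = 4, 8, 2

# Pre-change the plan reported the per-rank view:  4 * 8     = 32.
# Post-change it reports the global view:          4 * 8 * 2 = 64.
effective_batch_size = micro_batch * grad_accum * world_size
assert effective_batch_size == 64

# And the new guard rejects nonsense before any refusal check runs:
# resolve(..., world_size=0) -> ValueError("world_size must be >= 1, got 0")
```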
src/dlm/hardware/refusals.py (modified)

```diff
@@ -99,6 +99,60 @@ def check_refusals(
     )
 
 
+def check_multi_gpu_refusals(caps: Capabilities, world_size: int) -> None:
+    """Refuse multi-GPU configurations that can't reasonably work.
+
+    Sprint 23 scope: CUDA only. MPS doesn't do DDP; CPU multi-process
+    training is technically possible but a terrible user experience.
+    Heterogeneous CUDA GPUs (different SM families) produce
+    inconsistent mixed-precision results — refuse rather than let the
+    slower arch silently dictate the precision.
+
+    ROCm multi-GPU is explicitly out of scope for this sprint per the
+    Sprint 23 plan — refuse with a pointer so users don't chase
+    phantom bugs.
+    """
+    if world_size < 2:
+        return
+    if caps.backend == Backend.MPS:
+        raise ResolutionError(
+            "Multi-GPU training on Apple Silicon (MPS) is not supported; "
+            "MPS has no DDP path. Train single-GPU or on a CUDA host.",
+        )
+    if caps.backend == Backend.CPU:
+        raise ResolutionError(
+            "Multi-GPU training on CPU is not supported. "
+            "Drop `--gpus` or run single-process.",
+        )
+    if caps.backend == Backend.ROCM:
+        raise ResolutionError(
+            "Multi-GPU training on ROCm is out of scope for Sprint 23; "
+            "train single-GPU on ROCm or use a CUDA host for multi-GPU runs.",
+        )
+    # CUDA path — heterogeneous detection is the caller's responsibility
+    # since `Capabilities` only reports a single device. Callers that
+    # assemble multi-device state (the launcher) should call
+    # `assert_homogeneous_cuda` directly before spawning ranks.
+
+
+def assert_homogeneous_cuda(sm_per_device: list[tuple[int, int] | None]) -> None:
+    """Refuse if the configured CUDA devices span different SM families.
+
+    Accepts the list of SM tuples the launcher collected from
+    `torch.cuda.get_device_capability(i)` for each selected device.
+    Mixed-precision behavior on heterogeneous GPUs (e.g. Ampere +
+    Turing) is unreliable — bf16 paths silently fall back to fp16 on
+    the Turing card and the two ranks drift.
+    """
+    unique = {sm for sm in sm_per_device if sm is not None}
+    if len(unique) > 1:
+        raise ResolutionError(
+            f"Heterogeneous CUDA GPUs detected (SM families: {sorted(unique)}); "
+            "multi-GPU training requires matching compute capability. "
+            "Select GPUs of the same generation via `--gpus 0,1` etc.",
+        )
+
+
 def _effective_adapter(training: TrainingConfig) -> str:
     """Return the adapter type effectively in force.
 
```
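
Per the comment closing the CUDA branch, heterogeneous detection lives with the launcher, the only place that sees every selected device. A minimal caller-side sketch, assuming the launcher has already parsed `--gpus` into device ids; `collect_sm_per_device` is a hypothetical helper for illustration, not part of this commit:

```python
import torch

from dlm.hardware.refusals import assert_homogeneous_cuda


def collect_sm_per_device(device_ids: list[int]) -> list[tuple[int, int] | None]:
    """Gather (major, minor) compute capability for each selected CUDA device."""
    sms: list[tuple[int, int] | None] = []
    for i in device_ids:
        try:
            sms.append(torch.cuda.get_device_capability(i))
        except RuntimeError:
            # Device probe failed; record None so the check skips it.
            sms.append(None)
    return sms


# e.g. `--gpus 0,1`: an Ampere (8, 6) + Turing (7, 5) mix raises
# ResolutionError here, before any rank is spawned.
assert_homogeneous_cuda(collect_sm_per_device([0, 1]))
```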