tenseleyflow/documentlanguagemodel / 9614cd5

feat(hardware): world_size-aware resolve + multi-GPU refusal matrix

Authored by espadonne
SHA      9614cd5875cd28b6cbdc922dde192e54febf9d34
Parents  181e542
Tree     c8f6b8f

2 changed files

Status  File                           +   -
M       src/dlm/hardware/plan.py      21   2
M       src/dlm/hardware/refusals.py  54   0
src/dlm/hardware/plan.py (modified)

@@ -24,7 +24,7 @@ from dlm.doc.schema import TrainingConfig
 from dlm.hardware.backend import Backend
 from dlm.hardware.capabilities import Capabilities
 from dlm.hardware.memory import estimate_peak_vram_gb, estimate_step_seconds
-from dlm.hardware.refusals import check_refusals
+from dlm.hardware.refusals import check_multi_gpu_refusals, check_refusals
 
 AttnImpl = Literal["flash_attention_2", "sdpa", "eager"]
 Precision = Literal["bf16", "fp16"]
@@ -39,6 +39,10 @@ class TrainingPlan:
     """Resolved training plan for the current host.
 
     Fields mirror the knobs the trainer (Sprint 09) actually consumes.
+    `world_size` (Sprint 23) is the number of data-parallel ranks; 1
+    on single-GPU / single-process paths. `effective_batch_size`
+    already folds `world_size` in, so users reading the plan don't
+    have to multiply themselves.
     """
 
     precision: Precision
@@ -49,6 +53,7 @@ class TrainingPlan:
     grad_accum: int
     effective_batch_size: int
     gradient_checkpointing: bool
+    world_size: int
     est_peak_vram_gb: float
     est_step_seconds: float
     reason: str
@@ -70,6 +75,7 @@ def resolve(
     force: bool = False,
     phase: Phase = "sft",
     num_adapters: int = 1,
+    world_size: int = 1,
 ) -> TrainingPlan:
     """Produce a concrete plan from a frontmatter config + host caps.
 
@@ -82,7 +88,19 @@ def resolve(
     `num_adapters` lets multi-adapter callers surface the count so
     F28 (multi-adapter QLoRA VRAM refusal) can fire before training
     starts. Single-adapter docs keep the default.
+
+    `world_size` (Sprint 23) is the number of data-parallel ranks.
+    Multiplies the reported `effective_batch_size` (each rank
+    processes a micro-batch independently) and scales the per-rank
+    step-time estimate down — more GPUs, less wall-clock time per
+    global step up to comm overhead. `world_size > 1` triggers the
+    multi-GPU refusal matrix (MPS/CPU refusal, heterogeneous CUDA
+    refusal).
     """
+    if world_size < 1:
+        raise ValueError(f"world_size must be >= 1, got {world_size}")
+    if world_size > 1:
+        check_multi_gpu_refusals(caps, world_size)
     check_refusals(
         training, caps, base_params, force=force, num_adapters=num_adapters
     )
@@ -132,8 +150,9 @@ def resolve(
         quant_compute_dtype=quant_dtype,
         micro_batch_size=micro_batch,
         grad_accum=grad_accum,
-        effective_batch_size=micro_batch * grad_accum,
+        effective_batch_size=micro_batch * grad_accum * world_size,
         gradient_checkpointing=gradient_checkpointing,
+        world_size=world_size,
         est_peak_vram_gb=round(est_peak, 2),
         est_step_seconds=round(est_step, 2),
         reason=reason,
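
With this change the reported global batch is the product of micro-batch size, accumulation steps, and ranks, and resolve() rejects world_size < 1 up front. A minimal worked example of the new arithmetic; the concrete numbers below are hypothetical, only the formula comes from this diff:

# Worked example of the effective_batch_size folding added above.
# The values are made up for illustration; the formula
# micro_batch * grad_accum * world_size is the one resolve() now uses.
micro_batch = 4     # samples per forward/backward pass on one rank
grad_accum = 8      # micro-batches accumulated before each optimizer step
world_size = 2      # data-parallel ranks (1 on single-GPU paths)

per_rank = micro_batch * grad_accum   # 32 samples per rank per optimizer step
effective = per_rank * world_size     # 64 globally -- what the plan now reports
assert effective == 64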
src/dlm/hardware/refusals.py (modified)

@@ -99,6 +99,60 @@ def check_refusals(
         )
 
 
+def check_multi_gpu_refusals(caps: Capabilities, world_size: int) -> None:
+    """Refuse multi-GPU configurations that can't reasonably work.
+
+    Sprint 23 scope: CUDA only. MPS doesn't do DDP; CPU multi-process
+    training is technically possible but a terrible user experience.
+    Heterogeneous CUDA GPUs (different SM families) produce
+    inconsistent mixed-precision results — refuse rather than let the
+    slower arch silently dictate the precision.
+
+    ROCm multi-GPU is explicitly out of scope for this sprint per the
+    sprint 23 plan — refuse with a pointer so users don't chase
+    phantom bugs.
+    """
+    if world_size < 2:
+        return
+    if caps.backend == Backend.MPS:
+        raise ResolutionError(
+            "Multi-GPU training on Apple Silicon (MPS) is not supported; "
+            "MPS has no DDP path. Train single-GPU or on a CUDA host.",
+        )
+    if caps.backend == Backend.CPU:
+        raise ResolutionError(
+            "Multi-GPU training on CPU is not supported. "
+            "Drop `--gpus` or run single-process.",
+        )
+    if caps.backend == Backend.ROCM:
+        raise ResolutionError(
+            "Multi-GPU training on ROCm is out of scope for Sprint 23; "
+            "train single-GPU on ROCm or use a CUDA host for multi-GPU runs.",
+        )
+    # CUDA path — heterogeneous detection is the caller's responsibility
+    # since `Capabilities` only reports a single device. Callers that
+    # assemble multi-device state (the launcher) should call
+    # `assert_homogeneous_cuda` directly before spawning ranks.
+
+
+def assert_homogeneous_cuda(sm_per_device: list[tuple[int, int] | None]) -> None:
+    """Refuse if the configured CUDA devices span different SM families.
+
+    Accepts the list of SM tuples the launcher collected from
+    `torch.cuda.get_device_capability(i)` for each selected device.
+    Mixed precision behavior on heterogeneous GPUs (e.g. Ampere +
+    Turing) is unreliable — bf16 paths silently fall back to fp16 on
+    the Turing card and the two ranks drift.
+    """
+    unique = {sm for sm in sm_per_device if sm is not None}
+    if len(unique) > 1:
+        raise ResolutionError(
+            f"Heterogeneous CUDA GPUs detected (SM families: {sorted(unique)}); "
+            "multi-GPU training requires matching compute capability. "
+            "Select GPUs of the same generation via `--gpus 0,1` etc.",
+        )
+
+
 def _effective_adapter(training: TrainingConfig) -> str:
     """Return the adapter type effectively in force.
 
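
The CUDA branch of check_multi_gpu_refusals deliberately stops short of heterogeneity detection, since Capabilities describes a single device. A sketch of the launcher-side call the comment anticipates; collect_sm_tuples is a hypothetical helper (not part of this commit), while torch.cuda.get_device_capability is the real PyTorch call the docstring names:

import torch

from dlm.hardware.refusals import assert_homogeneous_cuda


def collect_sm_tuples(device_ids: list[int]) -> list[tuple[int, int] | None]:
    # Hypothetical launcher helper: (major, minor) compute capability
    # for each selected CUDA device. A real launcher might record None
    # for a device it cannot query, which the check then skips.
    return [torch.cuda.get_device_capability(i) for i in device_ids]


# On a mixed box, e.g. Ampere (8, 6) alongside Turing (7, 5), this
# raises ResolutionError before any rank is spawned:
assert_homogeneous_cuda(collect_sm_tuples([0, 1]))

Because assert_homogeneous_cuda filters None entries before comparing, devices the launcher could not query drop out of the check rather than tripping it.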