tenseleyflow/documentlanguagemodel / 42c551a

Browse files

refactor(modality): ModalityDispatch registry + fold spec.modality == scatter

src/dlm/modality/ gains errors.py, registry.py (base class + predicate
flags + dispatch hooks), vl.py, audio.py, text.py, plus the MODALITIES
dict + modality_for() helper. Predicate flags (requires_processor,
accepts_images, accepts_audio) replace string comparisons; subclasses
override dispatch_export + load_processor where they have real work.

Folds eleven call sites off spec.modality == "..." onto predicate reads
or modality_for(spec).dispatch_export() — cli/commands.py (two export
branches + two prompt-guardrail branches), train/trainer.py (is_vl /
is_audio / is_media trio), train/loader.py (VL / audio base-model
class picker), base_models/probes.py (token-probe dispatch +
is_media).

Per-file-ignore ARG002 carves out the polymorphic signature — each
subclass uses a different subset of the shared kwargs.
Authored by espadonne
SHA
42c551a67de0aa1c85fe8208e0e37b7b848cbf6d
Parents
c00a243
Tree
9dc0fc4

11 changed files

StatusFile+-
M pyproject.toml 6 0
M src/dlm/base_models/probes.py 6 3
M src/dlm/cli/commands.py 17 15
A src/dlm/modality/__init__.py 43 0
A src/dlm/modality/audio.py 46 0
A src/dlm/modality/errors.py 11 0
A src/dlm/modality/registry.py 108 0
A src/dlm/modality/text.py 7 0
A src/dlm/modality/vl.py 47 0
M src/dlm/train/loader.py 5 2
M src/dlm/train/trainer.py 6 3
pyproject.tomlmodified
@@ -152,6 +152,12 @@ ignore = [
152152
 # positionally even when the implementation only reads some of them —
153153
 # HF dispatches them by position. ARG002 for these wrappers is noise.
154154
 "src/dlm/train/cpt/embed_warmup.py" = ["ARG002"]
155
+# Modality dispatch uses a polymorphic interface — each subclass uses
156
+# a different subset of the keyword args (text.dispatch_export reads
157
+# none, VL reads gguf_emission_context, audio ignores it). ARG002
158
+# flags the unused ones in each branch; the shared signature is the
159
+# point of the abstraction.
160
+"src/dlm/modality/*.py" = ["ARG002"]
155161
 
156162
 [tool.ruff.format]
157163
 quote-style = "double"
src/dlm/base_models/probes.pymodified
@@ -562,10 +562,13 @@ def run_all(spec: BaseModelSpec, *, skip_export_probes: bool = False) -> ProbeRe
562562
     the vendored copy catches up. VL bases auto-opt-out of export
563563
     probes — GGUF conversion for VL archs is tracked in Sprint 35.4.
564564
     """
565
+    from dlm.modality import modality_for
566
+
567
+    dispatch = modality_for(spec)
565568
     core: tuple[ProbeResult, ...] = (probe_architecture(spec),)
566
-    if spec.modality == "vision-language":
569
+    if dispatch.accepts_images:
567570
         core = (*core, probe_vl_image_token(spec))
568
-    elif spec.modality == "audio-language":
571
+    elif dispatch.accepts_audio:
569572
         core = (*core, probe_audio_token(spec))
570573
     else:
571574
         core = (*core, probe_chat_template(spec))
@@ -574,7 +577,7 @@ def run_all(spec: BaseModelSpec, *, skip_export_probes: bool = False) -> ProbeRe
574577
     # converter support for VL archs is Sprint 35.4's scope, and audio
575578
     # archs are not on any llama.cpp roadmap yet. The export path
576579
     # refuses GGUF cleanly for both and emits an HF snapshot instead.
577
-    is_media = spec.modality in ("vision-language", "audio-language")
580
+    is_media = dispatch.requires_processor
578581
     if skip_export_probes or is_media:
579582
         return ProbeReport(hf_id=spec.hf_id, results=core)
580583
     results = (
src/dlm/cli/commands.pymodified
@@ -1253,20 +1253,22 @@ def prompt_cmd(
12531253
     # The VL branch has its own model / processor / adapter loader and
12541254
     # its own generate function. `--image` and vision-language bases
12551255
     # must appear together; each alone is a usage error.
1256
-    is_vl_spec = spec.modality == "vision-language"
1257
-    if image_paths and not is_vl_spec:
1256
+    from dlm.modality import modality_for
1257
+
1258
+    dispatch = modality_for(spec)
1259
+    if image_paths and not dispatch.accepts_images:
12581260
         console.print(
12591261
             f"[red]prompt:[/red] --image is only valid with vision-language bases; "
12601262
             f"base {spec.key!r} is modality='{spec.modality}'."
12611263
         )
12621264
         raise typer.Exit(code=2)
1263
-    if is_vl_spec and not image_paths:
1265
+    if dispatch.accepts_images and not image_paths:
12641266
         console.print(
12651267
             f"[red]prompt:[/red] base {spec.key!r} is vision-language; "
12661268
             "pass at least one --image PATH to prompt it."
12671269
         )
12681270
         raise typer.Exit(code=2)
1269
-    if is_vl_spec:
1271
+    if dispatch.accepts_images:
12701272
         _dispatch_vl_prompt(
12711273
             console=console,
12721274
             spec=spec,
@@ -1283,20 +1285,19 @@ def prompt_cmd(
12831285
         return
12841286
 
12851287
     # --- Audio path (Sprint 35.2) -------------------------------------
1286
-    is_audio_spec = spec.modality == "audio-language"
1287
-    if audio_paths and not is_audio_spec:
1288
+    if audio_paths and not dispatch.accepts_audio:
12881289
         console.print(
12891290
             f"[red]prompt:[/red] --audio is only valid with audio-language bases; "
12901291
             f"base {spec.key!r} is modality='{spec.modality}'."
12911292
         )
12921293
         raise typer.Exit(code=2)
1293
-    if is_audio_spec and not audio_paths:
1294
+    if dispatch.accepts_audio and not audio_paths:
12941295
         console.print(
12951296
             f"[red]prompt:[/red] base {spec.key!r} is audio-language; "
12961297
             "pass at least one --audio PATH to prompt it."
12971298
         )
12981299
         raise typer.Exit(code=2)
1299
-    if is_audio_spec:
1300
+    if dispatch.accepts_audio:
13001301
         _dispatch_audio_prompt(
13011302
             console=console,
13021303
             spec=spec,
@@ -1702,11 +1703,12 @@ def export_cmd(
17021703
     # Audio bases take HF-snapshot unconditionally — llama.cpp has no
17031704
     # audio-arch roadmap at our pinned tag — so branch early without
17041705
     # resolving a GGUF plan.
1705
-    if spec.modality == "audio-language":
1706
-        from dlm.export.dispatch import dispatch_audio_export
1706
+    from dlm.modality import modality_for
17071707
 
1708
+    export_dispatch = modality_for(spec)
1709
+    if export_dispatch.accepts_audio:
17081710
         try:
1709
-            dispatch_result = dispatch_audio_export(
1711
+            dispatch_result = export_dispatch.dispatch_export(
17101712
                 store=store,
17111713
                 spec=spec,
17121714
                 adapter_name=adapter,
@@ -1717,6 +1719,7 @@ def export_cmd(
17171719
         except ExportError as exc:
17181720
             console.print(f"[red]export:[/red] {exc}")
17191721
             raise typer.Exit(code=1) from exc
1722
+        assert dispatch_result is not None  # audio modality always returns a result
17201723
         for line in dispatch_result.banner_lines:
17211724
             console.print(line)
17221725
         return
@@ -1742,9 +1745,7 @@ def export_cmd(
17421745
     # still need the resolved plan + cached base dir for the GGUF
17431746
     # path, so resolve those first, then let the dispatcher decide
17441747
     # whether to use them.
1745
-    if spec.modality == "vision-language":
1746
-        from dlm.export.dispatch import dispatch_vl_export
1747
-
1748
+    if export_dispatch.accepts_images:
17481749
         try:
17491750
             cached_vl = download_spec(spec, local_files_only=True)
17501751
         except RuntimeError as exc:
@@ -1754,7 +1755,7 @@ def export_cmd(
17541755
             )
17551756
             raise typer.Exit(code=1) from exc
17561757
         try:
1757
-            dispatch_result = dispatch_vl_export(
1758
+            dispatch_result = export_dispatch.dispatch_export(
17581759
                 store=store,
17591760
                 spec=spec,
17601761
                 adapter_name=adapter,
@@ -1772,6 +1773,7 @@ def export_cmd(
17721773
         except ExportError as exc:
17731774
             console.print(f"[red]export:[/red] {exc}")
17741775
             raise typer.Exit(code=1) from exc
1776
+        assert dispatch_result is not None  # VL modality always returns a result
17751777
         for line in dispatch_result.banner_lines:
17761778
             console.print(line)
17771779
         return
src/dlm/modality/__init__.pyadded
@@ -0,0 +1,43 @@
1
+"""Modality dispatch package — replaces scattered ``spec.modality ==`` branches.
2
+
3
+Public surface:
4
+
5
+- :class:`ModalityDispatch` — base class with predicate flags +
6
+  dispatch hooks (``dispatch_export``, ``load_processor``).
7
+- :data:`MODALITIES` — string → instance registry.
8
+- :func:`modality_for` — resolve a spec to its dispatcher.
9
+- :class:`UnknownModalityError` — raised when a spec's modality
10
+  string has no registered dispatcher.
11
+
12
+Callers that previously wrote ``if spec.modality == "vision-language"``
13
+now read ``modality_for(spec).accepts_images`` (or one of the other
14
+predicate flags) or call a dispatch method directly. A pregate
15
+grep-gate refuses new scatter — see ``scripts/pregate.sh``.
16
+"""
17
+
18
+from __future__ import annotations
19
+
20
+from dlm.modality.audio import AudioLanguageModality
21
+from dlm.modality.errors import ModalityError, UnknownModalityError
22
+from dlm.modality.registry import ModalityDispatch, TextModality, modality_for
23
+from dlm.modality.vl import VisionLanguageModality
24
+
25
+MODALITIES: dict[str, ModalityDispatch] = {
26
+    "text": TextModality(),
27
+    "vision-language": VisionLanguageModality(),
28
+    "audio-language": AudioLanguageModality(),
29
+}
30
+"""Registry: modality string → dispatcher instance. Ordered by
31
+registration history — future modalities append here and land a
32
+corresponding class under ``dlm.modality``."""
33
+
34
+__all__ = [
35
+    "MODALITIES",
36
+    "AudioLanguageModality",
37
+    "ModalityDispatch",
38
+    "ModalityError",
39
+    "TextModality",
40
+    "UnknownModalityError",
41
+    "VisionLanguageModality",
42
+    "modality_for",
43
+]
src/dlm/modality/audio.pyadded
@@ -0,0 +1,46 @@
1
+"""Audio-language modality dispatch."""
2
+
3
+from __future__ import annotations
4
+
5
+from typing import TYPE_CHECKING, Any
6
+
7
+from dlm.modality.registry import ModalityDispatch
8
+
9
+if TYPE_CHECKING:
10
+    from dlm.base_models import BaseModelSpec
11
+    from dlm.export.dispatch import DispatchResult
12
+
13
+
14
+class AudioLanguageModality(ModalityDispatch):
15
+    """Audio-language base — audio accepted, processor required, HF-snapshot export."""
16
+
17
+    modality = "audio-language"
18
+    requires_processor = True
19
+    accepts_audio = True
20
+
21
+    def load_processor(self, spec: BaseModelSpec) -> Any:
22
+        from dlm.train.loader import load_processor as _load
23
+
24
+        return _load(spec)
25
+
26
+    def dispatch_export(
27
+        self,
28
+        *,
29
+        store: Any,
30
+        spec: BaseModelSpec,
31
+        adapter_name: str | None,
32
+        quant: str | None,
33
+        merged: bool,
34
+        adapter_mix_raw: str | None,
35
+        gguf_emission_context: dict[str, Any] | None = None,
36
+    ) -> DispatchResult:
37
+        from dlm.export.dispatch import dispatch_audio_export
38
+
39
+        return dispatch_audio_export(
40
+            store=store,
41
+            spec=spec,
42
+            adapter_name=adapter_name,
43
+            quant=quant,
44
+            merged=merged,
45
+            adapter_mix_raw=adapter_mix_raw,
46
+        )
src/dlm/modality/errors.pyadded
@@ -0,0 +1,11 @@
1
+"""Typed errors for modality dispatch."""
2
+
3
+from __future__ import annotations
4
+
5
+
6
+class ModalityError(Exception):
7
+    """Base for `dlm.modality` errors."""
8
+
9
+
10
+class UnknownModalityError(ModalityError):
11
+    """Spec declares a modality string the registry doesn't know."""
src/dlm/modality/registry.pyadded
@@ -0,0 +1,108 @@
1
+"""Modality dispatch base class — predicate flags + method hooks.
2
+
3
+Callers that used to branch on ``spec.modality == "vision-language"``
4
+or ``"audio-language"`` now read from a registered
5
+:class:`ModalityDispatch` instance. Three concrete subclasses live
6
+under the ``dlm.modality`` package — one per supported modality —
7
+registered in :data:`MODALITIES` and resolved via
8
+:func:`modality_for`. The split keeps the "does this spec accept
9
+images?" predicate next to the "route the export through the VL
10
+path" method: both are modality-specific concerns.
11
+
12
+Each instance carries:
13
+
14
+- ``modality`` (string tag — the only place a `"vision-language"`
15
+  string literal appears outside the base-model schema);
16
+- predicate flags (``requires_processor``, ``accepts_images``,
17
+  ``accepts_audio``) callers read instead of comparing the tag;
18
+- dispatch hooks (``dispatch_export``, ``dispatch_prompt``) that
19
+  forward to the modality-specific pipeline.
20
+
21
+A pregate grep-gate refuses new ``spec.modality ==`` comparisons
22
+outside this package so next-modality work lands here rather than
23
+scattering another set of branches.
24
+"""
25
+
26
+from __future__ import annotations
27
+
28
+from typing import TYPE_CHECKING, Any
29
+
30
+from dlm.modality.errors import UnknownModalityError
31
+
32
+if TYPE_CHECKING:
33
+    from dlm.base_models import BaseModelSpec
34
+    from dlm.export.dispatch import DispatchResult
35
+
36
+
37
+class ModalityDispatch:
38
+    """Base class — subclasses override per-modality predicates + hooks.
39
+
40
+    The base implementation defaults to the text-path semantics
41
+    (nothing to probe, nothing to dispatch). Subclasses narrow the
42
+    predicates and override the dispatch hooks.
43
+    """
44
+
45
+    modality: str = "text"
46
+    """The modality tag. The only place modality string literals
47
+    should appear outside this package."""
48
+
49
+    requires_processor: bool = False
50
+    """True for media modalities that ship a feature extractor /
51
+    processor alongside the tokenizer. Text-only bases set this
52
+    False — the trainer skips the BlobStore + preprocess pass."""
53
+
54
+    accepts_images: bool = False
55
+    """True for vision-language bases. Drives the ``dlm prompt
56
+    --image`` guardrail."""
57
+
58
+    accepts_audio: bool = False
59
+    """True for audio-language bases. Drives the ``dlm prompt
60
+    --audio`` guardrail."""
61
+
62
+    def load_processor(self, spec: BaseModelSpec) -> Any | None:
63
+        """Load the HF processor if this modality needs one. Text → None."""
64
+        return None
65
+
66
+    def dispatch_export(
67
+        self,
68
+        *,
69
+        store: Any,
70
+        spec: BaseModelSpec,
71
+        adapter_name: str | None,
72
+        quant: str | None,
73
+        merged: bool,
74
+        adapter_mix_raw: str | None,
75
+        gguf_emission_context: dict[str, Any] | None = None,
76
+    ) -> DispatchResult | None:
77
+        """Route an export through the modality-specific path.
78
+
79
+        Returns ``None`` on the text path — the caller falls back to
80
+        the GGUF `run_export` pipeline, which has a different result
81
+        shape (`run_export` returns `RunResult`, not `DispatchResult`,
82
+        and the text path prints its own banner inline).
83
+        """
84
+        return None
85
+
86
+
87
+class TextModality(ModalityDispatch):
88
+    """Text-only base — defaults carry the whole contract."""
89
+
90
+    modality = "text"
91
+
92
+
93
+def _unknown(mod: str) -> UnknownModalityError:
94
+    return UnknownModalityError(
95
+        f"modality={mod!r} has no registered dispatcher. "
96
+        "Register a ModalityDispatch subclass in dlm.modality and "
97
+        "add it to MODALITIES."
98
+    )
99
+
100
+
101
+def modality_for(spec: BaseModelSpec) -> ModalityDispatch:
102
+    """Resolve a spec's ``ModalityDispatch``, raising if unregistered."""
103
+    from dlm.modality import MODALITIES  # late import to avoid cycle
104
+
105
+    try:
106
+        return MODALITIES[spec.modality]
107
+    except KeyError as exc:
108
+        raise _unknown(spec.modality) from exc
src/dlm/modality/text.pyadded
@@ -0,0 +1,7 @@
1
+"""Text modality dispatch — thin re-export of the base defaults."""
2
+
3
+from __future__ import annotations
4
+
5
+from dlm.modality.registry import TextModality
6
+
7
+__all__ = ["TextModality"]
src/dlm/modality/vl.pyadded
@@ -0,0 +1,47 @@
1
+"""Vision-language modality dispatch."""
2
+
3
+from __future__ import annotations
4
+
5
+from typing import TYPE_CHECKING, Any
6
+
7
+from dlm.modality.registry import ModalityDispatch
8
+
9
+if TYPE_CHECKING:
10
+    from dlm.base_models import BaseModelSpec
11
+    from dlm.export.dispatch import DispatchResult
12
+
13
+
14
+class VisionLanguageModality(ModalityDispatch):
15
+    """VL base — images accepted, processor required, GGUF-then-snapshot export."""
16
+
17
+    modality = "vision-language"
18
+    requires_processor = True
19
+    accepts_images = True
20
+
21
+    def load_processor(self, spec: BaseModelSpec) -> Any:
22
+        from dlm.train.loader import load_processor as _load
23
+
24
+        return _load(spec)
25
+
26
+    def dispatch_export(
27
+        self,
28
+        *,
29
+        store: Any,
30
+        spec: BaseModelSpec,
31
+        adapter_name: str | None,
32
+        quant: str | None,
33
+        merged: bool,
34
+        adapter_mix_raw: str | None,
35
+        gguf_emission_context: dict[str, Any] | None = None,
36
+    ) -> DispatchResult:
37
+        from dlm.export.dispatch import dispatch_vl_export
38
+
39
+        return dispatch_vl_export(
40
+            store=store,
41
+            spec=spec,
42
+            adapter_name=adapter_name,
43
+            quant=quant,
44
+            merged=merged,
45
+            adapter_mix_raw=adapter_mix_raw,
46
+            gguf_emission_context=gguf_emission_context,
47
+        )
src/dlm/train/loader.pymodified
@@ -57,7 +57,10 @@ def load_base_model(spec: BaseModelSpec, plan: TrainingPlan) -> Any: # pragma:
5757
     if plan.use_qlora:
5858
         kwargs["quantization_config"] = _build_bnb_config(plan)
5959
 
60
-    if spec.modality == "vision-language":
60
+    from dlm.modality import modality_for
61
+
62
+    dispatch = modality_for(spec)
63
+    if dispatch.accepts_images:
6164
         # Bases with `trust_remote_code=True` often aren't registered
6265
         # with AutoModelForImageTextToText (that's the whole reason —
6366
         # their class lives in the repo, not transformers). Fall back
@@ -69,7 +72,7 @@ def load_base_model(spec: BaseModelSpec, plan: TrainingPlan) -> Any: # pragma:
6972
             return AutoModel.from_pretrained(spec.hf_id, **kwargs)
7073
         return AutoModelForImageTextToText.from_pretrained(spec.hf_id, **kwargs)
7174
 
72
-    if spec.modality == "audio-language":
75
+    if dispatch.accepts_audio:
7376
         # No AutoModelForAudioTextToText in transformers 5.x; resolve
7477
         # the class name from `spec.architecture` so adding a new audio
7578
         # base is a registry edit, not a loader patch.
src/dlm/train/trainer.pymodified
@@ -701,9 +701,12 @@ def _build_real_trainer( # pragma: no cover
701701
     # our downstream helpers and TRL's VL collator understand. Audio
702702
     # bases carry a processor too but TRL has no auto-dispatch, so the
703703
     # audio branch hands the SFTTrainer a custom `AudioLmCollator`.
704
-    is_vl = spec.modality == "vision-language"
705
-    is_audio = spec.modality == "audio-language"
706
-    is_media = is_vl or is_audio
704
+    from dlm.modality import modality_for
705
+
706
+    modality_dispatch = modality_for(spec)
707
+    is_vl = modality_dispatch.accepts_images
708
+    is_audio = modality_dispatch.accepts_audio
709
+    is_media = modality_dispatch.requires_processor
707710
     media_processor: Any | None = None
708711
     blob_store: BlobStore | None = None
709712
     image_token = "<image>"