| 1 | """Resolve a ``.dlm`` file to the artifacts sway needs. |
| 2 | |
| 3 | Imports ``dlm.*`` — requires the ``dlm-sway[dlm]`` extra. Everything |
| 4 | outside this package is oblivious to dlm's internal shape; the bridge |
| 5 | is the only place that knows, e.g., that a dlm section carries a |
| 6 | ``kind`` field named ``type`` or that adapters live at |
| 7 | ``adapter/versions/vNNNN/``. |
| 8 | """ |
| 9 | |
| 10 | from __future__ import annotations |
| 11 | |
| 12 | import hashlib |
| 13 | from dataclasses import dataclass |
| 14 | from pathlib import Path |
| 15 | |
| 16 | from dlm_sway.core.errors import DlmCompatError, SwayError |
| 17 | from dlm_sway.core.sections import ( |
| 18 | Section, |
| 19 | SectionKind, |
| 20 | SectionPreference, |
| 21 | SectionProbe, |
| 22 | ) |
| 23 | |
| 24 | |
| 25 | def _installed_dlm_version() -> str | None: |
| 26 | """Best-effort lookup of the installed ``dlm`` package version. |
| 27 | |
| 28 | Returns ``None`` when dlm isn't installed or metadata is missing; |
| 29 | the returned string is informational (attached to |
| 30 | ``DlmCompatError`` messages), never used for programmatic |
| 31 | branching. |
| 32 | """ |
| 33 | try: |
| 34 | from importlib.metadata import version |
| 35 | |
| 36 | return version("dlm") |
| 37 | except Exception: # noqa: BLE001 — metadata lookup is best-effort |
| 38 | return None |
| 39 | |
| 40 | |
| 41 | @dataclass(frozen=True, slots=True) |
| 42 | class DlmHandle: |
| 43 | """Everything the sway bridge pulls out of a ``.dlm`` file. |
| 44 | |
| 45 | Attributes |
| 46 | ---------- |
| 47 | dlm_id: |
| 48 | Stable identifier from the frontmatter. |
| 49 | base_model: |
| 50 | Either a HF id (``qwen2.5-1.5b``) or an ``hf:org/name`` escape |
| 51 | hatch, taken verbatim from the frontmatter. |
| 52 | adapter_path: |
| 53 | Directory containing the current trained PEFT adapter (resolved |
| 54 | via dlm's own ``StorePath.for_dlm``). ``None`` if the document |
| 55 | hasn't been trained yet. |
| 56 | sections: |
| 57 | Typed sections ready for sway's probes. |
| 58 | doc_text: |
| 59 | Concatenated raw content of all sections. Used by probes that |
| 60 | need a whole-document stylistic reference (C1). |
| 61 | """ |
| 62 | |
| 63 | dlm_id: str |
| 64 | base_model: str |
| 65 | adapter_path: Path | None |
| 66 | sections: tuple[Section, ...] |
| 67 | doc_text: str |
| 68 | |
| 69 | |
def resolve_dlm(dlm_path: Path) -> DlmHandle:
    """Parse the ``.dlm`` file at ``dlm_path`` into a :class:`DlmHandle`.

    The file is parsed with dlm's own ``dlm.doc.parser.parse_file``; each
    parsed section is translated to sway's section type, the base model
    is resolved to a HuggingFace id, and the current adapter directory
    is looked up from the document's ``dlm_id``.

    Raises
    ------
    SwayError
        If the ``dlm`` parser cannot be imported (the ``dlm-sway[dlm]``
        extra is not installed).

    Notes
    -----
    Exceptions raised by dlm's parser are not wrapped here and propagate
    unchanged. A document without a locatable adapter is not an error:
    the returned handle simply carries ``adapter_path=None``.
    """
    try:
        from dlm.doc.parser import parse_file as dlm_parse_file
    except ImportError as exc:
        raise SwayError("dlm package not installed — run: pip install 'dlm-sway[dlm]'") from exc

    document = dlm_parse_file(dlm_path)
    frontmatter = document.frontmatter

    translated = tuple(map(_translate_section, document.sections))
    joined_text = "\n\n".join(section.content for section in translated)

    # Adapter lookup runs before base-model resolution, so a compat
    # error from the latter surfaces only after the former succeeded.
    adapter_dir = _resolve_adapter_path(frontmatter.dlm_id)
    hf_model_id = _resolve_base_model_to_hf_id(frontmatter.base_model)

    return DlmHandle(
        dlm_id=frontmatter.dlm_id,
        base_model=hf_model_id,
        adapter_path=adapter_dir,
        sections=translated,
        doc_text=joined_text,
    )
| 97 | |
| 98 | |
| 99 | def _resolve_base_model_to_hf_id(base_model: str) -> str: |
| 100 | """Translate dlm's base-model *key* to a HuggingFace repo id. |
| 101 | |
| 102 | dlm's frontmatter stores registry keys like ``smollm2-135m`` which |
| 103 | resolve to ``HuggingFaceTB/SmolLM2-135M-Instruct``. sway's backends |
| 104 | call ``AutoModelForCausalLM.from_pretrained`` directly and need the |
| 105 | HF id. The ``hf:org/name`` escape hatch passes through unchanged. |
| 106 | |
| 107 | Behavior when ``dlm`` is not installed: return the raw key |
| 108 | unchanged; the downstream backend load will surface a clean "not |
| 109 | a valid HF id" error. Behavior when ``dlm`` is installed but its |
| 110 | public surface drifts (e.g. ``.hf_id`` renamed to ``.repo_id``): |
| 111 | raise :class:`DlmCompatError` — silent fallback here would hand |
| 112 | the backend the raw registry key and produce a confusing |
| 113 | "model not found" error far from the root cause. |
| 114 | """ |
| 115 | if base_model.startswith("hf:"): |
| 116 | return base_model[len("hf:") :] |
| 117 | try: |
| 118 | from dlm.base_models import resolve as resolve_base |
| 119 | except ImportError: |
| 120 | return base_model |
| 121 | try: |
| 122 | spec = resolve_base(base_model) |
| 123 | except Exception as exc: # noqa: BLE001 — unknown dlm errors |
| 124 | raise DlmCompatError( |
| 125 | f"dlm.base_models.resolve({base_model!r}) raised {type(exc).__name__}: {exc}", |
| 126 | installed_dlm_version=_installed_dlm_version(), |
| 127 | ) from exc |
| 128 | if not hasattr(spec, "hf_id"): |
| 129 | raise DlmCompatError( |
| 130 | f"dlm.base_models.resolve({base_model!r}) returned " |
| 131 | f"{type(spec).__name__} without the expected 'hf_id' attribute " |
| 132 | f"(attrs seen: {sorted(a for a in dir(spec) if not a.startswith('_'))[:8]!r})", |
| 133 | installed_dlm_version=_installed_dlm_version(), |
| 134 | ) |
| 135 | hf_id = spec.hf_id |
| 136 | return str(hf_id) if hf_id else base_model |
| 137 | |
| 138 | |
| 139 | def _resolve_adapter_path(dlm_id: str) -> Path | None: |
| 140 | """Locate the current adapter directory for ``dlm_id``. |
| 141 | |
| 142 | Uses dlm's module-level ``for_dlm`` helper if available, else falls |
| 143 | back to the canonical ``~/.dlm/store/<dlm_id>/adapter/current.txt`` |
| 144 | pointer. Returns ``None`` if no adapter has been trained yet. |
| 145 | """ |
| 146 | # Primary path: use dlm's own store-path helpers. |
| 147 | try: |
| 148 | from dlm.store.paths import for_dlm as _for_dlm |
| 149 | except ImportError: |
| 150 | _for_dlm = None |
| 151 | |
| 152 | if _for_dlm is not None: |
| 153 | try: |
| 154 | store = _for_dlm(dlm_id) |
| 155 | except Exception: # noqa: BLE001 — unknown dlm exception shapes |
| 156 | store = None |
| 157 | if store is not None: |
| 158 | try: |
| 159 | resolved = store.resolve_current_adapter() |
| 160 | except (AttributeError, FileNotFoundError): |
| 161 | resolved = None |
| 162 | if resolved is not None and Path(resolved).exists(): |
| 163 | return Path(resolved) |
| 164 | |
| 165 | # Manual fallback. The ``current.txt`` pointer is relative to the |
| 166 | # **store root**, not to current.txt's parent dir — so go up one level. |
| 167 | import os |
| 168 | |
| 169 | home = Path(os.environ.get("DLM_HOME", "~/.dlm")).expanduser() |
| 170 | store_root = home / "store" / dlm_id |
| 171 | current_file = store_root / "adapter" / "current.txt" |
| 172 | if current_file.exists(): |
| 173 | pointer = current_file.read_text(encoding="utf-8").strip() |
| 174 | candidate = (store_root / pointer).resolve() |
| 175 | if candidate.exists(): |
| 176 | return candidate |
| 177 | return None |
| 178 | |
| 179 | |
def _translate_section(dlm_section: object) -> Section:
    """Convert one dlm section object into sway's :class:`Section`.

    The dlm object is read purely through ``getattr`` so that small
    differences in its shape are tolerated:

    * kind — taken from ``type``, falling back to ``kind``, then
      normalized via ``_normalize_kind``;
    * content — ``content`` coerced to ``str`` (empty when absent);
    * id — the first truthy value among ``section_id`` and ``id``,
      else a hash of the content;
    * tag — kept only when it is a string.

    Instruction and preference bodies are raw markdown on the dlm side;
    they are expanded into probes / preferences by the module's
    ``_parse_instruction`` and ``_parse_preference`` helpers.
    """
    # dlm's current attribute is ``type``; older revisions used ``kind``.
    legacy_kind = getattr(dlm_section, "kind", None)
    kind = _normalize_kind(getattr(dlm_section, "type", legacy_kind))

    content = str(getattr(dlm_section, "content", ""))

    declared_id = getattr(dlm_section, "section_id", None) or getattr(dlm_section, "id", None)
    section_id = str(declared_id if declared_id else _content_hash(content))

    raw_tag = getattr(dlm_section, "tag", None)

    # Only the matching section kind gets its body parsed.
    probes: tuple[SectionProbe, ...] = (
        tuple(_parse_instruction(content, section_id=section_id)) if kind == "instruction" else ()
    )
    preferences: tuple[SectionPreference, ...] = (
        tuple(_parse_preference(content, section_id=section_id)) if kind == "preference" else ()
    )

    return Section(
        id=section_id,
        kind=kind,
        content=content,
        probes=probes,
        preferences=preferences,
        tag=raw_tag if isinstance(raw_tag, str) else None,
    )
| 215 | |
| 216 | |
| 217 | def _normalize_kind(raw: object) -> SectionKind: |
| 218 | """Map dlm's SectionType/str to sway's lowercase kind.""" |
| 219 | if raw is None: |
| 220 | return "prose" |
| 221 | value = str(raw).lower() |
| 222 | # dlm uses uppercase StrEnum values like "PROSE"; normalize. |
| 223 | if value.endswith("prose") or "prose" in value: |
| 224 | return "prose" |
| 225 | if "instruction" in value: |
| 226 | return "instruction" |
| 227 | if "preference" in value: |
| 228 | return "preference" |
| 229 | return "prose" |
| 230 | |
| 231 | |
| 232 | def _parse_instruction(content: str, *, section_id: str) -> list[SectionProbe]: |
| 233 | """Pull (Q, A) pairs out of a dlm INSTRUCTION section body. |
| 234 | |
| 235 | Delegates to dlm's own ``parse_instruction_body`` so syntax additions |
| 236 | land in sway without code changes here. Falls back to an empty list |
| 237 | on parse errors — the probe will fail gracefully. |
| 238 | """ |
| 239 | try: |
| 240 | from dlm.data.instruction_parser import parse_instruction_body |
| 241 | except ImportError: |
| 242 | return [] |
| 243 | try: |
| 244 | pairs = parse_instruction_body(content, section_id=section_id) |
| 245 | except Exception: # noqa: BLE001 — dlm raises InstructionParseError |
| 246 | return [] |
| 247 | out: list[SectionProbe] = [] |
| 248 | for p in pairs: |
| 249 | q = getattr(p, "question", getattr(p, "prompt", "")) |
| 250 | a = getattr(p, "answer", getattr(p, "gold", "")) |
| 251 | if q and a: |
| 252 | out.append(SectionProbe(prompt=str(q), gold=str(a))) |
| 253 | return out |
| 254 | |
| 255 | |
| 256 | def _parse_preference(content: str, *, section_id: str) -> list[SectionPreference]: |
| 257 | """Pull (prompt, chosen, rejected) triples out of a PREFERENCE body.""" |
| 258 | try: |
| 259 | from dlm.data.preference_parser import parse_preference_body |
| 260 | except ImportError: |
| 261 | return [] |
| 262 | try: |
| 263 | triples = parse_preference_body(content, section_id=section_id) |
| 264 | except Exception: # noqa: BLE001 — dlm raises PreferenceParseError |
| 265 | return [] |
| 266 | out: list[SectionPreference] = [] |
| 267 | for t in triples: |
| 268 | p = str(getattr(t, "prompt", "")) |
| 269 | c = str(getattr(t, "chosen", "")) |
| 270 | rej = str(getattr(t, "rejected", "")) |
| 271 | if p and c and rej: |
| 272 | out.append(SectionPreference(prompt=p, chosen=c, rejected=rej)) |
| 273 | return out |
| 274 | |
| 275 | |
| 276 | def _content_hash(content: str) -> str: |
| 277 | return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16] |