@@ -0,0 +1,206 @@ |
| 1 | +"""Resolve a ``.dlm`` file to the artifacts sway needs. |
| 2 | + |
| 3 | +Imports ``dlm.*`` — requires the ``dlm-sway[dlm]`` extra. Everything |
| 4 | +outside this package is oblivious to dlm's internal shape; the bridge |
| 5 | +is the only place that knows, e.g., that a dlm section carries a |
| 6 | +``kind`` field named ``type`` or that adapters live at |
| 7 | +``adapter/versions/vNNNN/``. |
| 8 | +""" |
| 9 | + |
| 10 | +from __future__ import annotations |
| 11 | + |
| 12 | +import hashlib |
| 13 | +from dataclasses import dataclass |
| 14 | +from pathlib import Path |
| 15 | + |
| 16 | +from dlm_sway.core.errors import SwayError |
| 17 | +from dlm_sway.core.sections import ( |
| 18 | + Section, |
| 19 | + SectionKind, |
| 20 | + SectionPreference, |
| 21 | + SectionProbe, |
| 22 | +) |
| 23 | + |
| 24 | + |
| 25 | +@dataclass(frozen=True, slots=True) |
| 26 | +class DlmHandle: |
| 27 | + """Everything the sway bridge pulls out of a ``.dlm`` file. |
| 28 | + |
| 29 | + Attributes |
| 30 | + ---------- |
| 31 | + dlm_id: |
| 32 | + Stable identifier from the frontmatter. |
| 33 | + base_model: |
| 34 | + Either a HF id (``qwen2.5-1.5b``) or an ``hf:org/name`` escape |
| 35 | + hatch, taken verbatim from the frontmatter. |
| 36 | + adapter_path: |
| 37 | + Directory containing the current trained PEFT adapter (resolved |
| 38 | + via dlm's own ``StorePath.for_dlm``). ``None`` if the document |
| 39 | + hasn't been trained yet. |
| 40 | + sections: |
| 41 | + Typed sections ready for sway's probes. |
| 42 | + doc_text: |
| 43 | + Concatenated raw content of all sections. Used by probes that |
| 44 | + need a whole-document stylistic reference (C1). |
| 45 | + """ |
| 46 | + |
| 47 | + dlm_id: str |
| 48 | + base_model: str |
| 49 | + adapter_path: Path | None |
| 50 | + sections: tuple[Section, ...] |
| 51 | + doc_text: str |
| 52 | + |
| 53 | + |
def resolve_dlm(dlm_path: Path) -> DlmHandle:
    """Parse ``dlm_path`` with dlm's parser and build a :class:`DlmHandle`.

    Raises :class:`~dlm_sway.core.errors.SwayError` when the ``dlm``
    package cannot be imported. Exceptions raised by dlm's own parser
    are not wrapped here and propagate to the caller unchanged. A
    document whose adapter cannot be located is not an error: the
    returned handle simply carries ``adapter_path=None``.
    """
    # Imported lazily so the rest of sway works without the [dlm] extra.
    try:
        from dlm.doc.parser import parse_file as dlm_parse_file
    except ImportError as exc:
        raise SwayError("dlm package not installed — run: pip install 'dlm-sway[dlm]'") from exc

    document = dlm_parse_file(dlm_path)
    frontmatter = document.frontmatter

    translated = tuple(map(_translate_section, document.sections))
    # Whole-document text: section contents separated by blank lines.
    joined_text = "\n\n".join(section.content for section in translated)
    located_adapter = _resolve_adapter_path(frontmatter.dlm_id)

    return DlmHandle(
        dlm_id=frontmatter.dlm_id,
        base_model=frontmatter.base_model,
        adapter_path=located_adapter,
        sections=translated,
        doc_text=joined_text,
    )
| 80 | + |
| 81 | + |
def _adapter_via_store_api(dlm_id: str) -> Path | None:
    """Ask dlm's ``StorePath`` helper for the current adapter directory."""
    try:
        from dlm.store.paths import StorePath
    except ImportError:
        return None

    try:
        store = StorePath.for_dlm(dlm_id)
    except Exception:  # noqa: BLE001 — unknown dlm exception shapes
        return None
    try:
        resolved = store.resolve_current_adapter()
    except (AttributeError, FileNotFoundError):
        return None
    if resolved is None:
        return None
    # Coerce before probing the filesystem so a plain-string result works too.
    adapter_dir = Path(resolved)
    return adapter_dir if adapter_dir.exists() else None


def _adapter_via_pointer_file(dlm_id: str) -> Path | None:
    """Follow ``<DLM_HOME>/store/<dlm_id>/adapter/current.txt`` by hand."""
    import os

    home = Path(os.environ.get("DLM_HOME", "~/.dlm")).expanduser()
    pointer_file = home / "store" / dlm_id / "adapter" / "current.txt"
    try:
        pointer = pointer_file.read_text(encoding="utf-8").strip()
    except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
        return None
    # An empty pointer would otherwise resolve to the ``adapter/`` directory
    # itself, which always exists and is not a trained adapter version.
    if not pointer:
        return None
    candidate = (pointer_file.parent / pointer).resolve()
    return candidate if candidate.exists() else None


def _resolve_adapter_path(dlm_id: str) -> Path | None:
    """Locate the current adapter directory for ``dlm_id``.

    First asks dlm's ``StorePath`` helper. If that helper is missing,
    raises, or points at a path that does not exist, falls back to
    reading the ``<DLM_HOME>/store/<dlm_id>/adapter/current.txt``
    pointer (``DLM_HOME`` defaults to ``~/.dlm``), whose content is
    taken as a path relative to the ``adapter/`` directory.

    Returns ``None`` if neither route yields an existing path, e.g.
    because no adapter has been trained yet.
    """
    via_api = _adapter_via_store_api(dlm_id)
    if via_api is not None:
        return via_api
    # Manual fallback in case the dlm API evolves or is unavailable.
    return _adapter_via_pointer_file(dlm_id)
| 119 | + |
| 120 | + |
def _translate_section(dlm_section: object) -> Section:
    """Adapt a ``dlm.doc.sections.Section`` to sway's section type.

    Every field is read with ``getattr`` and a default so that a minor
    dlm refactor degrades gracefully instead of raising:

    * the kind is read from ``kind``, falling back to ``type``;
    * a missing or ``None`` ``content`` becomes the empty string;
    * the id is ``section_id``, else ``id``, else a hash of the content;
    * ``tag`` is kept only when it is a string.

    Q/A probes are extracted for instruction sections and preference
    triples for preference sections; all other kinds carry neither.
    """
    # Accept either attribute name for the section kind.
    kind_raw = getattr(dlm_section, "kind", None)
    if kind_raw is None:
        kind_raw = getattr(dlm_section, "type", None)
    kind = _normalize_kind(kind_raw)

    # Guard against ``None`` so it is not stringified to the text "None".
    raw_content = getattr(dlm_section, "content", None)
    content = "" if raw_content is None else str(raw_content)

    section_id = str(
        getattr(dlm_section, "section_id", None)
        or getattr(dlm_section, "id", None)
        or _content_hash(content)
    )
    tag = getattr(dlm_section, "tag", None)

    probes: tuple[SectionProbe, ...] = ()
    preferences: tuple[SectionPreference, ...] = ()
    if kind == "instruction":
        probes = tuple(_extract_instruction_probes(dlm_section))
    elif kind == "preference":
        preferences = tuple(_extract_preference_triples(dlm_section))

    return Section(
        id=section_id,
        kind=kind,
        content=content,
        probes=probes,
        preferences=preferences,
        tag=tag if isinstance(tag, str) else None,
    )
| 154 | + |
| 155 | + |
def _normalize_kind(raw: object) -> SectionKind:
    """Collapse dlm's section kind (enum member or string) to sway's kind.

    The value is stringified and lower-cased, then matched by substring
    in the order prose, instruction, preference. ``None`` and anything
    unrecognised are treated as ``"prose"``.
    """
    # An empty string matches nothing below and so lands on the default.
    text = "" if raw is None else str(raw).lower()
    if "prose" in text:
        return "prose"
    if "instruction" in text:
        return "instruction"
    return "preference" if "preference" in text else "prose"
| 169 | + |
| 170 | + |
def _first_text_attr(obj: object, *names: str) -> str:
    """Return the first non-``None`` attribute among ``names`` as ``str``, else ``""``."""
    for name in names:
        value = getattr(obj, name, None)
        if value is not None:
            return str(value)
    return ""


def _extract_instruction_probes(dlm_section: object) -> list[SectionProbe]:
    """Pull (Q, A) pairs out of a dlm INSTRUCTION section.

    The pairs are read from the section's ``probes`` attribute, or from
    ``qa`` when ``probes`` is missing or empty. For each pair the prompt
    comes from ``prompt`` (else ``question``) and the answer from
    ``gold`` (else ``answer``).

    Pairs with a missing, ``None`` or empty prompt or answer are
    skipped, so a ``None`` field never turns into the literal text
    ``"None"``. Returns an empty list when the section has no pairs.
    """
    raw_probes = getattr(dlm_section, "probes", None) or getattr(dlm_section, "qa", None)
    if not raw_probes:
        return []
    out: list[SectionProbe] = []
    for rp in raw_probes:
        q = _first_text_attr(rp, "prompt", "question")
        a = _first_text_attr(rp, "gold", "answer")
        if q and a:
            out.append(SectionProbe(prompt=q, gold=a))
    return out
| 188 | + |
| 189 | + |
def _extract_preference_triples(dlm_section: object) -> list[SectionPreference]:
    """Pull (prompt, chosen, rejected) triples out of a dlm PREFERENCE section.

    The triples are read from the section's ``preferences`` attribute,
    or from ``triples`` when ``preferences`` is missing or empty.

    A triple is kept only when all three fields are present and
    non-empty. A field that is ``None`` counts as missing rather than
    being stringified to ``"None"``. Returns an empty list when the
    section has no triples.
    """
    raw = getattr(dlm_section, "preferences", None) or getattr(dlm_section, "triples", None)
    if not raw:
        return []
    out: list[SectionPreference] = []
    for r in raw:
        fields = [getattr(r, name, None) for name in ("prompt", "chosen", "rejected")]
        if any(value is None for value in fields):
            continue
        p, c, rej = (str(value) for value in fields)
        if p and c and rej:
            out.append(SectionPreference(prompt=p, chosen=c, rejected=rej))
    return out
| 203 | + |
| 204 | + |
def _content_hash(content: str) -> str:
    """Return the first 16 hex characters of the SHA-256 of ``content`` (UTF-8)."""
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:16]