Record preference mine metrics

- SHA: 35c558faed456e06672a25accc894e3f9df52537 (35c558f)
- Parent: dcb09c5
- Tree: 6f96a60

| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/cli/commands.py | 19 | 0 |
| M | src/dlm/metrics/__init__.py | 2 | 0 |
| M | src/dlm/metrics/db.py | 12 | 0 |
| M | src/dlm/metrics/events.py | 18 | 0 |
| M | src/dlm/metrics/queries.py | 62 | 0 |
| M | src/dlm/metrics/recorder.py | 20 | 0 |
| M | tests/unit/cli/test_preference_cmd.py | 9 | 0 |
| M | tests/unit/metrics/test_db_schema.py | 10 | 1 |
| M | tests/unit/metrics/test_queries.py | 60 | 1 |
| M | tests/unit/metrics/test_recorder.py | 33 | 1 |
#### src/dlm/cli/commands.py (modified)

@@ -3734,6 +3734,8 @@ def preference_mine_cmd(
| 3734 | 3734 | build_backend, |
| 3735 | 3735 | select_backend, |
| 3736 | 3736 | ) |
| 3737 | + from dlm.metrics import MetricsRecorder, PreferenceMineEvent | |
| 3738 | + from dlm.metrics.events import PreferenceMineWriteMode | |
| 3737 | 3739 | from dlm.modality import modality_for |
| 3738 | 3740 | from dlm.preference import ( |
| 3739 | 3741 | InvalidJudgeSpecError, |
@@ -3858,10 +3860,25 @@ def preference_mine_cmd( | ||
| 3858 | 3860 | finally: |
| 3859 | 3861 | backend_obj.unload() |
| 3860 | 3862 | |
| 3863 | + recorder = MetricsRecorder(store.root) | |
| 3864 | + | |
| 3865 | + def _record_preference_mine(write_mode: PreferenceMineWriteMode) -> None: | |
| 3866 | + recorder.record_preference_mine( | |
| 3867 | + PreferenceMineEvent( | |
| 3868 | + run_id=run_id, | |
| 3869 | + judge_name=judge_obj.name, | |
| 3870 | + sample_count=samples, | |
| 3871 | + mined_pairs=len(plan.additions), | |
| 3872 | + skipped_prompts=len(plan.skipped), | |
| 3873 | + write_mode=write_mode, | |
| 3874 | + ) | |
| 3875 | + ) | |
| 3876 | + | |
| 3861 | 3877 | out_console.print(render_mine_plan(plan)) |
| 3862 | 3878 | |
| 3863 | 3879 | if not plan.additions: |
| 3864 | 3880 | clear_pending_plan(store) |
| 3881 | + _record_preference_mine("empty") | |
| 3865 | 3882 | out_console.print( |
| 3866 | 3883 | "\n[yellow]no candidates to mine[/yellow] — either instruction prompts " |
| 3867 | 3884 | "did not yield a confident pair, or the matching preference sections " |
@@ -3877,6 +3894,7 @@ def preference_mine_cmd( | ||
| 3877 | 3894 | out_console.print(render_apply_plan(apply_plan)) |
| 3878 | 3895 | summary = apply_preference_plan(parsed, apply_plan, target=path) |
| 3879 | 3896 | clear_pending_plan(store) |
| 3897 | + _record_preference_mine("applied") | |
| 3880 | 3898 | out_console.print( |
| 3881 | 3899 | f"\n[green]preference:[/green] wrote {summary.added} section(s) to {path} " |
| 3882 | 3900 | f"({summary.skipped} skipped)" |
@@ -3884,6 +3902,7 @@ def preference_mine_cmd( | ||
| 3884 | 3902 | return |
| 3885 | 3903 | |
| 3886 | 3904 | pending = save_pending_plan(store, source_path=path.resolve(), sections=sections) |
| 3905 | + _record_preference_mine("staged") | |
| 3887 | 3906 | out_console.print( |
| 3888 | 3907 | f"\n[green]preference:[/green] staged {len(pending.sections)} mined preference " |
| 3889 | 3908 | f"section(s). Run [bold]dlm preference apply {path}[/bold] to write them." |
#### src/dlm/metrics/__init__.py (modified)

@@ -12,6 +12,7 @@ from dlm.metrics.events import (
| 12 | 12 | EvalEvent, |
| 13 | 13 | ExportEvent, |
| 14 | 14 | Phase, |
| 15 | + PreferenceMineEvent, | |
| 15 | 16 | RunEnd, |
| 16 | 17 | RunStart, |
| 17 | 18 | Status, |
@@ -28,6 +29,7 @@ __all__ = [ | ||
| 28 | 29 | "MetricsRecorder", |
| 29 | 30 | "MetricsSchemaError", |
| 30 | 31 | "Phase", |
| 32 | + "PreferenceMineEvent", | |
| 31 | 33 | "RunEnd", |
| 32 | 34 | "RunStart", |
| 33 | 35 | "Status", |
#### src/dlm/metrics/db.py (modified)

@@ -96,6 +96,18 @@ _SCHEMA_SQL = [
| 96 | 96 | PRIMARY KEY (run_id, adapter_name) |
| 97 | 97 | ) |
| 98 | 98 | """, |
| 99 | + """ | |
| 100 | + CREATE TABLE IF NOT EXISTS preference_mining ( | |
| 101 | + event_id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| 102 | + run_id INTEGER NOT NULL, | |
| 103 | + judge_name TEXT NOT NULL, | |
| 104 | + sample_count INTEGER NOT NULL, | |
| 105 | + mined_pairs INTEGER NOT NULL, | |
| 106 | + skipped_prompts INTEGER NOT NULL, | |
| 107 | + write_mode TEXT NOT NULL, | |
| 108 | + at TEXT NOT NULL | |
| 109 | + ) | |
| 110 | + """, | |
| 99 | 111 | ] |
| 100 | 112 | |
| 101 | 113 | |
#### src/dlm/metrics/events.py (modified)

@@ -12,6 +12,7 @@ from typing import Literal
| 12 | 12 | |
| 13 | 13 | Phase = Literal["sft", "dpo", "orpo", "cpt"] |
| 14 | 14 | Status = Literal["running", "ok", "failed", "cancelled"] |
| 15 | +PreferenceMineWriteMode = Literal["staged", "applied", "empty"] | |
| 15 | 16 | |
| 16 | 17 | |
| 17 | 18 | def _utc_iso() -> str: |
@@ -130,6 +131,23 @@ class GateEvent: | ||
| 130 | 131 | object.__setattr__(self, "at", _utc_iso()) |
| 131 | 132 | |
| 132 | 133 | |
| 134 | +@dataclass(frozen=True) | |
| 135 | +class PreferenceMineEvent: | |
| 136 | + """Emitted from `dlm preference mine` after judging completes.""" | |
| 137 | + | |
| 138 | + run_id: int | |
| 139 | + judge_name: str | |
| 140 | + sample_count: int | |
| 141 | + mined_pairs: int | |
| 142 | + skipped_prompts: int | |
| 143 | + write_mode: PreferenceMineWriteMode | |
| 144 | + at: str = "" | |
| 145 | + | |
| 146 | + def __post_init__(self) -> None: | |
| 147 | + if not self.at: | |
| 148 | + object.__setattr__(self, "at", _utc_iso()) | |
| 149 | + | |
| 150 | + | |
| 133 | 151 | @dataclass(frozen=True) |
| 134 | 152 | class ExportEvent: |
| 135 | 153 | """Emitted from `dlm export` on completion.""" |
#### src/dlm/metrics/queries.py (modified)

@@ -70,6 +70,20 @@ class TokenizationRow:
| 70 | 70 | return self.cache_hits / total if total else 0.0 |
| 71 | 71 | |
| 72 | 72 | |
| 73 | +@dataclass(frozen=True) | |
| 74 | +class PreferenceMineRow: | |
| 75 | + """One row from the `preference_mining` table.""" | |
| 76 | + | |
| 77 | + event_id: int | |
| 78 | + run_id: int | |
| 79 | + judge_name: str | |
| 80 | + sample_count: int | |
| 81 | + mined_pairs: int | |
| 82 | + skipped_prompts: int | |
| 83 | + write_mode: str | |
| 84 | + at: str | |
| 85 | + | |
| 86 | + | |
| 73 | 87 | def recent_runs( |
| 74 | 88 | store_root: Path, |
| 75 | 89 | *, |
@@ -170,6 +184,37 @@ def latest_tokenization(store_root: Path) -> TokenizationRow | None: | ||
| 170 | 184 | return TokenizationRow(*row) |
| 171 | 185 | |
| 172 | 186 | |
| 187 | +def preference_mining_for_run(store_root: Path, run_id: int) -> list[PreferenceMineRow]: | |
| 188 | + """All preference-mine events for `run_id`, oldest first.""" | |
| 189 | + try: | |
| 190 | + with connect(store_root) as conn: | |
| 191 | + rows = conn.execute( | |
| 192 | + "SELECT event_id, run_id, judge_name, sample_count, mined_pairs, " | |
| 193 | + "skipped_prompts, write_mode, at " | |
| 194 | + "FROM preference_mining WHERE run_id = ? ORDER BY event_id ASC", | |
| 195 | + (run_id,), | |
| 196 | + ).fetchall() | |
| 197 | + except sqlite3.Error: | |
| 198 | + return [] | |
| 199 | + return [PreferenceMineRow(*row) for row in rows] | |
| 200 | + | |
| 201 | + | |
| 202 | +def latest_preference_mining(store_root: Path) -> PreferenceMineRow | None: | |
| 203 | + """The most-recent preference-mine row, or None when absent.""" | |
| 204 | + try: | |
| 205 | + with connect(store_root) as conn: | |
| 206 | + row = conn.execute( | |
| 207 | + "SELECT event_id, run_id, judge_name, sample_count, mined_pairs, " | |
| 208 | + "skipped_prompts, write_mode, at " | |
| 209 | + "FROM preference_mining ORDER BY event_id DESC LIMIT 1" | |
| 210 | + ).fetchone() | |
| 211 | + except sqlite3.Error: | |
| 212 | + return None | |
| 213 | + if row is None: | |
| 214 | + return None | |
| 215 | + return PreferenceMineRow(*row) | |
| 216 | + | |
| 217 | + | |
| 173 | 218 | @dataclass(frozen=True) |
| 174 | 219 | class GateEventRow: |
| 175 | 220 | """One row of the gate_events table (per-run per-adapter).""" |
@@ -267,3 +312,20 @@ def evals_to_dict(evals: list[EvalRow]) -> list[dict[str, Any]]: | ||
| 267 | 312 | } |
| 268 | 313 | for e in evals |
| 269 | 314 | ] |
| 315 | + | |
| 316 | + | |
| 317 | +def preference_mining_to_dict(rows: list[PreferenceMineRow]) -> list[dict[str, Any]]: | |
| 318 | + """JSON-serializable view used by `dlm metrics --json` and `dlm show --json`.""" | |
| 319 | + return [ | |
| 320 | + { | |
| 321 | + "event_id": row.event_id, | |
| 322 | + "run_id": row.run_id, | |
| 323 | + "judge_name": row.judge_name, | |
| 324 | + "sample_count": row.sample_count, | |
| 325 | + "mined_pairs": row.mined_pairs, | |
| 326 | + "skipped_prompts": row.skipped_prompts, | |
| 327 | + "write_mode": row.write_mode, | |
| 328 | + "at": row.at, | |
| 329 | + } | |
| 330 | + for row in rows | |
| 331 | + ] | |
#### src/dlm/metrics/recorder.py (modified)

@@ -38,6 +38,7 @@ from dlm.metrics.events import (
| 38 | 38 | EvalEvent, |
| 39 | 39 | ExportEvent, |
| 40 | 40 | GateEvent, |
| 41 | + PreferenceMineEvent, | |
| 41 | 42 | RunEnd, |
| 42 | 43 | RunStart, |
| 43 | 44 | StepEvent, |
@@ -195,6 +196,25 @@ class MetricsRecorder: | ||
| 195 | 196 | |
| 196 | 197 | self._with_conn(_do, failure_key="gate", hard_fail=False) |
| 197 | 198 | |
| 199 | + def record_preference_mine(self, event: PreferenceMineEvent) -> None: | |
| 200 | + def _do(conn: sqlite3.Connection) -> None: | |
| 201 | + conn.execute( | |
| 202 | + "INSERT INTO preference_mining " | |
| 203 | + "(run_id, judge_name, sample_count, mined_pairs, skipped_prompts, write_mode, at) " | |
| 204 | + "VALUES (?, ?, ?, ?, ?, ?, ?)", | |
| 205 | + ( | |
| 206 | + event.run_id, | |
| 207 | + event.judge_name, | |
| 208 | + event.sample_count, | |
| 209 | + event.mined_pairs, | |
| 210 | + event.skipped_prompts, | |
| 211 | + event.write_mode, | |
| 212 | + event.at, | |
| 213 | + ), | |
| 214 | + ) | |
| 215 | + | |
| 216 | + self._with_conn(_do, failure_key="preference_mine", hard_fail=False) | |
| 217 | + | |
| 198 | 218 | def record_export(self, event: ExportEvent) -> None: |
| 199 | 219 | def _do(conn: sqlite3.Connection) -> None: |
| 200 | 220 | conn.execute( |
#### tests/unit/cli/test_preference_cmd.py (modified)

@@ -13,6 +13,7 @@ from typer.testing import CliRunner
| 13 | 13 | from dlm.base_models import BaseModelSpec |
| 14 | 14 | from dlm.cli.app import app |
| 15 | 15 | from dlm.doc.parser import parse_file |
| 16 | +from dlm.metrics.queries import preference_mining_for_run | |
| 16 | 17 | from dlm.preference.judge import PairScore |
| 17 | 18 | from dlm.preference.pending import load_pending_plan |
| 18 | 19 | from dlm.store.manifest import Manifest, TrainingRunSummary, save_manifest |
@@ -163,6 +164,14 @@ class TestPreferenceCmd: | ||
| 163 | 164 | assert len(pending.sections) == 1 |
| 164 | 165 | assert pending.sections[0].auto_mined is True |
| 165 | 166 | |
| 167 | + rows = preference_mining_for_run(for_dlm(_DLM_ID, home=home).root, run_id=7) | |
| 168 | + assert len(rows) == 1 | |
| 169 | + assert rows[0].judge_name == "stub:judge" | |
| 170 | + assert rows[0].sample_count == 2 | |
| 171 | + assert rows[0].mined_pairs == 1 | |
| 172 | + assert rows[0].skipped_prompts == 0 | |
| 173 | + assert rows[0].write_mode == "staged" | |
| 174 | + | |
| 166 | 175 | def test_apply_writes_staged_preferences_and_clears_pending( |
| 167 | 176 | self, |
| 168 | 177 | tmp_path: Path, |
#### tests/unit/metrics/test_db_schema.py (modified)

@@ -22,7 +22,16 @@ class TestConnect:
| 22 | 22 | tables = { |
| 23 | 23 | row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'") |
| 24 | 24 | } |
| 25 | - assert tables == {"runs", "steps", "evals", "exports", "tokenization", "gate_events"} | |
| 25 | + user_tables = {table for table in tables if not table.startswith("sqlite_")} | |
| 26 | + assert user_tables == { | |
| 27 | + "runs", | |
| 28 | + "steps", | |
| 29 | + "evals", | |
| 30 | + "exports", | |
| 31 | + "tokenization", | |
| 32 | + "gate_events", | |
| 33 | + "preference_mining", | |
| 34 | + } | |
| 26 | 35 | |
| 27 | 36 | def test_wal_mode_enabled(self, tmp_path: Path) -> None: |
| 28 | 37 | with connect(tmp_path) as conn: |
#### tests/unit/metrics/test_queries.py (modified)

@@ -5,11 +5,14 @@ from __future__ import annotations
| 5 | 5 | from datetime import UTC, datetime, timedelta |
| 6 | 6 | from pathlib import Path |
| 7 | 7 | |
| 8 | -from dlm.metrics.events import EvalEvent, RunEnd, RunStart, StepEvent | |
| 8 | +from dlm.metrics.events import EvalEvent, PreferenceMineEvent, RunEnd, RunStart, StepEvent | |
| 9 | 9 | from dlm.metrics.queries import ( |
| 10 | 10 | evals_for_run, |
| 11 | 11 | evals_to_dict, |
| 12 | + latest_preference_mining, | |
| 12 | 13 | latest_run_id, |
| 14 | + preference_mining_for_run, | |
| 15 | + preference_mining_to_dict, | |
| 13 | 16 | recent_runs, |
| 14 | 17 | runs_to_dict, |
| 15 | 18 | steps_for_run, |
@@ -27,6 +30,26 @@ def _seed(store_root: Path) -> None: | ||
| 27 | 30 | rec.record_step(StepEvent(run_id=run_id, step=step, loss=2.0 - 0.1 * step)) |
| 28 | 31 | rec.record_eval(EvalEvent(run_id=run_id, step=30, val_loss=1.5)) |
| 29 | 32 | rec.record_run_end(RunEnd(run_id=run_id, status="ok")) |
| 33 | + rec.record_preference_mine( | |
| 34 | + PreferenceMineEvent( | |
| 35 | + run_id=2, | |
| 36 | + judge_name="sway", | |
| 37 | + sample_count=4, | |
| 38 | + mined_pairs=1, | |
| 39 | + skipped_prompts=0, | |
| 40 | + write_mode="staged", | |
| 41 | + ) | |
| 42 | + ) | |
| 43 | + rec.record_preference_mine( | |
| 44 | + PreferenceMineEvent( | |
| 45 | + run_id=2, | |
| 46 | + judge_name="hf:test/reward", | |
| 47 | + sample_count=6, | |
| 48 | + mined_pairs=2, | |
| 49 | + skipped_prompts=3, | |
| 50 | + write_mode="applied", | |
| 51 | + ) | |
| 52 | + ) | |
| 30 | 53 | |
| 31 | 54 | |
| 32 | 55 | class TestRecentRuns: |
@@ -101,6 +124,28 @@ class TestLatestRunId: | ||
| 101 | 124 | assert latest_run_id(tmp_path) is None |
| 102 | 125 | |
| 103 | 126 | |
| 127 | +class TestPreferenceMiningQueries: | |
| 128 | + def test_preference_mining_for_run_returns_oldest_first(self, tmp_path: Path) -> None: | |
| 129 | + _seed(tmp_path) | |
| 130 | + rows = preference_mining_for_run(tmp_path, run_id=2) | |
| 131 | + assert [row.judge_name for row in rows] == ["sway", "hf:test/reward"] | |
| 132 | + assert [row.write_mode for row in rows] == ["staged", "applied"] | |
| 133 | + | |
| 134 | + def test_latest_preference_mining_returns_most_recent_event(self, tmp_path: Path) -> None: | |
| 135 | + _seed(tmp_path) | |
| 136 | + row = latest_preference_mining(tmp_path) | |
| 137 | + assert row is not None | |
| 138 | + assert row.judge_name == "hf:test/reward" | |
| 139 | + assert row.write_mode == "applied" | |
| 140 | + | |
| 141 | + def test_latest_preference_mining_none_when_empty(self, tmp_path: Path) -> None: | |
| 142 | + from dlm.metrics.db import connect | |
| 143 | + | |
| 144 | + with connect(tmp_path) as _conn: | |
| 145 | + pass | |
| 146 | + assert latest_preference_mining(tmp_path) is None | |
| 147 | + | |
| 148 | + | |
| 104 | 149 | class TestDictSerialization: |
| 105 | 150 | def test_runs_to_dict_shape(self, tmp_path: Path) -> None: |
| 106 | 151 | _seed(tmp_path) |
@@ -122,3 +167,17 @@ class TestDictSerialization: | ||
| 122 | 167 | assert all({"step", "loss", "lr", "grad_norm", "at"}.issubset(s.keys()) for s in steps) |
| 123 | 168 | evals = evals_to_dict(evals_for_run(tmp_path, run_id=1)) |
| 124 | 169 | assert all("val_loss" in e for e in evals) |
| 170 | + | |
| 171 | + def test_preference_mining_to_dict_shape(self, tmp_path: Path) -> None: | |
| 172 | + _seed(tmp_path) | |
| 173 | + payload = preference_mining_to_dict(preference_mining_for_run(tmp_path, run_id=2)) | |
| 174 | + assert payload[0].keys() == { | |
| 175 | + "event_id", | |
| 176 | + "run_id", | |
| 177 | + "judge_name", | |
| 178 | + "sample_count", | |
| 179 | + "mined_pairs", | |
| 180 | + "skipped_prompts", | |
| 181 | + "write_mode", | |
| 182 | + "at", | |
| 183 | + } | |
#### tests/unit/metrics/test_recorder.py (modified)

@@ -11,7 +11,14 @@ from pathlib import Path
| 11 | 11 | import pytest |
| 12 | 12 | |
| 13 | 13 | from dlm.metrics.db import metrics_db_path |
| 14 | -from dlm.metrics.events import EvalEvent, ExportEvent, RunEnd, RunStart, StepEvent | |
| 14 | +from dlm.metrics.events import ( | |
| 15 | + EvalEvent, | |
| 16 | + ExportEvent, | |
| 17 | + PreferenceMineEvent, | |
| 18 | + RunEnd, | |
| 19 | + RunStart, | |
| 20 | + StepEvent, | |
| 21 | +) | |
| 15 | 22 | from dlm.metrics.recorder import MetricsRecorder |
| 16 | 23 | |
| 17 | 24 | |
@@ -121,6 +128,31 @@ class TestExports: | ||
| 121 | 128 | assert rows[0][3] == "mydoc:v1" |
| 122 | 129 | |
| 123 | 130 | |
| 131 | +class TestPreferenceMining: | |
| 132 | + def test_preference_mine_written_without_run_row(self, tmp_path: Path) -> None: | |
| 133 | + rec = MetricsRecorder(tmp_path) | |
| 134 | + rec.record_preference_mine( | |
| 135 | + PreferenceMineEvent( | |
| 136 | + run_id=7, | |
| 137 | + judge_name="sway", | |
| 138 | + sample_count=4, | |
| 139 | + mined_pairs=2, | |
| 140 | + skipped_prompts=1, | |
| 141 | + write_mode="staged", | |
| 142 | + ) | |
| 143 | + ) | |
| 144 | + rows = _select_all(metrics_db_path(tmp_path), "preference_mining") | |
| 145 | + assert len(rows) == 1 | |
| 146 | + _, run_id, judge_name, sample_count, mined_pairs, skipped_prompts, write_mode, at = rows[0] | |
| 147 | + assert run_id == 7 | |
| 148 | + assert judge_name == "sway" | |
| 149 | + assert sample_count == 4 | |
| 150 | + assert mined_pairs == 2 | |
| 151 | + assert skipped_prompts == 1 | |
| 152 | + assert write_mode == "staged" | |
| 153 | + assert at | |
| 154 | + | |
| 155 | + | |
| 124 | 156 | class TestBestEffort: |
| 125 | 157 | def test_step_write_logs_error_once_per_stream( |
| 126 | 158 | self, |