Scrub data jargon
- SHA: 07260df058dff20d0014ebc8e8f627a1c7487616
- Parent: 95402d2
- Tree: 7210611

| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/data/__init__.py | 3 | 4 |
| M | src/dlm/data/audio_preprocessor.py | 4 | 3 |
| M | src/dlm/data/dataset_builder.py | 6 | 8 |
| M | src/dlm/data/formatter.py | 2 | 2 |
| M | src/dlm/data/preference_parser.py | 0 | 2 |
| M | src/dlm/data/sections_to_rows.py | 4 | 5 |
| M | src/dlm/data/tokenizer_bringup.py | 6 | 6 |
| M | src/dlm/data/tokenizer_contract.py | 10 | 9 |
| M | src/dlm/data/vl_cache.py | 3 | 3 |
| M | src/dlm/data/weighted_rows.py | 13 | 13 |
src/dlm/data/__init__.py (modified)

@@ -1,9 +1,8 @@
 """Dataset assembly — turn parsed `.dlm` sections into a ready-to-train dataset.
 
-See Sprint 07 for the design. Heavy imports (`datasets`, `transformers`,
-`trl`, `peft`) are deferred to the call sites that actually use them,
-so `import dlm.data` stays cheap even when the training stack isn't
-installed.
+Heavy imports (`datasets`, `transformers`, `trl`, `peft`) are deferred
+to the call sites that actually use them, so `import dlm.data` stays
+cheap even when the training stack isn't installed.
 """
 
 from __future__ import annotations
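
As a minimal sketch of the deferred-import pattern this docstring describes, assuming a hypothetical `build_hf_dataset` helper (not the module's real API): the heavy dependency is imported inside the function that uses it, so `import dlm.data` succeeds even without the training stack installed.

```python
from __future__ import annotations

from typing import Any


def build_hf_dataset(rows: list[dict[str, Any]]):
    """Hypothetical call site illustrating the deferred heavy import."""
    try:
        # Imported here, not at module top level: `import dlm.data`
        # stays cheap and works without the training stack.
        from datasets import Dataset
    except ImportError as exc:
        raise RuntimeError(
            "the `datasets` package is required here; install the "
            "training dependencies to build a dataset"
        ) from exc
    return Dataset.from_list(rows)
```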
src/dlm/data/audio_preprocessor.py (modified)

@@ -40,9 +40,10 @@ from dlm.data.errors import DataError
 class AudioSampleRateMismatch(DataError):  # noqa: N818 — `*Mismatch` mirrors other DataError subclasses
     """Audio file sample rate doesn't match the base's pinned value.
 
-    Sprint 35.2 v1 refuses rather than resampling silently. The error
-    message echoes both rates so the user can re-encode with `ffmpeg
-    -ar <target>` or pick a base pinned to the clip's native rate.
+    Current releases refuse rather than resampling silently. The error
+    message echoes both rates so the user can re-encode with
+    `ffmpeg -ar <target>` or pick a base pinned to the clip's native
+    rate.
     """
 
 
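
A sketch of the refuse-rather-than-resample guard the docstring describes; `AudioSampleRateMismatch` is the real exception above, while the helper name, its stand-in base class, and the message wording are illustrative.

```python
class AudioSampleRateMismatch(Exception):
    """Stand-in for the real DataError subclass shown in the diff."""


def check_sample_rate(path: str, actual_hz: int, pinned_hz: int) -> None:
    # Echo both rates so the user can re-encode or re-pin the base.
    if actual_hz != pinned_hz:
        raise AudioSampleRateMismatch(
            f"{path}: clip is {actual_hz} Hz but the base pins "
            f"{pinned_hz} Hz; re-encode with `ffmpeg -ar {pinned_hz}` "
            "or pick a base pinned to the clip's native rate"
        )
```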
src/dlm/data/dataset_builder.py (modified)

@@ -1,17 +1,15 @@
 """End-to-end: parsed `.dlm` sections → (train_ds, val_ds).
 
-This is the single entry point Sprint 09's trainer calls. It:
+This is the single entry point the trainer calls. It:
 
 1. Flattens `sections` to dict rows via `sections_to_rows`.
-2. Optionally concatenates a replay-corpus row iterable (Sprint 08
-   supplies this; we just accept an iterable here to keep the
-   dependency one-directional).
+2. Optionally concatenates a replay-corpus row iterable (we just
+   accept an iterable here to keep the dependency one-directional).
 3. Splits into train / val via the deterministic splitter.
 
 The split is keyed on each row's `_dlm_section_id` + sub-index, so
-replay rows must also carry a stable `_dlm_section_id` — Sprint 08's
-corpus reader stamps one derived from the originating document's
-version.
+replay rows must also carry a stable `_dlm_section_id` — the corpus
+reader stamps one derived from the originating document's version.
 """
 
 from __future__ import annotations

@@ -46,7 +44,7 @@ def build_dataset(
     """Build a (train, val) `Dataset` pair from parsed `.dlm` sections.
 
     `seed` is required (not defaulted) so the split is always traceable
-    to a manifest entry; `val_frac=0.1` matches Sprint 07's spec.
+    to a manifest entry; `val_frac=0.1` matches the current default.
 
     `weights`, when non-empty, expands rows by `(tag_key, tag_value)`
     multipliers before the train/val split — integer factors duplicate
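
The docstring pins the split to `(seed, _dlm_section_id, sub-index)` rather than row position. A sketch of what such a splitter can look like; the function name, the `_dlm_sub_index` key, and the exact hash construction are assumptions.

```python
import hashlib
from typing import Any


def assign_split(row: dict[str, Any], seed: int, val_frac: float = 0.1) -> str:
    # Keyed on stable identity, not row index: inserting or reordering
    # rows never flips another row's train/val assignment.
    key = f"{seed}:{row['_dlm_section_id']}:{row.get('_dlm_sub_index', 0)}"
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    bucket = int.from_bytes(digest[:8], "big") / 2**64  # uniform in [0, 1)
    return "val" if bucket < val_frac else "train"
```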
src/dlm/data/formatter.py (modified)

@@ -10,8 +10,8 @@ Branches per row shape:
 - neither → `DataFormatError`.
 
 PREFERENCE rows (`prompt`/`chosen`/`rejected`) are NOT formatted here —
-they're routed to DPOTrainer by Sprint 17, which has its own formatter.
-This function refuses them explicitly so an accidentally-mixed dataset
+they're routed to DPOTrainer, which has its own formatter. This
+function refuses them explicitly so an accidentally-mixed dataset
 fails loudly at format time rather than producing silently-wrong data.
 """
 
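
A sketch of the loud refusal, using a local stand-in for `DataFormatError` and an illustrative `format_row`; only the shape dispatch mirrors the documented branches.

```python
from typing import Any


class DataFormatError(ValueError):
    """Local stand-in for the module's real DataFormatError."""


def format_row(row: dict[str, Any]) -> str:
    # Preference triples never pass through the SFT formatter.
    if {"prompt", "chosen", "rejected"} <= row.keys():
        raise DataFormatError(
            "preference row reached the SFT formatter; preference "
            "triples are routed to DPOTrainer, which formats them itself"
        )
    if "text" in row:
        return row["text"]
    # (the real formatter also renders `messages`-shaped rows)
    raise DataFormatError(f"unrecognized row shape: {sorted(row)}")
```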
src/dlm/data/preference_parser.py (modified)

@@ -17,8 +17,6 @@ The three headers must appear in order (Prompt → Chosen → Rejected) for
 each triple. Missing, duplicated, or reordered headers raise
 `PreferenceParseError`. Empty field bodies are errors — DPO on empty
 text is never intentional.
-
-Sprint 07 only parses + validates. The DPO consumer is Sprint 17.
 """
 
 from __future__ import annotations
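
A minimal sketch of the in-order header check; the helper and the local exception stand-in are hypothetical, while the required Prompt → Chosen → Rejected order comes from the docstring.

```python
class PreferenceParseError(ValueError):
    """Local stand-in for the parser's real PreferenceParseError."""


_EXPECTED_ORDER = ("Prompt", "Chosen", "Rejected")


def check_triple_headers(headers: list[str]) -> None:
    # Missing, duplicated, or reordered headers all fail this check.
    if tuple(headers) != _EXPECTED_ORDER:
        raise PreferenceParseError(
            f"expected headers {_EXPECTED_ORDER} in order, "
            f"got {tuple(headers)}"
        )
```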
src/dlm/data/sections_to_rows.py (modified)

@@ -1,7 +1,6 @@
 """Turn `doc.sections.Section` objects into ready-to-train dict rows.
 
-Per Sprint 07's shape table (extended by Sprint 35 v1 for images and
-Sprint 35.2 for audio):
+Current shape table:
 
 | Section type | Row shape |
 |---|---|

@@ -16,9 +15,9 @@ IMAGE / AUDIO emission requires a `BlobStore` (to resolve
 Callers that leave `blob_store=None` with media sections in the
 input raise `ValueError` — the row shape isn't viable without the
 actual bytes. Audio rows hold only the path + sha, not the decoded
-waveform; the audio cache (Sprint 35.2) is the right place to hold
-preprocessed features across epochs, and loading lazily at collate
-time keeps dataset rows small.
+waveform; the audio cache is the right place to hold preprocessed
+features across epochs, and loading lazily at collate time keeps
+dataset rows small.
 
 Every row carries `_dlm_section_id` so `splitter.split()` can key
 deterministically on (seed, section_id) rather than row index. This is
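
An illustrative row emission showing the one documented guarantee, that every row carries `_dlm_section_id`; the `text` key and the `Section` attribute names (`id`, `body`) are assumptions, not the module's real shape table.

```python
from typing import Any


def text_section_to_row(section: Any) -> dict[str, Any]:
    return {
        "text": section.body,  # assumed attribute name
        # Stable identity for the deterministic splitter; never the
        # row's positional index.
        "_dlm_section_id": section.id,  # assumed attribute name
    }
```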
src/dlm/data/tokenizer_bringup.py (modified)

@@ -7,7 +7,7 @@ Three invariants enforced here (see CLAUDE.md pitfall #4):
    token, or labels get corrupted by mid-sequence EOS masking.
    Fallback order: `unk_token` → else add `<|pad|>` as a new special
    token (which grows the vocab and sets `tokenizer_grew=True` for
-   the caller to propagate into Sprint 09's LoRA config).
+   the caller to propagate into the LoRA config).
 2. **chat_template must be present.** Without it, SFTTrainer can't
    render `messages`-shaped rows. We surface a typed
    `TokenizerBringupError` rather than letting SFT fail deep inside

@@ -38,10 +38,10 @@ class TokenizerBringup:
     """Result of `prepare_tokenizer`.
 
     `tokenizer_grew=True` means a new `<|pad|>` token was added to the
-    vocab. Sprint 09 MUST set `modules_to_save=["embed_tokens","lm_head"]`
-    on the LoRA config in that case (audit F02) — otherwise the new
-    embedding row will not be trained and its output distribution is
-    undefined.
+    vocab. The LoRA config MUST set
+    `modules_to_save=["embed_tokens","lm_head"]` in that case —
+    otherwise the new embedding row will not be trained and its
+    output distribution is undefined.
     """
 
     tokenizer: PreTrainedTokenizerBase

@@ -89,7 +89,7 @@ def _ensure_pad_token(tok: Any) -> bool:
         return False
 
     # Last resort: add a new pad token. This grows the vocab, which
-    # forces Sprint 09 to train embed_tokens + lm_head.
+    # forces training to update embed_tokens + lm_head.
    tok.add_special_tokens({"pad_token": _PAD_TOKEN_LITERAL})
     return True
 
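
A condensed reconstruction of the fallback order the module documents (pad present and distinct from EOS → keep; else reuse `unk_token`; else add `<|pad|>` and report growth). Only the last-resort branch appears verbatim in the diff above; the earlier branches are assumptions consistent with the docstring, and only real `transformers` tokenizer API is used.

```python
from typing import Any

_PAD_TOKEN_LITERAL = "<|pad|>"


def ensure_pad_token(tok: Any) -> bool:
    """Return True iff the vocab grew (caller must adjust LoRA config)."""
    if tok.pad_token is not None and tok.pad_token != tok.eos_token:
        return False  # invariant already holds
    if tok.unk_token is not None:
        # Reuse unk as pad, keeping EOS usable for label masking.
        tok.pad_token = tok.unk_token
        return False
    # Last resort: add a new pad token. This grows the vocab, which
    # forces training to update embed_tokens + lm_head.
    tok.add_special_tokens({"pad_token": _PAD_TOKEN_LITERAL})
    return True
```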
src/dlm/data/tokenizer_contract.py (modified)

@@ -1,11 +1,12 @@
-"""Canonical tokenizer-vocabulary-extension contract (Sprint 12b, audit F02/F06).
+"""Canonical tokenizer-vocabulary-extension contract.
 
-A training run whose bringup (Sprint 07) adds a new special token grows
-the vocabulary. Every downstream stage — LoRA config (`modules_to_save`),
-export preflight (`tokenizer_from_adapter_dir.vocab_size ==
-gguf_base.vocab_size + N_added`), Modelfile stops (Sprint 12) — depends
-on *the same* predicate for "did this tokenizer grow". This module is
-that predicate's canonical home.
+A training run whose bringup adds a new special token grows the
+vocabulary. Every downstream stage — LoRA config
+(`modules_to_save`), export preflight
+(`tokenizer_from_adapter_dir.vocab_size == gguf_base.vocab_size +
+N_added`), Modelfile stops — depends on *the same* predicate for
+"did this tokenizer grow". This module is that predicate's canonical
+home.
 
 Two functions:

@@ -13,7 +14,7 @@ Two functions:
   added-token set changed. Works for any `PreTrainedTokenizerBase`
   (BPE or SentencePiece family).
 - `modules_to_save_for_growth(grew)` — `["embed_tokens", "lm_head"]`
-  when `grew=True`, else `[]`. Sprint 09 calls this when building the
+  when `grew=True`, else `[]`. Training calls this when building the
   LoRA config. Per pitfall #4, without the modules_to_save entry the
   new embedding row's output is undefined.
 

@@ -34,7 +35,7 @@ def tokenizer_grew(base: PreTrainedTokenizerBase, final: PreTrainedTokenizerBase
     """True iff `final` has a larger vocab or different added-token set than `base`.
 
     `vocab_size` comparison catches the `add_special_tokens` path used by
-    Sprint 07's pad fallback. The `get_added_vocab()` set-comparison
+    the pad fallback. The `get_added_vocab()` set-comparison
     catches cases where an added token was *replaced* with a same-count
     variant (vocab size unchanged but contents differ) — rare but
     possible when users manually mutate the tokenizer between runs.
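
An assumed sketch of the two contract functions described above, using only standard `PreTrainedTokenizerBase` API (`vocab_size`, `get_added_vocab()`); the real module remains the canonical home and may differ in detail.

```python
from transformers import PreTrainedTokenizerBase


def tokenizer_grew(
    base: PreTrainedTokenizerBase, final: PreTrainedTokenizerBase
) -> bool:
    # Vocab-size growth catches the add_special_tokens pad fallback;
    # the added-vocab comparison catches same-size replacements.
    return (
        final.vocab_size > base.vocab_size
        or final.get_added_vocab() != base.get_added_vocab()
    )


def modules_to_save_for_growth(grew: bool) -> list[str]:
    # Without these, a newly added embedding row is never trained and
    # its output distribution is undefined (pitfall #4).
    return ["embed_tokens", "lm_head"] if grew else []
```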
src/dlm/data/vl_cache.py (modified)

@@ -1,9 +1,9 @@
-"""VL preprocessor tensor cache (Sprint 35 v1).
+"""VL preprocessor tensor cache.
 
 Keyed on `(blob_sha, processor_sha, target_size)` — a blob-bytes
 change, a processor upgrade, or a resize-policy bump each invalidate
-the entry. Orthogonal to the tokenized-section cache (Sprint 31):
-different inputs, different consumers, different keys.
+the entry. Orthogonal to the tokenized-section cache: different
+inputs, different consumers, different keys.
 
 Layout: `<vl-cache>/<blob_sha[:2]>/<blob_sha>.<proc_sha[:12]>.<h>x<w>.npz`.
 Contents: single numpy array stored under the key `pixel_values`.
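
The documented layout implies a path builder along these lines (the helper name is hypothetical; the layout string comes straight from the docstring). Any change to blob bytes, processor, or target size lands at a different path, so invalidation is simply a cache miss.

```python
from pathlib import Path


def vl_cache_path(
    root: Path, blob_sha: str, proc_sha: str, h: int, w: int
) -> Path:
    # Two-character fan-out keeps any single directory small.
    return root / blob_sha[:2] / f"{blob_sha}.{proc_sha[:12]}.{h}x{w}.npz"
```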
src/dlm/data/weighted_rows.py (modified)

@@ -19,17 +19,17 @@ weight 1.0 (= 2.0 × 0.5).
 
 Determinism: the keep/extra-copy decision is a hash of
 `(seed, section_id, fractional_index)`. Same seed + same corpus →
-same expanded row list, bit-exact. This preserves the Sprint 31.5
-determinism guarantee: a cached run and an uncached run on the same
-weights config produce byte-identical adapter weights.
-
-**Why row repetition, not per-row loss scaling?** Sprint 31.5's
-hard-won bit-identity against TRL's `_tokenize` would be lost the
-moment we subclassed `SFTTrainer.compute_loss` to multiply by a
-sample-weights tensor — any TRL internal refactor of the loss path
-becomes a silent correctness bug. Expansion is a dataset-level
-transform; every downstream layer (pretokenize cache, TRL
-collator, AdamW) sees a plain list of rows and stays dumb.
+same expanded row list, bit-exact. This preserves the determinism
+guarantee: a cached run and an uncached run on the same weights
+config produce byte-identical adapter weights.
+
+**Why row repetition, not per-row loss scaling?** Bit-identity against
+TRL's `_tokenize` would be lost the moment we subclassed
+`SFTTrainer.compute_loss` to multiply by a sample-weights tensor —
+any TRL internal refactor of the loss path becomes a silent
+correctness bug. Expansion is a dataset-level transform; every
+downstream layer (pretokenize cache, TRL collator, AdamW) sees a
+plain list of rows and stays dumb.
 """
 
 from __future__ import annotations

@@ -110,8 +110,8 @@ def expand_rows_by_weight(
     An empty `weights` map is a no-op (returns a shallow copy of
     `rows`). Section-ID preservation means the replay corpus still
     tracks per-row identity — the N copies of a repeated row share
-    a section_id, which matches the Sprint 08 semantics of "retraining
-    on the same content N times".
+    a section_id, which matches the replay semantics of retraining on
+    the same content N times.
     """
     if not weights:
         return list(rows)
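
A sketch of the deterministic keep/extra-copy decision, under the docstring's stated key of `(seed, section_id, fractional_index)`; the helper name and the exact hash construction are assumptions. A weight of 2.5 yields two guaranteed copies plus a third kept with probability 0.5, decided by the hash rather than by `random`, so the expansion is reproducible bit-exactly.

```python
import hashlib
from typing import Any


def copies_for(row: dict[str, Any], weight: float, seed: int) -> int:
    whole = int(weight)
    frac = weight - whole
    if frac == 0.0:
        return whole  # integer weights duplicate exactly
    # fractional_index = whole: the index of the probabilistic copy.
    key = f"{seed}:{row['_dlm_section_id']}:{whole}"
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    draw = int.from_bytes(digest[:8], "big") / 2**64  # uniform in [0, 1)
    return whole + (1 if draw < frac else 0)
```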