Python · 2356 bytes Raw Blame History
1 """Minimal section contract for attribution probes.
2
3 The flagship B1 ``section_internalization`` probe needs *structured*
4 input — a section has an id, a kind, content text, and possibly some
5 Q/A pairs or chosen/rejected triples. sway defines this shape here so
6 the probes stay oblivious to the upstream (``.dlm`` parser, custom
7 loaders, synthetic test fixtures).
8
9 Field names are aligned with :mod:`dlm.doc.sections` but this module
10 does not import ``dlm`` — the bridge at
11 :mod:`dlm_sway.integrations.dlm` does the adaptation.
12 """
13
14 from __future__ import annotations
15
16 from dataclasses import dataclass, field
17 from typing import Literal
18
# Closed set of section kinds accepted by ``Section.kind``. The three
# literals correspond to the PROSE / INSTRUCTION / PREFERENCE section
# types referred to in the docstrings of this module.
SectionKind = Literal["prose", "instruction", "preference"]
20
21
22 @dataclass(frozen=True, slots=True)
23 class SectionProbe:
24 """A ``(prompt, gold)`` pair lifted from an INSTRUCTION section."""
25
26 prompt: str
27 gold: str
28
29
30 @dataclass(frozen=True, slots=True)
31 class SectionPreference:
32 """A ``(prompt, chosen, rejected)`` triple from a PREFERENCE section."""
33
34 prompt: str
35 chosen: str
36 rejected: str
37
38
39 @dataclass(frozen=True, slots=True)
40 class Section:
41 """One typed chunk of a training document.
42
43 Attributes
44 ----------
45 id:
46 Content-addressed identifier. ``.dlm`` uses a 16-hex-char
47 sha256 prefix; sway doesn't enforce a format.
48 kind:
49 Discriminator for which of :attr:`probes` /
50 :attr:`preferences` / :attr:`content` is the primary signal.
51 content:
52 Raw section text. Always populated; used by the rolling-PPL
53 path for PROSE sections.
54 probes:
55 For INSTRUCTION: parsed Q/A pairs. Empty tuple for others.
56 preferences:
57 For PREFERENCE: parsed chosen/rejected triples. Empty otherwise.
58 tag:
59 Optional free-form label for the section (e.g., "intro",
60 "api-reference"). Surfaces in per-section reports.
61 """
62
63 id: str
64 kind: SectionKind
65 content: str
66 probes: tuple[SectionProbe, ...] = field(default_factory=tuple)
67 preferences: tuple[SectionPreference, ...] = field(default_factory=tuple)
68 tag: str | None = None
69
70
def filter_kinds(
    sections: tuple[Section, ...], kinds: tuple[SectionKind, ...]
) -> tuple[Section, ...]:
    """Select the sections whose ``kind`` is listed in ``kinds``.

    Parameters
    ----------
    sections:
        Candidate sections to scan.
    kinds:
        Kinds to keep. Repeated entries make no difference.

    Returns
    -------
    tuple
        The matching sections, in the order they appear in
        ``sections``. Empty when nothing matches.
    """
    # Collapse the requested kinds once so each membership test is a
    # hash lookup rather than a scan of ``kinds``.
    wanted = frozenset(kinds)
    kept = [section for section in sections if section.kind in wanted]
    return tuple(kept)
76 return tuple(s for s in sections if s.kind in allow)