`cc3896e`

probes/style_fingerprint: detect zero-fp ft as ERROR; replace cosine with projection (B4)

Authored by

espadonne 3 weeks ago

SHA: cc3896ec6fc4d99d7a92510cb0b7b9b79fd14e7a
Parents: 0f66b70
Tree: 44a9797

2 changed files

Status	File	+	-
M	`src/dlm_sway/probes/style_fingerprint.py`	48	8
M	`tests/unit/test_probe_style_fingerprint.py`	64	0

src/dlm_sway/probes/style_fingerprint.py — modified

          ft_fp = fingerprint("\n".join(ft_samples))
          doc_fp = fingerprint(doc_text)
 -        shift = _cosine_shift(base_fp, ft_fp, doc_fp)
 +        # B4 fix: a degenerate ft fingerprint (all-empty generations →
 +        # zeros) used to coincidentally produce a positive cosine shift
 +        # because cos(ft-base, doc-base) ≈ cos(-base, doc-base) is often
 +        # positive. Detect that case and emit ERROR rather than PASS.
 +        ft_is_zero = bool(np.allclose(ft_fp, 0.0))
 +        ft_text_is_empty = all(not s.strip() for s in ft_samples)
 +        if ft_is_zero or ft_text_is_empty:
 +            return safe_finalize(
 +                name=spec.name,
 +                kind=spec.kind,
 +                verdict=Verdict.ERROR,
 +                score=None,
 +                raw=None,
 +                evidence={
 +                    "base_fp": base_fp.tolist(),
 +                    "ft_fp": ft_fp.tolist(),
 +                    "doc_fp": doc_fp.tolist(),
 +                    "ft_text_is_empty": ft_text_is_empty,
 +                    "ft_fp_is_zero": ft_is_zero,
 +                    "weight": spec.weight,
 +                },
 +                message=(
 +                    "fine-tuned model produced empty / zero-fingerprint output — "
 +                    "cannot measure style shift on a degenerate ft view"
 +                ),
 +            )
++
 +        shift = _projection_shift(base_fp, ft_fp, doc_fp)
          verdict = Verdict.PASS if shift >= spec.assert_shift_gte else Verdict.FAIL
 -        score = float(np.clip((shift + 1.0) / 2.0, 0.0, 1.0))
 +        # Score: 0 at no shift, 1 when ft moves a full doc-gap toward
 +        # doc; clamp to [0, 1].
 +        score = float(np.clip(shift, 0.0, 1.0))
          return safe_finalize(
              name=spec.name,
+         )
 -def _cosine_shift(
 +def _projection_shift(
      base: NDArray[np.float64], ft: NDArray[np.float64], doc: NDArray[np.float64]
  ) -> float:
 -    """Cosine between (ft - base) and (doc - base) in fingerprint space."""
 +    """Project (ft - base) onto (doc - base), normalized by ||doc - base||².
++
 +    Returns ``((ft - base) · (doc - base)) / ||doc - base||²``. Properties:
++
 +    - ``ft == base`` → 0 (no shift)
 +    - ``ft == doc`` → 1 (ft moved a full doc-gap toward doc)
 +    - ``ft`` moved opposite to doc → negative
 +    - ``doc == base`` (no doc gap to measure) → 0
++
 +    This replaces the older ``cos(ft-base, doc-base)``, which silently
 +    treated a zero ft fingerprint as a phantom positive correlation when
 +    ``-base`` happened to point in roughly the doc direction (B4).
 +    """
      a = ft - base
      b = doc - base
 -    na = float(np.linalg.norm(a))
 -    nb = float(np.linalg.norm(b))
 -    if na == 0.0 or nb == 0.0:
 +    nb_sq = float(np.dot(b, b))
 +    if nb_sq == 0.0:
          return 0.0
 -    return float(np.dot(a, b) / (na * nb))
 +    return float(np.dot(a, b) / nb_sq)

tests/unit/test_probe_style_fingerprint.py — modified

          ctx = RunContext(backend=backend)
          result = probe.run(spec, ctx)
          assert result.verdict == Verdict.ERROR
++
++
 +class TestB4ZeroFtFingerprint:
 +    """Pins the B4 fix: a degenerate ft (empty generations / zero
 +    fingerprint) must NOT pass — must produce ERROR with a clear
 +    message. The historical bug used cos(ft-base, doc-base) which
 +    coincidentally aligned with -base when ft was zero, producing
 +    spurious +0.82 PASS verdicts."""
++
 +    def test_empty_ft_generations_route_to_error(self) -> None:
 +        base_samples = ["Some real prose. With multiple sentences."] * 2
 +        ft_samples = ["", ""]  # broken ft model produces no text
 +        doc = "Wherein clauses conjoin into meandering wholes."
 +        backend = _backend_with_samples(base_samples, ft_samples)
 +        probe, spec = build_probe(
 +            {
 +                "name": "c1",
 +                "kind": "style_fingerprint",
 +                "prompts": ["p0", "p1"],
 +                "doc_reference": doc,
 +                "assert_shift_gte": 0.0,
 +            }
 +        )
 +        ctx = RunContext(backend=backend)
 +        result = probe.run(spec, ctx)
 +        assert result.verdict == Verdict.ERROR
 +        assert "empty" in result.message.lower() or "degenerate" in result.message.lower()
 +        # Evidence preserves the fingerprints for postmortem.
 +        assert result.evidence["ft_text_is_empty"] is True
++
 +    def test_whitespace_only_ft_generations_route_to_error(self) -> None:
 +        base_samples = ["Some real prose."] * 2
 +        ft_samples = ["   ", "\n\n"]
 +        backend = _backend_with_samples(base_samples, ft_samples)
 +        probe, spec = build_probe(
 +            {
 +                "name": "c1",
 +                "kind": "style_fingerprint",
 +                "prompts": ["p0", "p1"],
 +                "doc_reference": "doc",
 +            }
 +        )
 +        ctx = RunContext(backend=backend)
 +        result = probe.run(spec, ctx)
 +        assert result.verdict == Verdict.ERROR
++
 +    def test_projection_shift_zero_when_ft_equals_base(self) -> None:
 +        """ft == base → 0 shift, regardless of where doc sits."""
 +        same = "Same prose. Same words."
 +        backend = _backend_with_samples([same, same], [same, same])
 +        probe, spec = build_probe(
 +            {
 +                "name": "c1",
 +                "kind": "style_fingerprint",
 +                "prompts": ["p0", "p1"],
 +                "doc_reference": "Wholly different doc style with many words.",
 +                "assert_shift_gte": 0.01,
 +            }
 +        )
 +        ctx = RunContext(backend=backend)
 +        result = probe.run(spec, ctx)
 +        # ft fp == base fp → projection is exactly 0.
 +        assert result.raw == 0.0
 +        assert result.verdict == Verdict.FAIL  # no shift, gate fails