1 """Tool-use fidelity — does the adapter preserve the base's tool-call format?
2
3 LoRA fine-tunes increasingly target tool-use behavior: function-calling
4 JSON schemas, MCP tool plans, code-execution gating. An adapter that
5 accidentally degrades JSON-schema validity or starts hallucinating tool
6 names while acing every other probe is a silent production failure.
7
8 For each ``(prompt, tool_spec, gold_tool_name)`` case the probe greedy-
9 generates from both views and computes three independent signals:
10
11 - **JSON-schema validity delta** — ``ft_valid_rate − base_valid_rate``.
12 Negative values mean the adapter degraded the base's tool-call
13 formatting; positive values mean it improved (or the base was
14 already tool-clueless).
15 - **Argument-field disagreement rate** — over cases where *both* views
16 produced a schema-valid call, the leaf-field disagreement rate
17 between ``ft_call.arguments`` and ``base_call.arguments``. Catches
18 numeric / string drift inside an otherwise-well-formed call.
19 (We deliberately ship leaf-field equality rather than per-token KL
20 on argument values: per-token KL requires alignment between two
21 decoded strings, which is the same hard problem we explicitly
22 defer past v1.)
23 - **Tool-name hallucination rate** — over schema-valid ft calls, the
24 fraction whose ``name`` is not in ``allowed_tools`` (when the user
25 declared a tool surface) or differs from ``gold_tool_name`` (when
26 no surface is declared).
27
28 Composite verdict logic:
29
30 - The pass criterion is compound — validity delta above the floor AND
31 hallucination rate below the cap. Argument disagreement is
32 informational on the v1 surface.
33 - ``json_valid_rate_ft`` is the metric that's z-scored against the
34 null-adapter baseline. A null adapter should produce essentially no
35 schema-valid calls, so ``z >= assert_z_gte`` is the principled
36 "the adapter actually preserved tool-call structure" claim.
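
Worked example of the default gates (values illustrative): ``ft_valid_rate = 0.92``
against ``base_valid_rate = 0.95`` gives Δ = −0.03, which clears the −0.05
floor, and a ``hallucination_rate`` of 0.04 clears the 0.10 cap; when
null calibration ran, the verdict additionally requires
``z >= assert_z_gte`` on ``json_valid_rate_ft``.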

JSON-schema check is deliberately minimal: this v1 implementation
validates ``required`` membership and per-field type tags from a tiny
OpenAI-flavored subset (``string``, ``number``, ``integer``, ``boolean``,
``object``, ``array``). We don't pull in the ``jsonschema`` package —
the OpenAI-style spec is small enough to validate directly, and core
sway dependencies stay lean.
"""

from __future__ import annotations

import json
import statistics
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict, Field

from dlm_sway.core.result import ProbeResult, Verdict, safe_finalize
from dlm_sway.core.stats import bootstrap_ci
from dlm_sway.probes._zscore import (
    no_calibration_note,
    score_from_z,
    verdict_from_z,
    z_score,
    z_scores_by_rank,
)
from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
from dlm_sway.probes.null_adapter import get_null_stats, get_null_stats_by_rank


class ToolUseCase(BaseModel):
    """One ``(prompt, tool_spec, gold_tool_name)`` case.

    ``tool_spec`` follows the OpenAI function-calling shape::

        {
            "name": "search_web",
            "description": "...",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                    "max_results": {"type": "integer"}
                },
                "required": ["query"]
            }
        }

    ``gold_tool_name`` is the tool the case expects ft to call. It's
    used by the hallucination check when no broader ``allowed_tools``
    list is declared on the spec.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    prompt: str
    tool_spec: dict[str, Any]
    gold_tool_name: str
    max_new_tokens: int = 256


class ToolUseFidelitySpec(ProbeSpec):
    """Spec for ``kind: tool_use_fidelity``."""

    kind: Literal["tool_use_fidelity"] = "tool_use_fidelity"
    cases: list[ToolUseCase] = Field(default_factory=list, min_length=0)
    """Inline cases. Empty list → probe SKIPs (the .dlm autogen path
    leaves this empty unless a tool-use template seeded the doc)."""
    allowed_tools: list[str] | None = None
    """Optional tool-surface declaration. When set, hallucination is
    ``ft.name not in allowed_tools``. When ``None``, hallucination is
    ``ft.name != case.gold_tool_name`` per-case."""
    assert_validity_delta_gte: float = -0.05
    """Pass criterion on JSON-schema validity. Default tolerates a 5pp
    drop from base — anything worse is an adapter regression."""
    assert_hallucination_lte: float = 0.10
    """Pass criterion on tool-name hallucination. Default 10% — above
    that the adapter is actively inventing tools."""
    assert_z_gte: float = 3.0
    """Z-score pass criterion on ``json_valid_rate_ft`` against the
    null-adapter baseline. The principled signal — preferred over
    the raw thresholds when null calibration ran."""


class ToolUseFidelityProbe(Probe):
    """The "did the LoRA preserve tool-call format?" probe."""

    kind = "tool_use_fidelity"
    spec_cls = ToolUseFidelitySpec
    category = "attribution"

    @classmethod
    def calibrate_spec(cls, ctx: RunContext) -> ToolUseFidelitySpec | None:
        """Two trivial sentinel cases for null calibration.

        A null (random-noise) adapter should produce essentially no
        schema-valid output here, so ``json_valid_rate`` clusters
        tightly near zero — exactly what we want as the denominator
        when z-scoring the real adapter's validity rate.
        """
        del ctx
        return ToolUseFidelitySpec(
            name="_calibration",
            kind="tool_use_fidelity",
            cases=[
                ToolUseCase(
                    prompt="Search the web for the capital of France.",
                    tool_spec={
                        "name": "search_web",
                        "parameters": {
                            "type": "object",
                            "properties": {"query": {"type": "string"}},
                            "required": ["query"],
                        },
                    },
                    gold_tool_name="search_web",
                ),
                ToolUseCase(
                    prompt="Add 2 and 2.",
                    tool_spec={
                        "name": "calculator",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "a": {"type": "number"},
                                "b": {"type": "number"},
                            },
                            "required": ["a", "b"],
                        },
                    },
                    gold_tool_name="calculator",
                ),
            ],
        )

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        assert isinstance(spec, ToolUseFidelitySpec)
        if not spec.cases:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.SKIP,
                score=None,
                message="no tool-use cases (inline 'cases' was empty)",
            )

        base_valid: list[bool] = []
        ft_valid: list[bool] = []
        ft_calls: list[dict[str, Any] | None] = []
        base_calls: list[dict[str, Any] | None] = []

        # Greedy decode each case under both views. We open one base
        # context for the whole sweep and one ft context — same shape
        # as preference_flip's per-triple loop, but at the case level
        # so a single backend toggle covers all N cases.
        with ctx.require_backend.as_base() as base_view:
            base_outputs = [
                base_view.generate(c.prompt, max_new_tokens=c.max_new_tokens) for c in spec.cases
            ]
        with ctx.require_backend.as_finetuned() as ft_view:
            ft_outputs = [
                ft_view.generate(c.prompt, max_new_tokens=c.max_new_tokens) for c in spec.cases
            ]

        for case, b_out, f_out in zip(spec.cases, base_outputs, ft_outputs, strict=True):
            b_call = _parse_tool_call(b_out)
            f_call = _parse_tool_call(f_out)
            base_calls.append(b_call)
            ft_calls.append(f_call)
            base_valid.append(b_call is not None and _matches_schema(b_call, case.tool_spec))
            ft_valid.append(f_call is not None and _matches_schema(f_call, case.tool_spec))

        n = len(spec.cases)
        base_valid_rate = sum(base_valid) / n
        ft_valid_rate = sum(ft_valid) / n
        validity_delta = ft_valid_rate - base_valid_rate

        # Argument-disagreement: only computed where BOTH views produced
        # a schema-valid call. Otherwise the comparison is between a
        # call and a non-call, which the probe scores via the validity
        # path instead.
        agreed_pairs = [
            (bc, fc)
            for bc, fc, bv, fv in zip(base_calls, ft_calls, base_valid, ft_valid, strict=True)
            if bv and fv and bc is not None and fc is not None
        ]
        if agreed_pairs:
            disagreement_per_pair = [_field_disagreement(bc, fc) for bc, fc in agreed_pairs]
            mean_arg_disagreement = statistics.fmean(disagreement_per_pair)
        else:
            mean_arg_disagreement = 0.0

        # Hallucination: over schema-valid ft calls, what fraction call
        # the wrong tool? Denominator excludes invalid calls so
        # validity-failure double-counts don't pollute the metric.
        ft_valid_calls = [
            (case, fc)
            for case, fc, fv in zip(spec.cases, ft_calls, ft_valid, strict=True)
            if fv and fc is not None
        ]
        if ft_valid_calls:
            hallucinated = [
                not _name_allowed(fc.get("name", ""), case.gold_tool_name, spec.allowed_tools)
                for case, fc in ft_valid_calls
            ]
            hallucination_rate = sum(hallucinated) / len(ft_valid_calls)
        else:
            hallucination_rate = 0.0

        # Z-score the ft validity rate against the null baseline. CIs
        # are over the per-case validity flags so they reflect sampling
        # noise on the proportion estimate.
        stats = get_null_stats(ctx, spec.kind)
        z = z_score(ft_valid_rate, stats)
        z_by_rank = z_scores_by_rank(ft_valid_rate, get_null_stats_by_rank(ctx, spec.kind), sign=+1)
        ci_95 = bootstrap_ci([1.0 if v else 0.0 for v in ft_valid], seed=ctx.seed)

        # Compound pass criterion: every gate must hold. The z-score
        # path only says the ft validity rate sits well above null-
        # adapter noise; it says nothing about the comparison to base,
        # so the raw gates still apply: a probe with z=4σ but a -20pp
        # validity delta still flags as a regression.
        validity_pass = validity_delta >= spec.assert_validity_delta_gte
        hallucination_pass = hallucination_rate <= spec.assert_hallucination_lte
        verdict_z = verdict_from_z(z, spec.assert_z_gte)

        if verdict_z is not None:
            z_path_pass = verdict_z == Verdict.PASS
            verdict = (
                Verdict.PASS
                if (z_path_pass and validity_pass and hallucination_pass)
                else Verdict.FAIL
            )
            base_msg = (
                f"validity ft={ft_valid_rate:.0%} (base={base_valid_rate:.0%}, "
                f"Δ={validity_delta:+.0%}), hallucination={hallucination_rate:.0%}, "
                f"z={z:+.2f}σ vs null"
            )
        else:
            verdict = Verdict.PASS if (validity_pass and hallucination_pass) else Verdict.FAIL
            base_msg = (
                f"validity ft={ft_valid_rate:.0%} (base={base_valid_rate:.0%}, "
                f"Δ={validity_delta:+.0%}), hallucination={hallucination_rate:.0%} "
                f"{no_calibration_note(spec.kind)}"
            )

        # Composite score multiplies the validity and hallucination
        # factors: perfect validity preservation + zero hallucination
        # → 1.0; full hallucination or full validity collapse → 0.0.
        # Argument disagreement stays evidence-only and never enters
        # the score.
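        # e.g. a -10pp validity delta with 20% hallucination scores 0.9 * 0.8 = 0.72.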
        validity_factor = max(0.0, min(1.0, 1.0 + validity_delta))
        hallucination_factor = max(0.0, 1.0 - hallucination_rate)
        score_raw = validity_factor * hallucination_factor
        # The z-score term only applies when calibration is available:
        # the final score blends 0.7 * score_raw with 0.3 * score_from_z(z),
        # so a strongly-significant adapter scores higher than a
        # marginally-significant one even at the same raw rates.
        z_score_val = score_from_z(z) if z is not None else None
        score = 0.7 * score_raw + 0.3 * z_score_val if z_score_val is not None else score_raw

        return safe_finalize(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=ft_valid_rate,
            z_score=z,
            base_value=base_valid_rate,
            ft_value=ft_valid_rate,
            evidence={
                "json_valid_rate_base": base_valid_rate,
                "json_valid_rate_ft": ft_valid_rate,
                "validity_delta": validity_delta,
                "mean_arg_disagreement": mean_arg_disagreement,
                "hallucination_rate": hallucination_rate,
                "num_cases": n,
                "num_arg_pairs_compared": len(agreed_pairs),
                "z_by_rank": z_by_rank,
                "raw_ci_95": list(ci_95) if ci_95 is not None else None,
                "weight": spec.weight,
            },
            message=base_msg,
            ci_95=ci_95,
        )


# ---------------------------------------------------------------------------
# Tool-call parsing + minimal schema validator
# ---------------------------------------------------------------------------


def _parse_tool_call(text: str) -> dict[str, Any] | None:
    """Extract ``{"name": ..., "arguments": {...}}`` from a generation.

    Tries three increasingly forgiving strategies:

    1. The whole text parses as JSON and is a dict.
    2. The text contains a ``{...}`` substring that parses as JSON.
    3. The text contains a fenced triple-backtick ``json`` code block
       whose body parses as JSON.

    Returns ``None`` when no parse succeeds OR when the parsed object
    isn't a dict — both base and ft models often wander off into prose
    without producing a structured call; that's exactly the failure
    mode the probe scores.

    The returned dict isn't validated yet — callers that care about
    schema conformance run :func:`_matches_schema` separately so the
    "valid JSON, wrong shape" failure mode is observable downstream.
345 """
346 text = text.strip()
347 # Strategy 1.
348 try:
349 obj = json.loads(text)
350 except (ValueError, TypeError):
351 obj = None
352 if isinstance(obj, dict):
353 return obj
354
355 # Strategy 3 — fenced ```json ... ``` block first because its
356 # boundaries are unambiguous; embedded ``{...}`` heuristics in
357 # strategy 2 sometimes match a JSON-looking fragment inside a
358 # fenced block's body.
359 fenced = _extract_fenced_json(text)
360 if fenced is not None:
361 try:
362 obj = json.loads(fenced)
363 except (ValueError, TypeError):
364 obj = None
365 if isinstance(obj, dict):
366 return obj
367
368 # Strategy 2 — first balanced ``{...}`` substring.
369 candidate = _first_balanced_braces(text)
370 if candidate is not None:
371 try:
372 obj = json.loads(candidate)
373 except (ValueError, TypeError):
374 obj = None
375 if isinstance(obj, dict):
376 return obj
377
378 return None
379
380
381 def _extract_fenced_json(text: str) -> str | None:
382 """Return the body of a ```json ... ``` block, if any."""
383 marker = "```"
384 start = text.find(marker)
385 if start < 0:
386 return None
387 # Skip the opening fence (and an optional ``json`` language tag).
388 body_start = start + len(marker)
389 if text[body_start : body_start + 4].lower() == "json":
390 body_start += 4
391 # Strip a single trailing newline after the language tag.
392 if body_start < len(text) and text[body_start] == "\n":
393 body_start += 1
394 end = text.find(marker, body_start)
395 if end < 0:
396 return None
397 return text[body_start:end].strip()
398
399
400 def _first_balanced_braces(text: str) -> str | None:
401 """Return the first balanced ``{...}`` substring, or ``None``.
402
    Tracks string-literal context so a brace inside a string doesn't
    throw off the depth counter. A single linear scan, and cheap: the
    inputs are model generations capped at a few hundred tokens.
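
    Example (illustrative input)::

        >>> _first_balanced_braces('tool call: {"q": "a {brace} inside"} done')
        '{"q": "a {brace} inside"}'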
406 """
407 depth = 0
408 in_string = False
409 escape = False
410 start = -1
411 for i, ch in enumerate(text):
412 if in_string:
413 if escape:
414 escape = False
415 elif ch == "\\":
416 escape = True
417 elif ch == '"':
418 in_string = False
419 continue
420 if ch == '"':
421 in_string = True
422 continue
423 if ch == "{":
424 if depth == 0:
425 start = i
426 depth += 1
427 elif ch == "}":
428 depth -= 1
429 if depth == 0 and start >= 0:
430 return text[start : i + 1]
431 if depth < 0:
432 # Unbalanced — reset and keep scanning.
433 depth = 0
434 start = -1
435 return None
436
437
438 # Type tags we recognize. Matches the OpenAI function-call subset; we
439 # accept ``int`` for ``"integer"`` and ``int|float`` for ``"number"``.
440 _TYPE_CHECKS: dict[str, tuple[type, ...]] = {
441 "string": (str,),
442 "integer": (int,),
443 "number": (int, float),
444 "boolean": (bool,),
445 "object": (dict,),
446 "array": (list,),
447 }
448
449
450 def _matches_schema(call: dict[str, Any], tool_spec: dict[str, Any]) -> bool:
451 """Light OpenAI-flavored JSON-schema check.
452
453 Validates:
454
455 - The call has a ``name`` field (string) — the model is supposed to
456 identify the tool it's calling.
457 - The call has an ``arguments`` field (dict) — the OpenAI shape.
458 - Every name in ``parameters.required`` appears in ``arguments``.
459 - Every field declared in ``parameters.properties`` whose value is
460 present has a type that matches the property's ``type`` tag.
461
462 Doesn't validate: enums, ``minLength``/``maxLength``, nested
463 schemas, ``oneOf``/``anyOf``. Those land in a follow-up sprint
464 when users push tool specs that need them — the v1 surface is
465 deliberately the OpenAI-tutorial shape.
466
467 Booleans-as-integers caveat: Python's ``isinstance(True, int)``
468 is True, so we explicitly reject bools when the schema asks for
469 integer/number. Otherwise an adapter that emits ``true`` would
470 silently pass an "integer" check.
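
    Illustrative checks against a made-up spec::

        >>> spec = {"parameters": {
        ...     "type": "object",
        ...     "properties": {"query": {"type": "string"}, "max_results": {"type": "integer"}},
        ...     "required": ["query"],
        ... }}
        >>> _matches_schema({"name": "search_web", "arguments": {"query": "paris"}}, spec)
        True
        >>> _matches_schema({"name": "search_web", "arguments": {"max_results": 3}}, spec)
        False
        >>> _matches_schema(
        ...     {"name": "search_web", "arguments": {"query": "hi", "max_results": True}}, spec
        ... )
        False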
471 """
472 if not isinstance(call.get("name"), str):
473 return False
474 args = call.get("arguments")
475 if not isinstance(args, dict):
476 return False
477
478 parameters = tool_spec.get("parameters") or {}
479 properties = parameters.get("properties") or {}
480 required = parameters.get("required") or []
481
482 for r in required:
483 if r not in args:
484 return False
485
486 for field_name, field_value in args.items():
487 prop = properties.get(field_name)
488 if not prop:
489 # Unknown field — OpenAI's actual function-call rules
490 # forbid this, but we permit it because real models often
491 # add benign extra fields. Validity stays True.
492 continue
493 type_tag = prop.get("type")
494 if type_tag is None:
495 continue
496 allowed = _TYPE_CHECKS.get(type_tag)
497 if allowed is None:
498 continue
499 if isinstance(field_value, bool) and bool not in allowed:
500 return False
501 if not isinstance(field_value, allowed):
502 return False
503 return True
504
505
506 def _name_allowed(name: str, gold: str, allowed_tools: list[str] | None) -> bool:
507 """Return True iff the model's tool ``name`` is acceptable.
508
509 Rules:
510
511 - ``allowed_tools`` declared → ``name`` must be in the list.
512 - Otherwise → ``name`` must equal ``gold``.
513
514 The split exists because some users want a per-case strict gold
    (the case is checking "did the model pick the *right* tool")
    while others want a surface gate ("did the model stay inside the
    declared tool list").
518 """
519 if allowed_tools is not None:
520 return name in allowed_tools
521 return name == gold
522
523
524 def _field_disagreement(base_call: dict[str, Any], ft_call: dict[str, Any]) -> float:
525 """Leaf-field disagreement rate between two parsed tool calls.
526
527 Walks the union of leaf paths in ``base_call.arguments`` and
528 ``ft_call.arguments``; counts a disagreement for each leaf where
529 the values differ (or the path exists on only one side). Rate is
530 ``disagreements / total_leaves``; returns ``0.0`` when both
531 arguments dicts are empty (a vacuously-perfect agreement).
532
533 Intentionally simple: no semantic equality (``"2.0"`` ≠ ``2.0``
534 by design — drift in numeric *type* counts as drift). Probes
535 that want softer comparison (numeric tolerance, embedding
536 distance on strings) build on this in a follow-up sprint.
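
    Example (illustrative calls; one of two leaves differs)::

        >>> _field_disagreement(
        ...     {"name": "search_web", "arguments": {"query": "paris", "max_results": 5}},
        ...     {"name": "search_web", "arguments": {"query": "paris", "max_results": 10}},
        ... )
        0.5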
537 """
538 base_args = base_call.get("arguments") or {}
539 ft_args = ft_call.get("arguments") or {}
540 if not isinstance(base_args, dict) or not isinstance(ft_args, dict):
541 # One side wasn't a real arguments dict; treat as full disagreement.
542 return 1.0
543 base_leaves = dict(_walk_leaves(base_args))
544 ft_leaves = dict(_walk_leaves(ft_args))
545 all_paths = set(base_leaves) | set(ft_leaves)
546 if not all_paths:
547 return 0.0
548 disagreements = sum(
549 1 for p in all_paths if base_leaves.get(p, _MISSING) != ft_leaves.get(p, _MISSING)
550 )
551 return disagreements / len(all_paths)
552
553
554 _MISSING = object()
555 """Sentinel for "field not present" in :func:`_field_disagreement`. A
556 leaf with value ``None`` should NOT compare equal to a leaf that's
557 absent — using a private singleton makes the distinction explicit
558 where ``dict.get(k)`` would conflate the two."""
559
560
561 def _walk_leaves(obj: Any, path: tuple[str, ...] = ()) -> Any:
562 """Yield ``(path, leaf_value)`` pairs for nested dicts.
563
    Lists are leaves themselves (not recursed into) — element-by-element
    list drift would otherwise dominate the disagreement metric in cases
    where the model legitimately returns the same set in a different
    order; treating the whole list as one leaf caps that at a single
    disagreement.
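
    Example (illustrative; note the list stays a single leaf)::

        >>> dict(_walk_leaves({"a": {"b": 1}, "c": [1, 2]}))
        {('a', 'b'): 1, ('c',): [1, 2]}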
568 """
569 if isinstance(obj, dict):
570 for k, v in obj.items():
571 yield from _walk_leaves(v, path + (str(k),))
572 else:
573 yield path, obj