loader Public

Watch 0 Fork 0 Star 0
Python · 81914 bytes Raw Blame History
  
        1
        """Deterministic runtime parity coverage for the current Loader loop."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        import json
      
        6
        from pathlib import Path
      
        7
        
        8
        import pytest
      
        9
        
        10
        from loader.agent.loop import Agent, AgentConfig
      
        11
        from loader.llm.base import CompletionResponse, Role, StreamChunk, ToolCall
      
        12
        from loader.runtime.capabilities import resolve_capability_profile
      
        13
        from loader.runtime.permissions import PermissionMode
      
        14
        from tests.helpers.runtime_harness import (
      
        15
            ScriptedBackend,
      
        16
            run_explore_scenario,
      
        17
            run_scenario,
      
        18
        )
      
        19
        
        20
        # Ordered names of the parity scenarios covered by this module.
        # test_runtime_parity_manifest_matches_implemented_cases compares this list
        # (order included) against the "name" field of every manifest entry.
        SCENARIO_NAMES = [
            "streaming_text",
            "read_file_roundtrip",
            "multi_tool_turn_roundtrip",
            "turn_summary_smoke_for_multi_tool_turn",
            "write_file_allowed",
            "write_file_denied",
            "bash_stdout_roundtrip",
            "bash_confirmation_prompt_approved",
            "bash_confirmation_prompt_denied",
            "read_only_mode_denies_write",
            "read_only_mode_denies_mutating_bash",
            "read_only_mode_allows_safe_bash",
            "workspace_write_denies_write_outside_root",
            "danger_full_access_allows_dangerous_bash",
            "prompt_mode_prompts_destructive_write",
            "allow_mode_skips_prompt_for_destructive_write",
            "deny_rule_blocks_allowed_mode",
            "ask_rule_prompts_even_when_mode_would_allow",
            "raw_json_tool_call_fallback",
            "raw_json_todowrite_tool_call_fallback",
            "raw_json_patch_tool_call_fallback",
            "raw_json_ask_user_question_tool_call_fallback",
            "raw_bracket_ask_user_question_tool_call_fallback",
            "native_and_raw_tool_paths_share_executor_trace",
            "backend_capability_probe_refreshes_native_tool_mode",
            "run_streaming_delegates_to_primary_runtime",
            "definition_of_done_verify_phase",
            "verify_failure_routes_to_fix_loop",
            "verify_retry_budget_exhaustion",
            "ambiguous_prompt_routes_to_clarify",
            "complex_prompt_routes_to_plan",
            "verify_failure_fix_loop_does_not_reroute_workflow",
            "conversational_task_skips_verify_phase",
            "explore_mode_skips_dod_and_router",
            "explore_mode_denies_write",
            "explore_mode_ignores_global_allow_policy",
            "non_mutating_completion_no_longer_forces_continuation",
            "tool_result_contract_regression",
        ]
      
        60
        
        61
        
        62
        def load_manifest() -> list[dict[str, str]]:
            """Load the auditable parity scenario manifest.

            The manifest is read from ``fixtures/runtime_parity_manifest.json``
            in the directory that contains this module.

            Returns:
                The decoded JSON document. Callers in this file read the
                ``"name"`` key of each entry.

            Raises:
                FileNotFoundError: If the manifest file does not exist.
                json.JSONDecodeError: If the file is not valid JSON.
            """
            manifest_path = Path(__file__).parent / "fixtures" / "runtime_parity_manifest.json"
            # Pin the encoding: without it read_text() falls back to the locale
            # encoding, which can mis-decode non-ASCII manifests on some hosts.
            return json.loads(manifest_path.read_text(encoding="utf-8"))
      
        67
        
        68
        
        69
        def non_streaming_config(*, completion_check: bool = False) -> AgentConfig:
            """Build the AgentConfig shared by the non-streaming scenarios.

            The config disables ``auto_context`` and ``stream`` and caps
            ``max_iterations`` at 8; ``completion_check`` is copied onto
            ``config.reasoning.completion_check``.
            """
            cfg = AgentConfig(
                auto_context=False,
                stream=False,
                max_iterations=8,
            )
            cfg.reasoning.completion_check = completion_check
            return cfg
      
        75
        
        76
        
        77
        def native_tool_response(
            *tool_calls: ToolCall,
            content: str = "Using tools.",
        ) -> CompletionResponse:
            """Wrap the given tool calls and assistant text in a CompletionResponse."""
            calls = list(tool_calls)
            return CompletionResponse(content=content, tool_calls=calls)
      
        84
        
        85
        
        86
        def final_response(content: str) -> CompletionResponse:
            """Return a CompletionResponse carrying only text (no tool calls passed)."""
            response = CompletionResponse(content=content)
            return response
      
        90
        
        91
        
        92
        def tool_event_names(run) -> list[str]:
            """Collect tool names from ``tool_call`` events, in emission order.

            Events with an empty tool name and events whose phase is
            ``"verification"`` are left out.
            """
            names: list[str] = []
            for evt in run.events:
                if evt.type != "tool_call":
                    continue
                if not evt.tool_name or evt.phase == "verification":
                    continue
                names.append(evt.tool_name)
            return names
      
        100
        
        101
        
        102
        def tool_result_messages(run) -> list[str]:
            """Collect the content of ``tool_result`` events, in emission order.

            Results whose phase is ``"verification"`` are left out.
            """
            messages: list[str] = []
            for evt in run.events:
                if evt.type == "tool_result" and evt.phase != "verification":
                    messages.append(evt.content)
            return messages
      
        110
        
        111
        
        112
        def verification_commands(run) -> list[str]:
            """Collect the ``command`` argument of verification-phase tool calls.

            Missing or empty ``tool_args`` (or a missing ``"command"`` key) yield
            an empty string; every value is passed through ``str``.
            """
            commands: list[str] = []
            for evt in run.events:
                if evt.type == "tool_call" and evt.phase == "verification":
                    call_args = evt.tool_args or {}
                    commands.append(str(call_args.get("command", "")))
            return commands
      
        120
        
        121
        
        122
        def trace_event_names(run) -> list[str]:
            """List the ``name`` of each trace entry on the agent's last turn summary.

            Asserts that ``run.agent.last_turn_summary`` is not ``None``.
            """
            turn_summary = run.agent.last_turn_summary
            assert turn_summary is not None
            names: list[str] = []
            for trace_entry in turn_summary.trace:
                names.append(trace_entry.name)
            return names
      
        128
        
        129
        
        130
        def dod_statuses(run) -> list[str]:
            """Collect the non-empty ``dod_status`` of each ``dod_status`` event, in order."""
            statuses: list[str] = []
            for evt in run.events:
                if evt.type != "dod_status":
                    continue
                if evt.dod_status:
                    statuses.append(evt.dod_status)
            return statuses
      
        138
        
        139
        
        140
        def workflow_modes(run) -> list[str]:
            """Collect the non-empty ``workflow_mode`` of each ``workflow_mode`` event, in order."""
            modes: list[str] = []
            for evt in run.events:
                if evt.type != "workflow_mode":
                    continue
                if evt.workflow_mode:
                    modes.append(evt.workflow_mode)
            return modes
      
        148
        
        149
        
        150
        def artifact_kinds(run) -> list[str]:
            """Collect the non-empty ``artifact_kind`` of each ``artifact`` event, in order."""
            kinds: list[str] = []
            for evt in run.events:
                if evt.type != "artifact":
                    continue
                if evt.artifact_kind:
                    kinds.append(evt.artifact_kind)
            return kinds
      
        158
        
        159
        
        160
        @pytest.mark.asyncio
        async def test_runtime_parity_manifest_matches_implemented_cases() -> None:
            """The manifest entry names must equal SCENARIO_NAMES, order included."""
            names_from_manifest = []
            for entry in load_manifest():
                names_from_manifest.append(entry["name"])
            assert names_from_manifest == SCENARIO_NAMES
      
        164
        
        165
        
        166
        @pytest.mark.asyncio
        async def test_streaming_text_scenario() -> None:
            """A two-chunk scripted stream produces the full text and no tool calls."""
            opening_chunk = StreamChunk(content="Mock streaming ", is_done=False)
            closing_chunk = StreamChunk(
                content="says hello from Loader.",
                full_content="Mock streaming says hello from Loader.",
                is_done=True,
            )
            backend = ScriptedBackend(streams=[[opening_chunk, closing_chunk]])
            streaming_config = AgentConfig(auto_context=False)

            run = await run_scenario("hello there", backend, config=streaming_config)

            assert run.response == "Mock streaming says hello from Loader."
            # Exactly one backend invocation, and it used the streaming path.
            invocation_modes = [call.mode for call in run.invocations]
            assert invocation_modes == ["stream"]
            assert not tool_event_names(run)
      
        186
        
        187
        
        188
        @pytest.mark.asyncio
        async def test_read_file_roundtrip(temp_dir: Path) -> None:
            """A native ``read`` call returns file content and feeds the second completion."""
            fixture = temp_dir / "fixture.txt"
            fixture.write_text("alpha parity line\nbeta line\n")

            read_call = ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)})
            scripted_turns = [
                native_tool_response(read_call, content="I'll inspect that file."),
                final_response("The file contains alpha parity line."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Read the fixture file and summarize it.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert "alpha parity line" in run.response
            assert tool_event_names(run) == ["read"]
            assert any("alpha parity line" in message for message in tool_result_messages(run))
            assert len(run.invocations) == 2
            # The second backend call must have been given a tool-role message.
            second_call_messages = run.invocations[1].messages
            assert any(message.role == Role.TOOL for message in second_call_messages)
      
        215
        
        216
        
        217
        @pytest.mark.asyncio
        @pytest.mark.parametrize("alias_key", ["file", "filepath"])
        async def test_read_file_alias_roundtrip(temp_dir: Path, alias_key: str) -> None:
            """The ``read`` tool also works when the path is passed under an alias key."""
            fixture = temp_dir / "fixture.txt"
            fixture.write_text("alpha parity line\nbeta line\n")

            aliased_read = ToolCall(id="read-1", name="read", arguments={alias_key: str(fixture)})
            scripted_turns = [
                native_tool_response(aliased_read, content="I'll inspect that file."),
                final_response("The file contains alpha parity line."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Read the fixture file and summarize it.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert "alpha parity line" in run.response
            assert tool_event_names(run) == ["read"]
            assert any("alpha parity line" in message for message in tool_result_messages(run))
      
        243
        
        244
        
        245
        @pytest.mark.asyncio
        async def test_multi_tool_turn_roundtrip(temp_dir: Path) -> None:
            """One assistant turn carrying ``read`` and ``grep`` emits both calls and results."""
            fixture = temp_dir / "fixture.txt"
            fixture.write_text("alpha parity line\nbeta line\ngamma parity line\n")

            read_call = ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)})
            grep_call = ToolCall(
                id="grep-1",
                name="grep",
                arguments={"pattern": "parity", "path": str(fixture)},
            )
            scripted_turns = [
                native_tool_response(
                    read_call,
                    grep_call,
                    content="I'll inspect the file and count parity matches.",
                ),
                final_response("The file has two parity lines, including alpha parity line."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Inspect the fixture and find parity lines.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["read", "grep"]
            assert len(tool_result_messages(run)) == 2
            assert "two parity lines" in run.response
      
        275
        
        276
        
        277
        @pytest.mark.asyncio
        async def test_turn_summary_smoke_for_multi_tool_turn(temp_dir: Path) -> None:
            """The last turn summary reflects a two-iteration, two-tool scripted run."""
            fixture = temp_dir / "fixture.txt"
            fixture.write_text("alpha parity line\nbeta line\ngamma parity line\n")

            read_call = ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)})
            grep_call = ToolCall(
                id="grep-1",
                name="grep",
                arguments={"pattern": "parity", "path": str(fixture)},
            )
            scripted_turns = [
                native_tool_response(
                    read_call,
                    grep_call,
                    content="I'll inspect the file and count parity matches.",
                ),
                final_response("The file has two parity lines, including alpha parity line."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Inspect the fixture and find parity lines.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            turn_summary = run.agent.last_turn_summary
            assert turn_summary is not None
            assert turn_summary.final_response == run.response
            assert turn_summary.iterations == 2
            assert len(turn_summary.assistant_messages) == 2
            assert len(turn_summary.tool_result_messages) == 2
            assert "assistant.tool_batch" in trace_event_names(run)
      
        311
        
        312
        
        313
        @pytest.mark.asyncio
        async def test_write_file_allowed(temp_dir: Path) -> None:
            """A ``write`` call inside the project root creates the file with the given content."""
            target = temp_dir / "allowed.txt"
            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(target), "content": "hello from loader\n"},
            )
            scripted_turns = [
                native_tool_response(write_call, content="I'll create the file now."),
                final_response("Successfully created the file."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Create allowed.txt with a greeting.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert target.read_text() == "hello from loader\n"
            assert "Successfully created the file." in run.response
            assert tool_event_names(run) == ["write"]
      
        340
        
        341
        
        342
        @pytest.mark.asyncio
        async def test_write_file_denied(temp_dir: Path) -> None:
            """In PROMPT mode, a declined confirmation leaves the target file uncreated."""
            target = temp_dir / "denied.txt"
            config = non_streaming_config()
            config.permission_mode = PermissionMode.PROMPT

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(target), "content": "should not exist\n"},
            )
            scripted_turns = [
                native_tool_response(
                    write_call,
                    content="I'll create the file if you approve it.",
                ),
                final_response("I skipped the write as requested."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
                # Check what the prompt was given before refusing it.
                assert tool_name == "write"
                assert "approval" in message.lower()
                assert "active_mode=prompt" in details
                return False

            run = await run_scenario(
                "Create denied.txt with a greeting.",
                backend,
                config=config,
                project_root=temp_dir,
                on_confirmation=deny_confirmation,
            )

            assert not target.exists()
            assert "skipped the write" in run.response.lower()
            assert any(event.type == "confirmation" for event in run.events)
      
        378
        
        379
        
        380
        @pytest.mark.asyncio
        async def test_bash_stdout_roundtrip(temp_dir: Path, monkeypatch: pytest.MonkeyPatch) -> None:
            """``pwd`` run through the bash tool reports the temp directory in its result."""
            monkeypatch.chdir(temp_dir)
            pwd_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pwd"})
            scripted_turns = [
                native_tool_response(pwd_call, content="I'll check the current directory."),
                final_response("Confirmed the working directory."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Tell me the current directory.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            first_result = tool_result_messages(run)[0]
            assert str(temp_dir) in first_result
            assert "Confirmed the working directory." in run.response
      
        402
        
        403
        
        404
        @pytest.mark.asyncio
        async def test_bash_confirmation_prompt_approved(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """In PROMPT mode, an approved bash confirmation lets ``touch`` create the file."""
            monkeypatch.chdir(temp_dir)
            target = temp_dir / "approved.txt"
            config = non_streaming_config()
            config.permission_mode = PermissionMode.PROMPT

            touch_call = ToolCall(id="bash-1", name="bash", arguments={"command": "touch approved.txt"})
            scripted_turns = [
                native_tool_response(touch_call, content="I'll create the file after approval."),
                final_response("The shell command completed."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            async def approve_confirmation(tool_name: str, message: str, details: str) -> bool:
                # Check what the prompt was given before accepting it.
                assert tool_name == "bash"
                assert "approval" in message.lower()
                assert "touch approved.txt" in details
                return True

            run = await run_scenario(
                "Create approved.txt using bash.",
                backend,
                config=config,
                project_root=temp_dir,
                on_confirmation=approve_confirmation,
            )

            assert target.exists()
            assert "shell command completed" in run.response.lower()
            assert any(event.type == "confirmation" for event in run.events)
      
        440
        
        441
        
        442
        @pytest.mark.asyncio
        async def test_bash_confirmation_prompt_denied(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """In PROMPT mode, a declined bash confirmation leaves the file uncreated."""
            monkeypatch.chdir(temp_dir)
            target = temp_dir / "denied-bash.txt"
            config = non_streaming_config()
            config.permission_mode = PermissionMode.PROMPT

            touch_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "touch denied-bash.txt"},
            )
            scripted_turns = [
                native_tool_response(touch_call, content="I'll create the file if you allow it."),
                final_response("I left the shell command undone."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
                # Check what the prompt was given before refusing it.
                assert tool_name == "bash"
                assert "touch denied-bash.txt" in details
                return False

            run = await run_scenario(
                "Create denied-bash.txt using bash.",
                backend,
                config=config,
                project_root=temp_dir,
                on_confirmation=deny_confirmation,
            )

            assert not target.exists()
            assert "left the shell command undone" in run.response.lower()
            assert any(event.type == "confirmation" for event in run.events)
      
        477
        
        478
        
        479
        @pytest.mark.asyncio
        async def test_read_only_mode_denies_write(temp_dir: Path) -> None:
            """READ_ONLY mode rejects a ``write`` call and reports the required mode."""
            config = non_streaming_config()
            config.permission_mode = PermissionMode.READ_ONLY
            config.auto_recover = False

            target = temp_dir / "blocked-by-policy.txt"
            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(target), "content": "denied\n"},
            )
            scripted_turns = [
                native_tool_response(write_call, content="I'll create the file."),
                final_response("The write was blocked."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Create blocked-by-policy.txt.",
                backend,
                config=config,
                project_root=temp_dir,
            )

            assert not target.exists()
            assert any("requires workspace-write" in message for message in tool_result_messages(run))
      
        508
        
        509
        
        510
        @pytest.mark.asyncio
        async def test_read_only_mode_denies_mutating_bash(temp_dir: Path) -> None:
            """READ_ONLY mode rejects a ``touch`` bash command and reports the required mode."""
            config = non_streaming_config()
            config.permission_mode = PermissionMode.READ_ONLY
            config.auto_recover = False

            target = temp_dir / "bash-blocked.txt"
            touch_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": f"touch {target}"},
            )
            scripted_turns = [
                native_tool_response(touch_call, content="I'll create the file with bash."),
                final_response("The bash command was blocked."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Create bash-blocked.txt using bash.",
                backend,
                config=config,
                project_root=temp_dir,
            )

            assert not target.exists()
            assert any("requires workspace-write" in message for message in tool_result_messages(run))
      
        539
        
        540
        
        541
        @pytest.mark.asyncio
        async def test_read_only_mode_allows_safe_bash(temp_dir: Path) -> None:
            """READ_ONLY mode still runs ``pwd``; no result mentions a required mode."""
            config = non_streaming_config()
            config.permission_mode = PermissionMode.READ_ONLY

            pwd_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pwd"})
            scripted_turns = [
                native_tool_response(pwd_call, content="I'll inspect the current directory."),
                final_response("Inspected the current directory."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Show the current directory.",
                backend,
                config=config,
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["bash"]
            assert not any("requires" in message for message in tool_result_messages(run))
      
        564
        
        565
        
        566
        @pytest.mark.asyncio
        async def test_workspace_write_denies_write_outside_root(temp_dir: Path) -> None:
            """Check that a write aimed at a path outside the project root is not performed.

            The target file sits in ``temp_dir.parent``, i.e. outside the directory
            passed as ``project_root``. The confirmation callback always declines.
            The test expects the file to stay absent and at least one tool result to
            mention either "declined" or "outside workspace" (case-insensitive).

            Because the target is outside the per-test directory, it is removed both
            before the run (stale leftovers) and afterwards in a ``finally`` so a
            failing run cannot leak the file into the shared parent directory.
            """
            config = non_streaming_config()
            config.auto_recover = False
            outside = temp_dir.parent / "outside-root.txt"
            # Race-free replacement for an exists()/unlink() pair.
            outside.unlink(missing_ok=True)

            backend = ScriptedBackend(
                completions=[
                    native_tool_response(
                        ToolCall(
                            id="write-1",
                            name="write",
                            arguments={"file_path": str(outside), "content": "outside\n"},
                        ),
                        content="I'll write outside the workspace.",
                    ),
                    final_response("The write was blocked."),
                ]
            )

            async def decline_confirmation(_name: str, _msg: str, _details: str) -> bool:
                # Refuse every confirmation request so a prompted write is never approved.
                return False

            try:
                run = await run_scenario(
                    "Write a file outside the workspace.",
                    backend,
                    config=config,
                    project_root=temp_dir,
                    on_confirmation=decline_confirmation,
                )

                assert not outside.exists()
                assert any(
                    "declined" in message.lower() or "outside workspace" in message.lower()
                    for message in tool_result_messages(run)
                )
            finally:
                # Clean up even when the assertions fail: the file is not under temp_dir.
                outside.unlink(missing_ok=True)
      
        604
        
        605
        
        606
        @pytest.mark.asyncio
        async def test_danger_full_access_allows_dangerous_bash(temp_dir: Path) -> None:
            """Check that a ``chmod`` bash call runs unprompted under DANGER_FULL_ACCESS.

            Expects exactly one ``bash`` tool event, no tool result containing
            ``"requires"``, and no event of type ``"confirmation"``.
            """
            mode_file = temp_dir / "mode.txt"
            mode_file.write_text("hello\n")

            full_access_config = non_streaming_config()
            full_access_config.permission_mode = PermissionMode.DANGER_FULL_ACCESS

            chmod_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": f"chmod 600 {mode_file}"},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(chmod_call, content="I'll change the file permissions."),
                    final_response("Updated the file permissions."),
                ]
            )

            run = await run_scenario(
                "Lock down mode.txt permissions.",
                backend,
                config=full_access_config,
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["bash"]
            assert all("requires" not in message for message in tool_result_messages(run))
            # The run must not have surfaced any confirmation event.
            confirmation_events = [event for event in run.events if event.type == "confirmation"]
            assert not confirmation_events
      
        636
        
        637
        
        638
        @pytest.mark.asyncio
        async def test_prompt_mode_prompts_destructive_write(temp_dir: Path) -> None:
            """Check that PROMPT mode asks before a write and performs it once approved.

            The confirmation callback approves the request and records the detail
            text; the test expects the file to be written, the first recorded detail
            to contain ``active_mode=prompt``, and a ``"confirmation"`` event.
            """
            destination = temp_dir / "prompted.txt"
            prompt_config = non_streaming_config()
            prompt_config.permission_mode = PermissionMode.PROMPT

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(destination), "content": "prompted\n"},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(write_call, content="I'll create the file after approval."),
                    final_response("The file was created."),
                ]
            )
            seen_details: list[str] = []

            async def approve_confirmation(tool_name: str, message: str, details: str) -> bool:
                # Only the write tool is expected to ask; keep the details for later checks.
                assert tool_name == "write"
                seen_details.append(details)
                return True

            run = await run_scenario(
                "Create prompted.txt after approval.",
                backend,
                config=prompt_config,
                project_root=temp_dir,
                on_confirmation=approve_confirmation,
            )

            assert destination.read_text() == "prompted\n"
            assert seen_details and "active_mode=prompt" in seen_details[0]
            assert any(event.type == "confirmation" for event in run.events)
      
        674
        
        675
        
        676
        @pytest.mark.asyncio
        async def test_allow_mode_skips_prompt_for_destructive_write(temp_dir: Path) -> None:
            """Check that ALLOW mode writes a file without invoking the confirmation callback.

            The callback would decline and records every tool name it is asked about;
            the test expects it to stay unused, no ``"confirmation"`` event, the file
            written, and the scripted final answer in the response.
            """
            destination = temp_dir / "allow-mode.txt"
            allow_config = non_streaming_config()
            allow_config.permission_mode = PermissionMode.ALLOW

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(destination), "content": "allow mode\n"},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(write_call, content="I'll create the file directly."),
                    final_response("The file was created."),
                ]
            )
            asked_tools: list[str] = []

            async def unexpected_confirmation(tool_name: str, message: str, details: str) -> bool:
                # Any call is recorded so the test can prove no prompt happened.
                asked_tools.append(tool_name)
                return False

            run = await run_scenario(
                "Create allow-mode.txt directly.",
                backend,
                config=allow_config,
                project_root=temp_dir,
                on_confirmation=unexpected_confirmation,
            )

            assert destination.read_text() == "allow mode\n"
            assert asked_tools == []
            confirmation_events = [event for event in run.events if event.type == "confirmation"]
            assert not confirmation_events
            assert "The file was created." in run.response
      
        712
        
        713
        
        714
        @pytest.mark.asyncio
        async def test_deny_rule_blocks_allowed_mode(temp_dir: Path) -> None:
            """Check that a deny rule in ``.loader/permission-rules.json`` blocks a write in ALLOW mode.

            The rule file denies ``write`` for paths containing ``secrets``. The test
            expects the target to stay absent, a tool result containing
            ``"denied by rule"``, and a ``tool.permission_denied`` trace event.
            """
            rules_dir = temp_dir / ".loader"
            rules_dir.mkdir()
            rules_file = rules_dir / "permission-rules.json"
            rules_file.write_text('{"deny": [{"tool": "write", "path_contains": "secrets"}]}\n')

            secret_file = temp_dir / "secrets.txt"
            allow_config = non_streaming_config()
            allow_config.permission_mode = PermissionMode.ALLOW
            allow_config.auto_recover = False

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(secret_file), "content": "denied\n"},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(write_call, content="I'll write the secret file."),
                    final_response("The write was blocked by policy."),
                ]
            )

            run = await run_scenario(
                "Create secrets.txt.",
                backend,
                config=allow_config,
                project_root=temp_dir,
            )

            assert not secret_file.exists()
            assert any("denied by rule" in message for message in tool_result_messages(run))
            assert "tool.permission_denied" in trace_event_names(run)
      
        749
        
        750
        
        751
        @pytest.mark.asyncio
        async def test_ask_rule_prompts_even_when_mode_would_allow(temp_dir: Path) -> None:
            """Check that an ask rule forces a confirmation prompt even in ALLOW mode.

            The rule file asks for ``write`` on paths containing ``README``; the
            callback declines. The test expects the file to stay absent, the first
            recorded detail to name the matched ask rule, a ``"confirmation"`` event,
            and "declined" in the lower-cased response.
            """
            rules_dir = temp_dir / ".loader"
            rules_dir.mkdir()
            rules_file = rules_dir / "permission-rules.json"
            rules_file.write_text('{"ask": [{"tool": "write", "path_contains": "README"}]}\n')

            readme = temp_dir / "README.md"
            allow_config = non_streaming_config()
            allow_config.permission_mode = PermissionMode.ALLOW

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(readme), "content": "ask rule\n"},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(
                        write_call,
                        content="I'll update the README if you approve it.",
                    ),
                    final_response("The write was declined."),
                ]
            )
            seen_details: list[str] = []

            async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
                # Keep the detail text, then refuse the write.
                seen_details.append(details)
                return False

            run = await run_scenario(
                "Update README.md.",
                backend,
                config=allow_config,
                project_root=temp_dir,
                on_confirmation=deny_confirmation,
            )

            assert not readme.exists()
            assert (
                seen_details
                and "matched_ask_rule=tool=write, path_contains=README" in seen_details[0]
            )
            assert any(event.type == "confirmation" for event in run.events)
            assert "declined" in run.response.lower()
      
        792
        
        793
        
        794
        @pytest.mark.asyncio
        async def test_raw_json_tool_call_fallback(temp_dir: Path) -> None:
            """Check that a tool call streamed as raw JSON text is recovered and executed.

            The first scripted stream delivers a JSON ``read`` call split across two
            chunks; the second delivers the final answer. The test expects a single
            ``read`` tool event, the fixture text in a tool result, and the final
            answer in the response.

            The payload is built with ``json.dumps`` rather than string
            interpolation so that a fixture path containing backslashes or quotes
            (e.g. a Windows temp directory) still yields valid JSON.
            """
            fixture = temp_dir / "fixture.txt"
            fixture.write_text("alpha parity line\n")
            raw_json = json.dumps({"name": "read", "arguments": {"file_path": str(fixture)}})

            backend = ScriptedBackend(
                streams=[
                    [
                        # Split mid-payload so the call only becomes parseable on the last chunk.
                        StreamChunk(content=raw_json[:25], is_done=False),
                        StreamChunk(content=raw_json[25:], full_content=raw_json, is_done=True),
                    ],
                    [
                        StreamChunk(
                            content="Recovered the raw JSON tool call and read the file.",
                            full_content="Recovered the raw JSON tool call and read the file.",
                            is_done=True,
                        )
                    ],
                ]
            )

            run = await run_scenario(
                "Read the fixture file.",
                backend,
                config=AgentConfig(auto_context=False, max_iterations=8),
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["read"]
            assert any("alpha parity line" in message for message in tool_result_messages(run))
            assert "Recovered the raw JSON tool call" in run.response
      
        826
        
        827
        
        828
        @pytest.mark.asyncio
        async def test_raw_json_todowrite_tool_call_fallback(temp_dir: Path) -> None:
            """Check that a ``TodoWrite`` call delivered as raw JSON completion text is executed.

            The payload carries one todo with status ``completed``. The test expects
            a single ``TodoWrite`` tool event, ``.loader/todos/active.json`` to hold
            an empty list afterwards, and the final answer in the response.
            """
            finished_todo = {
                "content": "Run tests",
                "active_form": "Running tests",
                "status": "completed",
            }
            raw_json = json.dumps({"name": "TodoWrite", "arguments": {"todos": [finished_todo]}})
            scripted_turns = [
                CompletionResponse(content=raw_json),
                final_response("Tracked the current todo list."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Track the current work items.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            active_todos_path = temp_dir / ".loader" / "todos" / "active.json"
            assert tool_event_names(run) == ["TodoWrite"]
            assert json.loads(active_todos_path.read_text()) == []
            assert "Tracked the current todo list." in run.response
      
        862
        
        863
        
        864
        @pytest.mark.asyncio
        async def test_raw_json_patch_tool_call_fallback(temp_dir: Path) -> None:
            """Check that a ``patch`` call delivered as raw JSON completion text is applied.

            The payload carries one structured hunk replacing line 2 (``beta``) with
            ``beta updated``. The test expects a single ``patch`` tool event, the
            file content updated accordingly, and the final answer in the response.
            """
            sample = temp_dir / "sample.txt"
            sample.write_text("alpha\nbeta\ngamma\n")

            replace_second_line = {
                "old_start": 2,
                "old_lines": 1,
                "new_start": 2,
                "new_lines": 1,
                "lines": ["-beta", "+beta updated"],
            }
            patch_arguments = {
                "file_path": str(sample),
                "hunks": [replace_second_line],
            }
            raw_json = json.dumps({"name": "patch", "arguments": patch_arguments})
            backend = ScriptedBackend(
                completions=[
                    CompletionResponse(content=raw_json),
                    final_response("Patched sample.txt."),
                ]
            )

            run = await run_scenario(
                "Update sample.txt.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["patch"]
            assert sample.read_text() == "alpha\nbeta updated\ngamma\n"
            assert "Patched sample.txt." in run.response
      
        902
        
        903
        
        904
        @pytest.mark.asyncio
        async def test_native_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None:
            """Check that a native ``patch`` call accepts a unified-diff string in ``patch``.

            The diff replaces line 2 (``beta``) with ``beta updated``. The test
            expects a single ``patch`` tool event, the file content updated
            accordingly, and the final answer in the response.
            """
            sample = temp_dir / "sample.txt"
            sample.write_text("alpha\nbeta\ngamma\n")

            unified_diff = (
                "--- a/sample.txt\n"
                "+++ b/sample.txt\n"
                "@@ -2,1 +2,1 @@\n"
                "-beta\n"
                "+beta updated\n"
            )
            patch_call = ToolCall(
                id="patch-1",
                name="patch",
                arguments={"file_path": str(sample), "patch": unified_diff},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(patch_call, content="I'll patch the file directly."),
                    final_response("Patched sample.txt."),
                ]
            )

            run = await run_scenario(
                "Update sample.txt.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["patch"]
            assert sample.read_text() == "alpha\nbeta updated\ngamma\n"
            assert "Patched sample.txt." in run.response
      
        942
        
        943
        
        944
        @pytest.mark.asyncio
        async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
            """Check that an ``AskUserQuestion`` call delivered as raw JSON text reaches the user hook.

            The question hook verifies the question text and that each option arrives
            as ``"<label> - <description>"``, then answers ``"2"``. The test expects a
            single ``AskUserQuestion`` tool event, a tool result mentioning
            ``"Execute now"``, and the final answer in the response.
            """
            choices = [
                {
                    "label": "Plan first",
                    "description": "Keep the next move documented.",
                },
                {
                    "label": "Execute now",
                    "description": "Start changing code immediately.",
                },
            ]
            question_arguments = {
                "title": "Path Choice",
                "context": "Choose the safer Loader cleanup path.",
                "question": "Which path should we take?",
                "options": choices,
            }
            raw_json = json.dumps({"name": "AskUserQuestion", "arguments": question_arguments})
            backend = ScriptedBackend(
                completions=[
                    CompletionResponse(content=raw_json),
                    final_response("We'll execute now."),
                ]
            )

            async def answer(question: str, options: list[str] | None) -> str:
                # Validate what the hook receives before picking the second option.
                expected_options = [
                    "Plan first - Keep the next move documented.",
                    "Execute now - Start changing code immediately.",
                ]
                assert "Which path should we take?" in question
                assert options == expected_options
                return "2"

            run = await run_scenario(
                "Decide the next path before changing code.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
                on_user_question=answer,
            )

            assert tool_event_names(run) == ["AskUserQuestion"]
            assert any("Execute now" in message for message in tool_result_messages(run))
            assert "We'll execute now." in run.response
      
        992
        
        993
        
        994
        @pytest.mark.asyncio
        async def test_raw_bracket_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
            """Check that a streamed bracket-style ``askuserquestion`` call reaches the user hook.

            The first stream is a single chunk holding the bracket text; the hook
            expects no options and answers ``"Plan first"``. The test expects a single
            ``AskUserQuestion`` tool event, a tool result containing
            ``"answer": "Plan first"``, and the final answer in the response.
            """
            bracket_call = '[calls askuserquestion tool with: question="Which path should we take?"]'
            closing_text = "We'll plan first."
            backend = ScriptedBackend(
                streams=[
                    [StreamChunk(content=bracket_call, full_content=bracket_call, is_done=True)],
                    [StreamChunk(content=closing_text, full_content=closing_text, is_done=True)],
                ]
            )

            async def answer(question: str, options: list[str] | None) -> str:
                # The bracket form carries only a question, so no options are expected.
                assert "Which path should we take?" in question
                assert options is None
                return "Plan first"

            run = await run_scenario(
                "Read the fixture file.",
                backend,
                config=AgentConfig(auto_context=False, max_iterations=8),
                project_root=temp_dir,
                on_user_question=answer,
            )

            assert tool_event_names(run) == ["AskUserQuestion"]
            assert any('"answer": "Plan first"' in message for message in tool_result_messages(run))
            assert "We'll plan first." in run.response
      
        1031
        
        1032
        
        1033
        @pytest.mark.asyncio
        async def test_non_streaming_bracket_ask_user_question_tool_call_fallback(
            temp_dir: Path,
        ) -> None:
            """Check that a bracket-style ``askuserquestion`` call in a completion reaches the user hook.

            Same scenario as the streamed variant, but the bracket text arrives as a
            plain ``CompletionResponse`` under the non-streaming config.
            """
            bracket_call = '[calls askuserquestion tool with: question="Which path should we take?"]'
            scripted_turns = [
                CompletionResponse(content=bracket_call),
                final_response("We'll plan first."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            async def answer(question: str, options: list[str] | None) -> str:
                # The bracket form carries only a question, so no options are expected.
                assert "Which path should we take?" in question
                assert options is None
                return "Plan first"

            run = await run_scenario(
                "Read the fixture file.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
                on_user_question=answer,
            )

            assert tool_event_names(run) == ["AskUserQuestion"]
            assert any('"answer": "Plan first"' in message for message in tool_result_messages(run))
            assert "We'll plan first." in run.response
      
        1062
        
        1063
        
        1064
        @pytest.mark.asyncio
        async def test_native_and_raw_tool_paths_share_executor_trace(temp_dir: Path) -> None:
            """Check that native and raw-text tool calls emit the same executor trace events.

            Two scenarios are run: one where the ``read`` call arrives as a native
            tool call in a completion, and one where it arrives as raw JSON text split
            across two stream chunks. Both runs must trace ``assistant.tool_batch``,
            ``tool.received`` and ``tool.executed``; the ``tool.received`` event must
            report ``source == "native"`` for the first run and ``"raw_text"`` for the
            second.

            The raw payload is built with ``json.dumps`` rather than string
            interpolation so that a fixture path containing backslashes or quotes
            still yields valid JSON.
            """
            native_fixture = temp_dir / "native.txt"
            native_fixture.write_text("native parity line\n")
            native_backend = ScriptedBackend(
                completions=[
                    native_tool_response(
                        ToolCall(id="read-1", name="read", arguments={"file_path": str(native_fixture)}),
                        content="I'll inspect the native tool result.",
                    ),
                    final_response("Native read complete."),
                ]
            )
            native_run = await run_scenario(
                "Read native.txt.",
                native_backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            raw_fixture = temp_dir / "raw.txt"
            raw_fixture.write_text("raw parity line\n")
            raw_json = json.dumps({"name": "read", "arguments": {"file_path": str(raw_fixture)}})
            raw_backend = ScriptedBackend(
                streams=[
                    [
                        # Split mid-payload so the call only becomes parseable on the last chunk.
                        StreamChunk(content=raw_json[:20], is_done=False),
                        StreamChunk(content=raw_json[20:], full_content=raw_json, is_done=True),
                    ],
                    [
                        StreamChunk(
                            content="Raw read complete.",
                            full_content="Raw read complete.",
                            is_done=True,
                        )
                    ],
                ]
            )
            raw_run = await run_scenario(
                "Read raw.txt.",
                raw_backend,
                config=AgentConfig(auto_context=False, max_iterations=8),
                project_root=temp_dir,
            )

            # Both delivery paths must pass through the same traced stages.
            for run in (native_run, raw_run):
                names = trace_event_names(run)
                assert "assistant.tool_batch" in names
                assert "tool.received" in names
                assert "tool.executed" in names

            native_summary = native_run.agent.last_turn_summary
            raw_summary = raw_run.agent.last_turn_summary
            assert native_summary is not None
            assert raw_summary is not None
            # The only expected difference is the recorded origin of the tool call.
            assert any(
                event.name == "tool.received" and event.data["source"] == "native"
                for event in native_summary.trace
            )
            assert any(
                event.name == "tool.received" and event.data["source"] == "raw_text"
                for event in raw_summary.trace
            )
      
        1127
        
        1128
        
        1129
        @pytest.mark.asyncio
        async def test_backend_capability_probe_refreshes_native_tool_mode(
            temp_dir: Path,
        ) -> None:
            """Check that probing a lazily described backend leaves the agent in native tool mode.

            The backend is built with ``supports_native_tools=False`` and only hands
            model details to ``resolve_capability_profile`` after ``describe_model``
            has been awaited. The test asserts that the probe ran, that the agent is
            not using ReAct, that tools were passed on the first invocation, and that
            the recorded tool events name only ``read``.
            """
            probe_file = temp_dir / "fixture.txt"
            probe_file.write_text("capability probe line\n")

            class LazyCapabilityBackend(ScriptedBackend):
                """Scripted backend whose capability profile changes once it has been described."""

                def __init__(self, completions: list[CompletionResponse]) -> None:
                    super().__init__(completions=completions, supports_native_tools=False)
                    self.model = "custom-qwen-build"
                    self._described = False

                async def describe_model(self) -> dict[str, dict[str, list[str]]]:
                    # Remember that the probe happened so capability_profile() can
                    # start reporting model details.
                    self._described = True
                    return {"details": {"families": ["qwen2.5"]}}

                def capability_profile(self):
                    if self._described:
                        details = {"details": {"families": ["qwen2.5"]}}
                    else:
                        details = None
                    return resolve_capability_profile(self.model, model_details=details)

            scripted_turns = [
                native_tool_response(
                    ToolCall(id="read-1", name="read", arguments={"file_path": str(probe_file)}),
                    content="I'll inspect that file after probing capabilities.",
                ),
                final_response("Capability probing enabled the native read."),
            ]
            backend = LazyCapabilityBackend(completions=scripted_turns)

            run = await run_scenario(
                "Read the fixture file after checking model capabilities.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            # The probe must have been awaited at some point during the run.
            assert backend._described
            assert not run.agent.use_react
            first_invocation = run.invocations[0]
            assert first_invocation.tools is not None
            assert tool_event_names(run) == ["read"]
            assert "Capability probing enabled the native read." in run.response
      
        1177
        
        1178
        
        1179
        @pytest.mark.asyncio
        async def test_run_streaming_delegates_to_primary_runtime(temp_dir: Path) -> None:
            """Drive ``Agent.run_streaming`` through a scripted read-then-answer exchange.

            Two streams are scripted: the first carries a ``read`` tool call, the
            second carries the closing text. The test asserts that the yielded events
            include the tool call and a tool result containing the fixture text, and
            that the agent's last turn summary starts with the scripted closing text.
            """
            streamed_file = temp_dir / "streaming.txt"
            streamed_file.write_text("streamed runtime line\n")

            opening_text = "I'll inspect the file now."
            closing_text = "Finished reading the streamed fixture."

            tool_stream = [
                StreamChunk(
                    content=opening_text,
                    full_content=opening_text,
                    tool_calls=[
                        ToolCall(
                            id="read-1",
                            name="read",
                            arguments={"file_path": str(streamed_file)},
                        )
                    ],
                    is_done=True,
                )
            ]
            answer_stream = [
                StreamChunk(content=closing_text, full_content=closing_text, is_done=True)
            ]
            backend = ScriptedBackend(streams=[tool_stream, answer_stream])

            agent = Agent(
                backend=backend,
                config=AgentConfig(auto_context=False, max_iterations=8),
                project_root=temp_dir,
            )

            # Drain the async generator completely before inspecting anything.
            events = []
            async for emitted in agent.run_streaming("Read the streamed fixture file."):
                events.append(emitted)

            assert any(
                emitted.type == "tool_call" and emitted.tool_name == "read"
                for emitted in events
            )
            assert any(
                emitted.type == "tool_result" and "streamed runtime line" in emitted.content
                for emitted in events
            )
            assert agent.last_turn_summary is not None
            assert agent.last_turn_summary.final_response.startswith(closing_text)
      
        1221
        
        1222
        
        1223
        @pytest.mark.asyncio
        async def test_definition_of_done_verify_phase(temp_dir: Path) -> None:
            """Check the definition-of-done statuses and verification for a single scripted write.

            The backend scripts one ``write`` tool call followed by a final response.
            The test asserts that the only verification command is a ``test -f`` on
            the written path, that the statuses are ``draft``, ``verifying``, ``done``
            in that order, and that the turn summary reports verification as passed.
            """
            verified_path = temp_dir / "verified.txt"

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(verified_path), "content": "verified\n"},
            )
            scripted_turns = [
                native_tool_response(write_call, content="I'll create the file now."),
                final_response("Created verified.txt."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Create verified.txt with a line of text.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            expected_command = f"test -f {verified_path}"
            assert verification_commands(run) == [expected_command]
            assert dod_statuses(run) == ["draft", "verifying", "done"]
            assert "Verification:" in run.response

            assert run.agent.last_turn_summary is not None
            assert run.agent.last_turn_summary.verification_status == "passed"
            assert run.agent.last_turn_summary.definition_of_done is not None
      
        1253
        
        1254
        
        1255
        @pytest.mark.asyncio
        async def test_verify_failure_routes_to_fix_loop(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """Check that a failed verification leads to a second, corrected write.

            The backend scripts a syntactically broken script, a final response, a
            corrected rewrite of the same file, and another final response. The test
            asserts that the file ends with the corrected content, that
            ``python broken.py`` was the verification command twice, that ``fixing``
            appears among the definition-of-done statuses, and that verification is
            finally reported as passed.
            """
            # Presumably needed so the relative ``python broken.py`` command resolves
            # against the temporary workspace.
            monkeypatch.chdir(temp_dir)
            script_path = temp_dir / "broken.py"

            broken_source = "print(\n"
            fixed_source = "print('fixed from verify loop')\n"

            scripted_turns = [
                native_tool_response(
                    ToolCall(
                        id="write-1",
                        name="write",
                        arguments={"file_path": str(script_path), "content": broken_source},
                    ),
                    content="I'll create the script.",
                ),
                final_response("Created broken.py."),
                native_tool_response(
                    ToolCall(
                        id="write-2",
                        name="write",
                        arguments={"file_path": str(script_path), "content": fixed_source},
                    ),
                    content="I'll fix the verification failure.",
                ),
                final_response("Fixed broken.py."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Create broken.py and make sure it runs.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert script_path.read_text() == fixed_source
            # One verification attempt per write: the failing one and the passing one.
            assert verification_commands(run) == ["python broken.py", "python broken.py"]
            assert "fixing" in dod_statuses(run)
            assert "Verification:" in run.response
            assert run.agent.last_turn_summary is not None
            assert run.agent.last_turn_summary.verification_status == "passed"
      
        1301
        
        1302
        
        1303
        @pytest.mark.asyncio
        async def test_verify_retry_budget_exhaustion(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """Check the outcome when the scripted fix attempt is still broken.

            ``verification_retry_budget`` is set to 1 and both scripted writes produce
            the same syntactically broken script. The test asserts that the response
            mentions that it couldn't verify, that the last definition-of-done status
            is ``failed``, and that the turn summary reports verification as failed.
            """
            monkeypatch.chdir(temp_dir)
            script_path = temp_dir / "still-broken.py"

            config = non_streaming_config()
            config.verification_retry_budget = 1

            # Both writes use identical broken content, so the retry cannot succeed.
            broken_source = "print(\n"
            scripted_turns = [
                native_tool_response(
                    ToolCall(
                        id="write-1",
                        name="write",
                        arguments={"file_path": str(script_path), "content": broken_source},
                    ),
                    content="I'll create the script.",
                ),
                final_response("Created still-broken.py."),
                native_tool_response(
                    ToolCall(
                        id="write-2",
                        name="write",
                        arguments={"file_path": str(script_path), "content": broken_source},
                    ),
                    content="I'll try one more fix.",
                ),
                final_response("Tried to fix still-broken.py."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Create still-broken.py and make sure it runs.",
                backend,
                config=config,
                project_root=temp_dir,
            )

            lowered_response = run.response.lower()
            assert "couldn't verify" in lowered_response
            final_status = dod_statuses(run)[-1]
            assert final_status == "failed"
            assert run.agent.last_turn_summary is not None
            assert run.agent.last_turn_summary.verification_status == "failed"
      
        1346
        
        1347
        
        1348
        @pytest.mark.asyncio
        async def test_ambiguous_prompt_routes_to_clarify(temp_dir: Path) -> None:
            """Check that an ambiguous prompt goes through clarify before execute.

            The scripted backend first issues an ``AskUserQuestion`` tool call, then
            returns a markdown brief, then a short closing response. The test asserts
            that the first two workflow modes are ``clarify`` and ``execute``, that
            the only artifact kind is ``clarify_brief``, and that the brief path
            recorded on the definition of done exists on disk.
            """
            brief_lines = [
                "## Task Statement",
                "Improve Loader so it feels more like claw-code.",
                "",
                "## Desired Outcome",
                "- Make Loader more reliable without broad redesign.",
                "",
                "## In Scope",
                "- Tighten the runtime workflow around the user-facing goal.",
                "",
                "## Non Goals",
                "- Rebuild unrelated subsystems.",
                "",
                "## Decision Boundaries",
                "- Escalate before changing unrelated UX patterns.",
                "",
                "## Constraints",
                "- Stay inside the current repository.",
                "",
                "## Likely Touchpoints",
                "- Runtime entry points and prompt behavior.",
                "",
                "## Assumptions",
                "- The user wants a narrow runtime-quality improvement.",
                "",
                "## Acceptance Criteria",
                "- The improvement stays focused on runtime behavior.",
            ]
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(
                        ToolCall(
                            id="ask-1",
                            name="AskUserQuestion",
                            arguments={
                                "question": (
                                    "What outcome matters most, and what should stay out of scope?"
                                )
                            },
                        ),
                        content="I need one clarification first.",
                    ),
                    final_response("\n".join(brief_lines)),
                    final_response("I have the brief and can move forward."),
                ]
            )

            async def answer(question: str, options: list[str] | None) -> str:
                # Fail the scenario if a different question is forwarded or options
                # are attached to it.
                assert "outcome matters most" in question.lower()
                assert options is None
                return "Do not redesign the whole interface."

            run = await run_scenario(
                "Improve Loader so it feels more like claw-code.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
                on_user_question=answer,
            )

            # Guard the summary first, as the sibling tests do, so a missing summary
            # fails as a readable assertion instead of an AttributeError.
            summary = run.agent.last_turn_summary
            assert summary is not None
            dod = summary.definition_of_done
            assert dod is not None
            assert workflow_modes(run)[:2] == ["clarify", "execute"]
            assert artifact_kinds(run) == ["clarify_brief"]
            assert dod.clarify_brief is not None
            assert Path(dod.clarify_brief).exists()
      
        1419
        
        1420
        
        1421
        @pytest.mark.asyncio
        async def test_complex_prompt_routes_to_plan(temp_dir: Path) -> None:
            """Check that a complex prompt goes through plan, execute and verify.

            The scripted backend first returns an implementation plan and a
            verification plan separated by a ``<<<VERIFICATION>>>`` line, then a
            ``write`` tool call, then a closing response. The test asserts the first
            three workflow modes, the two plan artifact kinds, the absence of
            ``decomposition`` and ``subtask`` events, and that the ``test -f`` command
            from the plan appears both on the definition of done and among the
            verification commands of the run.
            """
            target = temp_dir / "planned.txt"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- Create {target.name} in the workspace root.",
                "",
                "## Execution Order",
                f"1. Write {target.name}.",
                "2. Confirm the file exists.",
                "",
                "## Risks",
                "- Writing the wrong file path.",
                "",
                "<<<VERIFICATION>>>",
                "",
                "# Verification Plan",
                "",
                "## Acceptance Criteria",
                f"- {target.name} exists in the workspace root.",
                "",
                "## Verification Commands",
                f"- `test -f {target}`",
                "",
                "## Notes",
                "- Use a deterministic file existence check.",
            ]
            backend = ScriptedBackend(
                completions=[
                    final_response("\n".join(plan_lines)),
                    native_tool_response(
                        ToolCall(
                            id="write-1",
                            name="write",
                            arguments={"file_path": str(target), "content": "planned output\n"},
                        ),
                        content="I'll create the file now.",
                    ),
                    final_response("The file is in place."),
                ]
            )

            run = await run_scenario(
                "Implement a persistent workflow mode router with clarify artifacts, "
                "planning artifacts, and verification-plan wiring in the runtime.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            # Guard the summary first, as the sibling tests do, so a missing summary
            # fails as a readable assertion instead of an AttributeError.
            summary = run.agent.last_turn_summary
            assert summary is not None
            dod = summary.definition_of_done
            assert dod is not None
            assert workflow_modes(run)[:3] == ["plan", "execute", "verify"]
            assert artifact_kinds(run) == ["implementation_plan", "verification_plan"]
            assert not any(event.type == "decomposition" for event in run.events)
            assert not any(event.type == "subtask" for event in run.events)
            # The command must match the one written into the scripted verification plan.
            assert dod.verification_commands == [f"test -f {target}"]
            assert verification_commands(run) == [f"test -f {target}"]
      
        1484
        
        1485
        
        1486
        @pytest.mark.asyncio
        async def test_verify_failure_fix_loop_does_not_reroute_workflow(temp_dir: Path) -> None:
            """Check that a fix after failed verification does not plan or clarify again.

            The scripted plan verifies with ``grep -q fixed`` on the target file. The
            first scripted write lacks the word ``fixed`` and the second contains it.
            The test asserts that ``plan`` occurs exactly once and ``clarify`` never
            among the workflow modes, while ``execute`` and ``verify`` each occur at
            least twice.
            """
            retry_path = temp_dir / "retry.txt"

            plan_text = "\n".join(
                (
                    "# Implementation Plan",
                    "",
                    "## File Changes",
                    f"- Create {retry_path.name}.",
                    "",
                    "## Execution Order",
                    f"1. Write {retry_path.name}.",
                    "2. Fix it if verification fails.",
                    "",
                    "## Risks",
                    "- Initial content may be wrong.",
                    "",
                    "<<<VERIFICATION>>>",
                    "",
                    "# Verification Plan",
                    "",
                    "## Acceptance Criteria",
                    "- The file contains the word fixed.",
                    "",
                    "## Verification Commands",
                    f"- `grep -q fixed {retry_path}`",
                    "",
                    "## Notes",
                    "- Retry if the first write misses the target string.",
                )
            )

            scripted_turns = [
                final_response(plan_text),
                # First attempt: content without the word the grep looks for.
                native_tool_response(
                    ToolCall(
                        id="write-1",
                        name="write",
                        arguments={"file_path": str(retry_path), "content": "draft output\n"},
                    ),
                    content="I'll write the first draft.",
                ),
                final_response("First draft is written."),
                # Second attempt: content that contains the word.
                native_tool_response(
                    ToolCall(
                        id="write-2",
                        name="write",
                        arguments={"file_path": str(retry_path), "content": "fixed output\n"},
                    ),
                    content="I'll correct the file.",
                ),
                final_response("The file now contains the fixed output."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_scenario(
                "Implement a persistent workflow mode router with clarify artifacts, "
                "planning artifacts, and verification-plan wiring in the runtime.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            seen_modes = workflow_modes(run)
            assert seen_modes.count("plan") == 1
            assert seen_modes.count("clarify") == 0
            assert seen_modes.count("execute") >= 2
            assert seen_modes.count("verify") >= 2
      
        1555
        
        1556
        
        1557
        @pytest.mark.asyncio
        async def test_conversational_task_skips_verify_phase() -> None:
            """Check that a plain greeting yields no definition-of-done statuses or turn summary.

            A single scripted stream answers ``Hello there.``. The test asserts that
            the response is exactly that text, that no definition-of-done statuses
            were recorded, and that the agent's last turn summary is ``None``.
            """
            greeting = "Hello there."
            reply_stream = [
                StreamChunk(content=greeting, full_content=greeting, is_done=True),
            ]
            backend = ScriptedBackend(streams=[reply_stream])

            run = await run_scenario(
                "hello there",
                backend,
                config=AgentConfig(auto_context=False),
            )

            assert run.response == greeting
            assert not dod_statuses(run)
            assert run.agent.last_turn_summary is None
      
        1572
        
        1573
        
        1574
        @pytest.mark.asyncio
        async def test_explore_mode_skips_dod_and_router(temp_dir: Path) -> None:
            """Check that an explore scenario records no definition of done or workflow modes.

            The backend scripts one ``grep`` tool call and a final answer, run through
            ``run_explore_scenario``. The test asserts that only ``grep`` appears in
            the tool events, that no definition-of-done statuses or workflow modes
            were recorded, that the turn summary has workflow mode ``explore`` with no
            definition of done, that ``explore.completed`` is in the trace, that no
            ``.loader/dod`` path exists in the workspace, and that ``write`` is not
            among the tools passed on the first invocation.
            """
            helper_file = temp_dir / "feature.py"
            helper_file.write_text("def important_helper():\n    return 1\n")

            grep_arguments = {
                "pattern": "important_helper",
                "path": str(temp_dir),
                "include": "*.py",
            }
            scripted_turns = [
                native_tool_response(
                    ToolCall(id="grep-1", name="grep", arguments=grep_arguments),
                    content="I'll search for that helper.",
                ),
                final_response("important_helper is defined in feature.py."),
            ]
            backend = ScriptedBackend(completions=scripted_turns)

            run = await run_explore_scenario(
                "Where is important_helper defined?",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert "feature.py" in run.response
            assert tool_event_names(run) == ["grep"]
            assert not dod_statuses(run)
            assert not workflow_modes(run)

            assert run.agent.last_turn_summary is not None
            assert run.agent.last_turn_summary.definition_of_done is None
            assert run.agent.last_turn_summary.workflow_mode == "explore"
            assert "explore.completed" in trace_event_names(run)

            dod_dir = temp_dir / ".loader" / "dod"
            assert not dod_dir.exists()

            assert run.invocations[0].tools is not None
            offered_tool_names = {tool["name"] for tool in run.invocations[0].tools or []}
            assert "write" not in offered_tool_names
      
        1614
        
        1615
        
        1616
        @pytest.mark.asyncio
        async def test_explore_mode_denies_write(temp_dir: Path) -> None:
            """Check that an explore run rejects a scripted ``write`` call.

            The config is set to ``PermissionMode.WORKSPACE_WRITE`` before the run.
            The test then verifies that the target file is never created, that a
            tool result mentions "read-only", that ``tool.permission_denied`` shows
            up among the trace event names, and that no definition-of-done or
            workflow-mode data is reported.
            """
            blocked_path = temp_dir / "new.txt"

            explore_config = non_streaming_config()
            explore_config.permission_mode = PermissionMode.WORKSPACE_WRITE

            # Scripted model turn: one write attempt, then a plain-text give-up.
            write_attempt = ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(blocked_path),
                    "content": "not allowed\n",
                },
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(write_attempt, content="I'll write a file."),
                    final_response("Explore mode is read-only, so I cannot make that change here."),
                ]
            )

            run = await run_explore_scenario(
                "Create a new file anyway.",
                backend,
                config=explore_config,
                project_root=temp_dir,
            )

            assert not blocked_path.exists()
            assert tool_event_names(run) == ["write"]
            assert any("read-only" in result.lower() for result in tool_result_messages(run))
            assert "cannot make that change" in run.response.lower()
            assert "tool.permission_denied" in trace_event_names(run)
            assert not dod_statuses(run)
            assert not workflow_modes(run)
            dod_dir = temp_dir / ".loader" / "dod"
            assert not dod_dir.exists()
      
        1653
        
        1654
        
        1655
        @pytest.mark.asyncio
        async def test_explore_mode_ignores_global_allow_policy(temp_dir: Path) -> None:
            """Check that an allow rule on disk does not unlock writes in explore runs.

            A ``.loader/permission-rules.json`` file allowing ``write`` on paths
            containing ``new.txt`` is created and the config uses
            ``PermissionMode.ALLOW``. The scripted ``write`` call must still leave
            the file absent, produce a "read-only" tool result, and record
            ``tool.permission_denied`` among the trace event names.
            """
            rules_dir = temp_dir / ".loader"
            rules_dir.mkdir()
            rules_file = rules_dir / "permission-rules.json"
            rules_file.write_text(
                '{"allow": [{"tool": "write", "path_contains": "new.txt"}]}\n'
            )

            blocked_path = temp_dir / "new.txt"
            explore_config = non_streaming_config()
            explore_config.permission_mode = PermissionMode.ALLOW

            # Scripted model turn: one write attempt, then a plain-text give-up.
            write_attempt = ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(blocked_path),
                    "content": "still denied\n",
                },
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(write_attempt, content="I'll write a file."),
                    final_response("Explore mode is read-only, so I cannot make that change here."),
                ]
            )

            run = await run_explore_scenario(
                "Create a new file anyway.",
                backend,
                config=explore_config,
                project_root=temp_dir,
            )

            assert not blocked_path.exists()
            assert any("read-only" in result.lower() for result in tool_result_messages(run))
            assert "tool.permission_denied" in trace_event_names(run)
            assert not dod_statuses(run)
            assert not workflow_modes(run)
      
        1694
        
        1695
        
        1696
        @pytest.mark.asyncio
        async def test_informational_completion_allows_explicit_done_without_continuation(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """Check that a bare "Done." reply to an explanatory prompt is accepted as-is.

            With ``completion_check=True`` in the config, the single scripted
            response must be returned unchanged, no ``completion_check`` event may
            be emitted, no tool may run, and ``hello.py`` must not be created.
            """
            monkeypatch.chdir(temp_dir)
            hello_path = temp_dir / "hello.py"

            backend = ScriptedBackend(completions=[final_response("Done.")])
            checked_config = non_streaming_config(completion_check=True)

            run = await run_scenario(
                "Explain how a hello.py file would work.",
                backend,
                config=checked_config,
                project_root=temp_dir,
            )

            assert not hello_path.exists()
            completion_checks = [
                event for event in run.events if event.type == "completion_check"
            ]
            assert not completion_checks
            assert tool_event_names(run) == []
            assert run.response == "Done."
      
        1721
        
        1722
        
        1723
        @pytest.mark.asyncio
        async def test_tool_result_contract_regression() -> None:
            """Check that two tool-call paths run without raising ``TypeError``.

            Two independent agents are driven in sequence:

            * a "duplicate" agent whose safeguards already hold a record of the
              exact ``write`` call the scripted backend is about to issue, and
            * a "validation" agent whose scripted backend issues a ``bash`` call
              with an empty command.

            Only ``TypeError`` is collected; any other exception propagates. Both
            failures are gathered so a single run reports every offending branch.
            """
            type_errors: list[str] = []

            async def run_and_collect(agent: Agent, prompt: str, label: str) -> None:
                # Record a TypeError instead of aborting so both branches get exercised.
                try:
                    await agent.run(prompt)
                except TypeError as exc:
                    type_errors.append(f"{label} branch raised {exc}")

            duplicate_path = "/tmp/already-created.txt"
            repeated_write = ToolCall(
                id="dup-1",
                name="write",
                arguments={"file_path": duplicate_path, "content": "already there\n"},
            )
            duplicate_backend = ScriptedBackend(
                completions=[
                    native_tool_response(repeated_write, content="I'll create the file again."),
                    final_response("Skipped the duplicate write."),
                ]
            )
            duplicate_agent = Agent(duplicate_backend, config=non_streaming_config())
            # Pre-record an identical action before the run (a fresh dict, not the
            # one handed to the ToolCall).
            duplicate_agent.safeguards.record_action(
                "write",
                {"file_path": duplicate_path, "content": "already there\n"},
            )
            await run_and_collect(
                duplicate_agent,
                "Create /tmp/already-created.txt again.",
                "duplicate",
            )

            empty_command = ToolCall(id="invalid-1", name="bash", arguments={"command": ""})
            validation_backend = ScriptedBackend(
                completions=[
                    native_tool_response(empty_command, content="I'll run that command."),
                    final_response("Blocked the invalid command."),
                ]
            )
            validation_agent = Agent(validation_backend, config=non_streaming_config())
            await run_and_collect(validation_agent, "Run an empty command.", "validation")

            assert not type_errors, "\n".join(type_errors)
      
        1769
        
        1770
        
        1771
        @pytest.mark.asyncio
        async def test_duplicate_read_is_skipped_without_intervening_mutation(
            temp_dir: Path,
        ) -> None:
            """Check that a second identical ``read`` is reported as a skipped duplicate.

            The scripted backend reads ``index.html`` twice in a row with nothing
            in between. Both tool events must be emitted; one result must contain
            the file contents and one must carry the "Skipped - duplicate action"
            and "Already read" markers.
            """
            html_path = temp_dir / "index.html"
            html_path.write_text("alpha parity line\n")

            first_read = ToolCall(id="read-1", name="read", arguments={"file_path": str(html_path)})
            second_read = ToolCall(id="read-2", name="read", arguments={"file_path": str(html_path)})
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(first_read, content="I'll inspect the file."),
                    native_tool_response(second_read, content="I'll reread the same file."),
                    final_response("I'll use the existing file contents instead of rereading."),
                ]
            )

            run = await run_scenario(
                "Inspect index.html and keep moving.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["read", "read"]
            results = tool_result_messages(run)
            assert any("alpha parity line" in result for result in results)
            assert any(
                "Skipped - duplicate action" in result and "Already read" in result
                for result in results
            )
            assert "existing file contents" in run.response
      
        1807
        
        1808
        
        1809
        @pytest.mark.asyncio
        async def test_duplicate_observation_queues_steering_to_reuse_prior_evidence(
            temp_dir: Path,
        ) -> None:
            """Check that rereading a file yields reuse guidance and a steering event.

            The scripted backend globs ``chapters/``, reads ``index.html``, then
            reads ``index.html`` again. A tool result must tell the model to reuse
            the earlier read, and the steering events must include the reuse hint
            and mention ``index.html``.
            """
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            (chapters_dir / "01-introduction.html").write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
            (chapters_dir / "02-setup.html").write_text("<h1>Chapter 2: Setting Up Fortran</h1>\n")
            index_path = temp_dir / "index.html"
            index_path.write_text("broken table of contents\n")

            list_chapters = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters_dir), "pattern": "*.html"},
            )
            first_read = ToolCall(
                id="read-1",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            repeated_read = ToolCall(
                id="read-2",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(list_chapters, content="I'll inspect the chapter inventory first."),
                    native_tool_response(first_read, content="I'll inspect the index next."),
                    native_tool_response(repeated_read, content="I'll reopen the index."),
                    final_response("I'll reuse the earlier evidence and patch the index next."),
                ]
            )

            run = await run_scenario(
                "Update index.html so the table of contents links are correct.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            results = tool_result_messages(run)
            # Keep only steering events that actually carry text.
            steering_texts = [
                event.content
                for event in run.events
                if event.type == "steering" and event.content
            ]

            assert any("reuse the earlier read result instead of rereading" in result for result in results)
            assert any("Reuse the earlier observation instead of repeating it." in text for text in steering_texts)
            assert any("index.html" in text for text in steering_texts)
      
        1867
        
        1868
        
        1869
        @pytest.mark.asyncio
        async def test_relative_file_read_stays_on_recent_external_context(
            temp_dir: Path,
        ) -> None:
            """Check that a relative read after an external read resolves without "not found".

            The scripted backend first reads ``index.html`` from a directory that
            sits next to (not inside) ``temp_dir`` using an absolute path, then
            reads the bare relative name ``index.html``. The project root has no
            ``index.html``, so the second read must either return the external
            file's contents or be skipped as a duplicate; it must never report
            ``File not found: index.html``.

            The external directory lives outside the ``temp_dir`` fixture, so it is
            removed explicitly in a ``finally`` block to avoid leaking files
            between test runs.
            """
            import shutil

            external_dir = temp_dir.parent / f"{temp_dir.name}-external-guide"
            external_dir.mkdir(exist_ok=True)
            try:
                external_index = external_dir / "index.html"
                external_index.write_text("external guide index\n")

                backend = ScriptedBackend(
                    completions=[
                        native_tool_response(
                            ToolCall(
                                id="read-1",
                                name="read",
                                arguments={"file_path": str(external_index)},
                            ),
                            content="I'll inspect the external index first.",
                        ),
                        native_tool_response(
                            ToolCall(
                                id="read-2",
                                name="read",
                                arguments={"file_path": "index.html"},
                            ),
                            content="I'll reopen index.html in the same guide.",
                        ),
                        final_response("I stayed on the external guide instead of snapping back to the repo."),
                    ]
                )

                run = await run_scenario(
                    "Inspect the external guide index twice.",
                    backend,
                    config=non_streaming_config(),
                    project_root=temp_dir,
                )

                assert tool_event_names(run) == ["read", "read"]
                messages = tool_result_messages(run)
                assert any("external guide index" in message for message in messages)
                assert not any("File not found: index.html" in message for message in messages)
                # Only results after the first one count for the second read.
                assert any(
                    "Skipped - duplicate action" in message or "external guide index" in message
                    for message in messages[1:]
                )
            finally:
                # The fixture does not own this sibling directory; clean it up here.
                shutil.rmtree(external_dir, ignore_errors=True)
      
        1915
        
        1916
        
        1917
        @pytest.mark.asyncio
        async def test_blocked_shell_text_rewrite_queues_file_tool_steering(
            temp_dir: Path,
        ) -> None:
            """Check that a ``sed -i`` rewrite draws a warning and file-tool steering.

            The scripted backend first tries to edit ``notes.txt`` through ``bash``
            with ``sed -i`` and then switches to the ``edit`` tool. The file must
            end up with the new value, a tool result must contain the "brittle"
            shell-rewrite warning, and a steering event must point at Loader's file
            tools.
            """
            notes_path = temp_dir / "notes.txt"
            notes_path.write_text("old value\n")

            sed_attempt = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "sed -i '1s/old/new/' notes.txt"},
            )
            edit_followup = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(notes_path),
                    "old_string": "old value",
                    "new_string": "new value",
                },
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(sed_attempt, content="I'll update the file with sed."),
                    native_tool_response(edit_followup, content="I'll switch to the edit tool instead."),
                    final_response("Updated the file with Loader's file tools."),
                ]
            )

            run = await run_scenario(
                "Update notes.txt from old value to new value.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["bash", "edit"]
            assert notes_path.read_text() == "new value\n"
            results = tool_result_messages(run)
            assert any("Shell-based text rewrites are brittle" in result for result in results)
            # Keep only steering events that actually carry text.
            steering_texts = [
                event.content
                for event in run.events
                if event.type == "steering" and event.content
            ]
            assert any("Use Loader's file tools for this text edit" in text for text in steering_texts)
      
        1967
        
        1968
        
        1969
        @pytest.mark.asyncio
        async def test_blocked_html_index_edit_queues_inventory_reuse_steering(
            temp_dir: Path,
        ) -> None:
            """Check that an edit linking to a missing chapter file is flagged.

            Only ``chapters/05-input-output.html`` exists, yet the scripted edit
            rewrites the TOC entry to point at ``05-control-structures.html``. A
            tool result must report that the edited links target missing files.

            NOTE(review): despite "queues ... steering" in the test name, the
            final assertion requires that *no* steering messages were emitted;
            confirm whether the name or the assertion is stale.
            """
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            (chapters_dir / "05-input-output.html").write_text("<h1>Chapter 5: Input and Output</h1>\n")

            index_path = temp_dir / "index.html"
            toc_markup = "".join(
                [
                    '<ul class="chapter-list">\n',
                    '    <li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>\n',
                    '</ul>\n',
                ]
            )
            index_path.write_text(toc_markup)

            list_chapters = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters_dir), "pattern": "*.html"},
            )
            bad_link_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_path),
                    "old_string": '<li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>',
                    "new_string": '<li><a href="chapters/05-control-structures.html">Chapter 5: Control Structures</a></li>',
                },
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(list_chapters, content="I'll check which chapter files exist first."),
                    native_tool_response(bad_link_edit, content="I'll update the TOC entry."),
                    final_response("I'll reuse the known chapter inventory and correct the TOC."),
                ]
            )

            run = await run_scenario(
                "Fix the index table of contents so it matches the chapters directory.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            results = tool_result_messages(run)
            # Keep only steering events that actually carry text.
            steering_texts = [
                event.content
                for event in run.events
                if event.type == "steering" and event.content
            ]

            assert any("Edited HTML links point to files that do not exist" in result for result in results)
            assert steering_texts == []
      
        2025
        
        2026
        
        2027
        @pytest.mark.asyncio
        async def test_blocked_root_html_write_cannot_drop_existing_local_pages(
            temp_dir: Path,
        ) -> None:
            """Check that rewriting a root page without an existing link is flagged.

            ``guide/index.html`` starts with links to both chapter pages. The
            scripted ``write`` replaces it with markup that links only to
            ``installation.html``; a tool result must report that the edited root
            page drops links to existing local pages.
            """
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_path = guide_dir / "index.html"
            (chapters_dir / "introduction.html").write_text("<h1>Introduction</h1>\n")
            (chapters_dir / "installation.html").write_text("<h1>Installation</h1>\n")

            original_links = [
                '<a href="chapters/introduction.html">Introduction</a>',
                '<a href="chapters/installation.html">Installation</a>',
            ]
            index_path.write_text("\n".join(original_links) + "\n")

            # Replacement page that omits the introduction link.
            reduced_page = (
                "<html><body>"
                '<a href="chapters/installation.html">Installation</a>'
                "</body></html>\n"
            )
            root_rewrite = ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(index_path),
                    "content": reduced_page,
                },
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(root_rewrite, content="I'll rewrite the root page."),
                    final_response("I'll keep the guide coherent."),
                ]
            )

            run = await run_scenario(
                "Update the guide root page.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            results = tool_result_messages(run)
            assert any(
                "Edited HTML root page drops links to existing local pages" in result
                for result in results
            )
      
        2080
        
        2081
        
        2082
        @pytest.mark.asyncio
        async def test_full_path_glob_pattern_still_injects_verified_html_inventory(
            temp_dir: Path,
        ) -> None:
            """Run a ``glob`` whose pattern embeds the full directory path.

            The scripted backend issues a single ``glob`` with the chapters
            directory baked into ``pattern`` (no separate ``path`` argument). The
            test requires exactly one ``glob`` tool event.

            NOTE(review): despite "still injects" in the test name, the final
            assertion requires that *no* tool result contains
            "Verified chapter inventory:"; confirm whether the name or the
            assertion is stale.
            """
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            (chapters_dir / "01-introduction.html").write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            (chapters_dir / "02-setup.html").write_text(
                "<h1>Chapter 2: Setting Up Fortran</h1>\n"
            )
            index_path = temp_dir / "index.html"
            index_path.write_text("broken table of contents\n")

            full_path_glob = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"pattern": f"{chapters_dir}/*.html"},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(full_path_glob, content="I'll inspect the chapter inventory first."),
                    final_response("I'll update index.html using the verified inventory."),
                ]
            )

            run = await run_scenario(
                "Fix index.html so the chapter links match the real chapter files.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["glob"]
            results = tool_result_messages(run)
            assert not any("Verified chapter inventory:" in result for result in results)
      
        2121
        
        2122
        
        2123
        @pytest.mark.asyncio
        async def test_verified_html_inventory_blocks_redundant_chapter_reread(
            temp_dir: Path,
        ) -> None:
            """Glob the chapters directory, then read one of the chapter files.

            NOTE(review): despite "blocks redundant chapter reread" in the test
            name, the assertions only require that *no* tool result contains
            "Verified chapter inventory:" or "verified sibling chapter inventory";
            confirm whether the name or the assertions are stale.
            """
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            (chapters_dir / "01-introduction.html").write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            (chapters_dir / "02-setup.html").write_text(
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
            )
            index_path = temp_dir / "index.html"
            index_path.write_text("broken table of contents\n")

            list_chapters = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters_dir), "pattern": "*.html"},
            )
            chapter_read = ToolCall(
                id="read-1",
                name="read",
                arguments={"file_path": str(chapters_dir / "01-introduction.html")},
            )
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(list_chapters, content="I'll inspect the chapter inventory first."),
                    native_tool_response(
                        chapter_read,
                        content="I'll open the first chapter file to extract its title.",
                    ),
                    final_response("I'll update index.html using the verified chapter inventory."),
                ]
            )

            run = await run_scenario(
                "Fix index.html so the chapter links and titles match the real chapter files.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            results = tool_result_messages(run)
            assert not any("Verified chapter inventory:" in result for result in results)
            assert not any("verified sibling chapter inventory" in result for result in results)
      
        2170
        
        2171
        
        2172
        @pytest.mark.asyncio
        async def test_successful_html_toc_edit_blocks_post_success_reread_and_steers_to_finish(
            temp_dir: Path,
        ) -> None:
            """Script glob -> read -> edit -> reread of index.html, then a final answer.

            index.html starts with a table of contents that points at stale chapter
            files; the scripted edit swaps that block for one matching the real
            chapter files. The test asserts that no tool result message contains
            "Semantic verification preview:", that no steering event with content
            was recorded, and that the final response mentions the updated index.
            """
            chapters = temp_dir / "chapters"
            chapters.mkdir()
            (chapters / "01-introduction.html").write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            (chapters / "02-setup.html").write_text(
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
            )
            index_file = temp_dir / "index.html"
            old_block = (
                '<h2>Table of Contents</h2>\n'
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
                '    <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
                '</ul>\n'
            )
            new_block = (
                '<h2>Table of Contents</h2>\n'
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                '</ul>\n'
            )
            # Seed the file with exactly the text the scripted edit passes as
            # ``old_string``. Deriving it from ``new_block`` through chained
            # ``.replace()`` calls yields the same bytes but can silently drift
            # from ``old_block`` if either literal is changed later.
            index_file.write_text(old_block)

            index_path = str(index_file)
            backend = ScriptedBackend(
                completions=[
                    native_tool_response(
                        ToolCall(
                            id="glob-1",
                            name="glob",
                            arguments={"path": str(chapters), "pattern": "*.html"},
                        ),
                        content="I'll inspect the chapter inventory first.",
                    ),
                    native_tool_response(
                        ToolCall(
                            id="read-1",
                            name="read",
                            arguments={"file_path": index_path},
                        ),
                        content="I'll inspect index.html next.",
                    ),
                    native_tool_response(
                        ToolCall(
                            id="edit-1",
                            name="edit",
                            arguments={
                                "file_path": index_path,
                                "old_string": old_block,
                                "new_string": new_block,
                            },
                        ),
                        content="I'll fix the TOC now.",
                    ),
                    # Reread of the same file right after the scripted edit.
                    native_tool_response(
                        ToolCall(
                            id="read-2",
                            name="read",
                            arguments={"file_path": index_path},
                        ),
                        content="I'll reread index.html to confirm the change.",
                    ),
                    final_response(
                        "I updated index.html so the table of contents matches the real chapter files."
                    ),
                ]
            )

            run = await run_scenario(
                "Update index.html so every chapter link and title matches the real HTML files in chapters/.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            messages = tool_result_messages(run)
            # Only steering events that actually carry content are collected.
            steering_messages = [
                event.content
                for event in run.events
                if event.type == "steering" and event.content
            ]

            assert all(
                "Semantic verification preview:" not in message
                for message in messages
            )
            assert steering_messages == []
            assert "updated index.html" in run.response.lower()
      
        2265
        
        2266
        
        2267
        @pytest.mark.asyncio
        async def test_exact_prompt_finishes_when_index_toc_is_already_correct(
            temp_dir: Path,
        ) -> None:
            """Run the long-form user prompt against an index that is already correct.

            The scripted model reads index.html in full, then asks for a ranged
            reread of the same file, then gives its final answer. The test asserts
            that no tool result message contains "Semantic verification preview:",
            that no steering event with content was recorded, that exactly one
            "read" tool_call event outside the "verification" phase was recorded,
            and that the final response says no edit is needed.
            """
            chapter_dir = temp_dir / "chapters"
            chapter_dir.mkdir()
            chapter_sources = (
                ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
                ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
            )
            for file_name, markup in chapter_sources:
                (chapter_dir / file_name).write_text(markup)

            # Every TOC line is newline-terminated, including the last one.
            toc_lines = (
                "<h2>Table of Contents</h2>",
                '        <ul class="chapter-list">',
                '            <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
                '            <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>',
                "        </ul>",
            )
            index_path = temp_dir / "index.html"
            index_path.write_text("".join(f"{line}\n" for line in toc_lines))

            full_read = ToolCall(
                id="read-1",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            ranged_reread = ToolCall(
                id="read-2",
                name="read",
                arguments={"file_path": str(index_path), "offset": 1, "limit": 8},
            )
            scripted = ScriptedBackend(
                completions=[
                    native_tool_response(
                        full_read,
                        content="I'll inspect index.html first.",
                    ),
                    native_tool_response(
                        ranged_reread,
                        content="I'll reread just the table-of-contents lines.",
                    ),
                    final_response(
                        "The table of contents is already correct, so no edit is needed."
                    ),
                ]
            )

            user_prompt = (
                "Have a look at ~/Loader/guides/fortran/index.html, then "
                "~/Loader/guides/fortran/chapters. The table of contents links in "
                "index.html are inaccurate and the href’s are wrong. Let’s update the "
                "links and their link texts to be correct."
            )
            result = await run_scenario(
                user_prompt,
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            tool_messages = tool_result_messages(result)
            steering = [
                evt.content
                for evt in result.events
                if evt.type == "steering" and evt.content
            ]
            # Read tool calls recorded outside the "verification" phase.
            non_verification_reads = [
                evt
                for evt in result.events
                if evt.type == "tool_call"
                and evt.tool_name == "read"
                and evt.phase != "verification"
            ]

            assert not any(
                "Semantic verification preview:" in text for text in tool_messages
            )
            assert steering == []
            assert len(non_verification_reads) == 1
            assert "no edit is needed" in result.response.lower()
      
        2353
        
        2354
        
        2355
        @pytest.mark.asyncio
        async def test_interleaved_reread_is_allowed_once_without_intervening_mutation(
            temp_dir: Path,
        ) -> None:
            """Read the index, read a chapter, then read the index a second time.

            The test asserts that three "read" tool events were recorded, that no
            tool result message contains "Skipped - duplicate action", that the
            index text appears in exactly two tool result messages, and that the
            chapter text appears in at least one.
            """
            index_path = temp_dir / "index.html"
            chapter_path = temp_dir / "chapter-1.html"
            index_path.write_text("table of contents\n")
            chapter_path.write_text("chapter body\n")

            # (tool call id, file to read, assistant text accompanying the call)
            read_script = (
                ("read-1", index_path, "I'll inspect the index first."),
                ("read-2", chapter_path, "I'll inspect the chapter next."),
                ("read-3", index_path, "I'll reopen the index to reconcile the findings."),
            )
            scripted_turns = [
                native_tool_response(
                    ToolCall(
                        id=call_id,
                        name="read",
                        arguments={"file_path": str(path)},
                    ),
                    content=narration,
                )
                for call_id, path, narration in read_script
            ]
            scripted_turns.append(
                final_response("I re-opened the index after checking the chapter.")
            )
            scripted = ScriptedBackend(completions=scripted_turns)

            result = await run_scenario(
                "Inspect the index, inspect a chapter, then return to the index.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(result) == ["read"] * 3
            tool_messages = tool_result_messages(result)
            assert all(
                "Skipped - duplicate action" not in text for text in tool_messages
            )
            index_hits = [text for text in tool_messages if "table of contents" in text]
            assert len(index_hits) == 2
            assert any("chapter body" in text for text in tool_messages)
      
        2406
        
        2407
        
        2408
        @pytest.mark.asyncio
        async def test_repeated_bash_probe_is_allowed_after_mutation(
            temp_dir: Path,
        ) -> None:
            """Run the same ``ls`` command before and after an edit of notes.txt.

            The test asserts that the tool events are bash, edit, bash; that no
            tool result message contains "Skipped - duplicate action"; that
            "notes.txt" appears in at least two tool result messages; and that the
            file on disk holds the edited value.
            """
            target = temp_dir / "notes.txt"
            target.write_text("old value\n")
            list_command = f"ls -1 {temp_dir}"

            # The first and last scripted calls issue the identical command.
            first_listing = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": list_command},
            )
            rewrite_notes = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(target),
                    "old_string": "old value",
                    "new_string": "new value",
                },
            )
            second_listing = ToolCall(
                id="bash-2",
                name="bash",
                arguments={"command": list_command},
            )

            backend = ScriptedBackend(
                completions=[
                    native_tool_response(
                        first_listing,
                        content="I'll inspect the directory first.",
                    ),
                    native_tool_response(
                        rewrite_notes,
                        content="I'll update the file.",
                    ),
                    native_tool_response(
                        second_listing,
                        content="I'll list the directory again after the edit.",
                    ),
                    final_response("I re-ran ls after the edit without hitting duplicate rejection."),
                ]
            )

            run = await run_scenario(
                "Inspect the directory, edit the file, then inspect again.",
                backend,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(run) == ["bash", "edit", "bash"]
            messages = tool_result_messages(run)
            assert all(
                "Skipped - duplicate action" not in message for message in messages
            )
            listing_mentions = [message for message in messages if "notes.txt" in message]
            assert len(listing_mentions) >= 2
            assert target.read_text() == "new value\n"