loader Public

Watch 0 Fork 0 Star 0
Python · 173054 bytes Raw Blame History
  
        1
        """Tests for tool-batch execution on RuntimeContext."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        from types import SimpleNamespace
      
        7
        
        8
        import pytest
      
        9
        
        10
        from loader.llm.base import Message, Role, ToolCall
      
        11
        from loader.runtime.context import RuntimeContext
      
        12
        from loader.runtime.dod import (
      
        13
            DefinitionOfDoneStore,
      
        14
            VerificationEvidence,
      
        15
            create_definition_of_done,
      
        16
        )
      
        17
        from loader.runtime.events import AgentEvent, TurnSummary
      
        18
        from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
      
        19
        from loader.runtime.permissions import (
      
        20
            PermissionMode,
      
        21
            build_permission_policy,
      
        22
            load_permission_rules,
      
        23
        )
      
        24
        from loader.runtime.reasoning_types import (
      
        25
            ActionVerification,
      
        26
            ConfidenceAssessment,
      
        27
            ConfidenceLevel,
      
        28
        )
      
        29
        from loader.runtime.recovery import RecoveryContext
      
        30
        from loader.runtime.tool_batches import (
      
        31
            ToolBatchRunner,
      
        32
        )
      
        33
        from loader.runtime.tool_batches import (
      
        34
            _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
      
        35
        )
      
        36
        from loader.runtime.workflow import sync_todos_to_definition_of_done
      
        37
        from loader.tools.base import ToolResult as RegistryToolResult
      
        38
        from loader.tools.base import create_default_registry
      
        39
        from tests.helpers.runtime_harness import ScriptedBackend
      
        40
        
        41
        
        42
        class FakeSession:
            """In-memory session double that records messages and workflow timeline entries."""

            def __init__(self, messages: list[Message]) -> None:
                # Start with an empty timeline and a private copy of the seed
                # messages, so later appends never touch the caller's list.
                self.workflow_timeline = []
                self.messages = [*messages]

            def append(self, message: Message) -> None:
                """Record ``message`` at the end of the conversation."""
                self.messages.append(message)

            def append_workflow_timeline_entry(self, entry) -> None:
                """Record ``entry`` at the end of the workflow timeline."""
                self.workflow_timeline.append(entry)
      
        52
        
        53
        
        54
        class FakeCodeFilter:
            """Code-filter double whose ``reset`` does nothing."""

            def reset(self) -> None:
                """No-op: this fake keeps no state to clear."""
      
        57
        
        58
        
        59
        class FakeSafeguards:
            """Safeguards double: content passes through unchanged, steering is never
            requested, and ``detect_loop`` reports a result chosen at construction time.
            """

            def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
                # Opaque placeholders; this fake never calls into them.
                self.action_tracker = object()
                self.validator = object()
                self.code_filter = FakeCodeFilter()
                # Canned answer handed back by detect_loop().
                self._detect_loop_result = detect_loop_result

            def filter_stream_chunk(self, content: str) -> str:
                """Return the streamed chunk untouched."""
                return content

            def filter_complete_content(self, content: str) -> str:
                """Return the complete content untouched."""
                return content

            def should_steer(self) -> bool:
                """Always report that no steering is needed."""
                return False

            def get_steering_message(self) -> str | None:
                """Never supply a steering message."""

            def record_response(self, content: str) -> None:
                """Ignore the response; nothing is recorded."""

            def detect_text_loop(self, content: str) -> tuple[bool, str]:
                """Always report that no text loop was found."""
                return (False, "")

            def detect_loop(self) -> tuple[bool, str]:
                """Return the loop-detection result supplied to the constructor."""
                return self._detect_loop_result
      
        86
        
        87
        
        88
        class FakeExecutor:
            """Executor double that replays queued outcomes in order and logs every call."""

            def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
                # Every tool call received, in arrival order.
                self.calls: list[ToolCall] = []
                # Private copy of the scripted outcomes; consumed from the front.
                self._outcomes = [*outcomes]

            async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
                """Log ``tool_call`` and return the next queued outcome.

                Extra keyword arguments are accepted and ignored.

                Raises:
                    AssertionError: if no outcome is left in the queue.
                """
                self.calls.append(tool_call)
                if self._outcomes:
                    return self._outcomes.pop(0)
                raise AssertionError("No fake tool outcome queued")
      
        98
        
        99
        
        100
        def build_context(
            *,
            temp_dir: Path,
            messages: list[Message],
            safeguards: FakeSafeguards,
            assess_confidence,
            verify_action,
            recovery_context: RecoveryContext | None = None,
            confidence_scoring: bool = False,
            verification: bool = False,
            auto_recover: bool = True,
            min_confidence_for_action: int = 3,
        ) -> RuntimeContext:
            """Assemble a ``RuntimeContext`` for tool-batch tests.

            The context uses the default tool registry rooted at ``temp_dir``, a
            ``WORKSPACE_WRITE`` permission policy built from the rules loaded from
            ``temp_dir``, a ``ScriptedBackend``, and a ``FakeSession`` seeded with
            ``messages``.

            Args:
                temp_dir: Project root, workspace root, and permission-rule location.
                messages: Initial messages for the fake session.
                safeguards: Safeguards double stored on the context.
                assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
                verify_action: Callable exposed as ``reasoning.verify_action``.
                recovery_context: Optional pre-existing recovery context.
                confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
                verification: Value for ``config.reasoning.verification``.
                auto_recover: Value for ``config.auto_recover``.
                min_confidence_for_action: Value for
                    ``config.reasoning.min_confidence_for_action``.

            Returns:
                The constructed ``RuntimeContext`` in ``"execute"`` workflow mode.
            """
            tool_registry = create_default_registry(temp_dir)
            tool_registry.configure_workspace_root(temp_dir)

            permission_rules = load_permission_rules(temp_dir)
            permission_policy = build_permission_policy(
                active_mode=PermissionMode.WORKSPACE_WRITE,
                workspace_root=temp_dir,
                tool_requirements=tool_registry.get_tool_requirements(),
                rules=permission_rules.rules,
            )

            # Plain namespaces stand in for the real config objects; only the
            # knobs exposed as parameters vary between tests.
            reasoning_settings = SimpleNamespace(
                rollback=False,
                show_rollback_plan=False,
                completion_check=True,
                max_continuation_prompts=5,
                self_critique=False,
                confidence_scoring=confidence_scoring,
                min_confidence_for_action=min_confidence_for_action,
                verification=verification,
            )
            runtime_config = SimpleNamespace(
                force_react=False,
                max_recovery_attempts=2,
                auto_recover=auto_recover,
                reasoning=reasoning_settings,
            )
            reasoning_services = SimpleNamespace(
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )

            return RuntimeContext(
                project_root=temp_dir,
                backend=ScriptedBackend(),
                registry=tool_registry,
                session=FakeSession(messages),  # type: ignore[arg-type]
                config=runtime_config,
                capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
                project_context=None,
                permission_policy=permission_policy,
                permission_config_status=permission_rules,
                workflow_mode="execute",
                safeguards=safeguards,
                reasoning=reasoning_services,
                recovery_context=recovery_context,
            )
      
        155
        
        156
        
        157
        def tool_outcome(
            *,
            tool_call: ToolCall,
            output: str,
            is_error: bool,
            state: ToolExecutionState = ToolExecutionState.EXECUTED,
            metadata: dict[str, object] | None = None,
        ) -> ToolExecutionOutcome:
            """Build a ``ToolExecutionOutcome`` in which every text field is ``output``.

            Args:
                tool_call: The call this outcome answers; its ``id`` tags the result message.
                output: Text used for the display, result, event, and registry output.
                is_error: Error flag copied onto the message, outcome, and registry result.
                state: Execution state recorded on the outcome.
                metadata: Registry-result metadata; a falsy value becomes an empty dict.
            """
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=output,
                result_content=output,
                is_error=is_error,
            )
            registry_result = RegistryToolResult(
                output=output,
                is_error=is_error,
                metadata=metadata if metadata else {},
            )
            return ToolExecutionOutcome(
                tool_call=tool_call,
                state=state,
                message=result_message,
                event_content=output,
                is_error=is_error,
                result_output=output,
                registry_result=registry_result,
            )
      
        183
        
        184
        
        185
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
            """With a LOW confidence assessment the executor is never called, a
            low-confidence warning is appended as a user message, and a
            ``confidence`` event is emitted.
            """
            seen: dict[str, str] = {}
            emitted: list[AgentEvent] = []

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Keep the context string handed to the scorer for the assertion below.
                seen["context"] = context
                return ConfidenceAssessment(
                    action=f"{tool_name} with {tool_args}",
                    tool_name=tool_name,
                    tool_args=tool_args,
                    level=ConfidenceLevel.LOW,
                    reasoning="Need to inspect the target first.",
                    risks=["Unknown target file"],
                )

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for skipped actions")

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            history = [
                Message(role=Role.USER, content="Please inspect the project."),
                Message(role=Role.ASSISTANT, content="I will read the file next."),
            ]
            ctx = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                confidence_scoring=True,
                min_confidence_for_action=3,
            )
            batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
            read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            fake_executor = FakeExecutor(
                [tool_outcome(tool_call=read_call, output="unused", is_error=False)]
            )

            batch_result = await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Nothing was executed...
            assert batch_result.actions_taken == []
            assert fake_executor.calls == []
            # ...the scorer saw the conversation...
            assert "Please inspect the project." in seen["context"]
            # ...and the warning was queued as a user message.
            last_message = ctx.session.messages[-1]
            assert last_message.role == Role.USER
            assert "[LOW CONFIDENCE WARNING]" in last_message.content
            assert "confidence" in [event.type for event in emitted]
      
        244
        
        245
        
        246
        @pytest.mark.asyncio
        async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
            """A failing tool call with ``auto_recover`` on leaves a recovery context set,
            records the tool result message, and emits a ``recovery`` event.
            """
            emitted: list[AgentEvent] = []

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for failed actions")

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            ctx = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=True,
            )
            batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
            failing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
            fake_executor = FakeExecutor(
                [tool_outcome(tool_call=failing_call, output="command failed", is_error=True)]
            )
            turn_summary = TurnSummary(final_response="")

            await batch_runner.execute_batch(
                tool_calls=[failing_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=turn_summary,
                dod=create_definition_of_done("Run tests"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert ctx.recovery_context is not None
            assert turn_summary.tool_result_messages
            # The newest session message is the newest recorded tool result.
            assert ctx.session.messages[-1] == turn_summary.tool_result_messages[-1]
            assert any(event.type == "recovery" for event in emitted)
      
        289
        
        290
        
        291
        @pytest.mark.asyncio
        async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
            """The ``tool_result`` event carries the registry result's metadata unchanged."""

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "python -m http.server 8000", "background": True},
            )
            # Metadata attached to the scripted outcome; it must reach the event as-is.
            metadata = {
                "job_id": "bash-1",
                "status": "running",
                "background": True,
            }
            executor = FakeExecutor(
                [
                    tool_outcome(
                        tool_call=tool_call,
                        output="Started bash job bash-1",
                        is_error=False,
                        metadata=metadata,
                    )
                ]
            )
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Launch a preview server"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Collect matches explicitly: a bare next() on an exhausted generator
            # inside a coroutine surfaces as "RuntimeError: coroutine raised
            # StopIteration", which hides the real reason for the failure.
            tool_results = [event for event in events if event.type == "tool_result"]
            assert tool_results, "expected at least one tool_result event"
            assert tool_results[0].tool_metadata == metadata
      
        349
        
        350
        
        351
        @pytest.mark.asyncio
        async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
            """With verification on, the context's ``verify_action`` receives the tool
            output, the existing recovery context records the successful step, and a
            ``verification`` event is emitted.
            """
            verified_results: list[str] = []
            emitted: list[AgentEvent] = []

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Log what was verified, then report a failed verification.
                verified_results.append(result)
                return ActionVerification(
                    tool_name=tool_name,
                    tool_args=tool_args,
                    expected_outcome="Success",
                    actual_result=result,
                    verified=False,
                    discrepancies=["File contents did not match"],
                    needs_correction=True,
                    correction_suggestion="Read the file before editing again.",
                )

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            prior_recovery = RecoveryContext(
                original_tool="edit",
                original_args={"file_path": "README.md"},
            )
            ctx = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=prior_recovery,
                verification=True,
            )
            batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
            read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            fake_executor = FakeExecutor(
                [tool_outcome(tool_call=read_call, output="file contents", is_error=False)]
            )

            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert verified_results == ["file contents"]
            # The same recovery object is still attached and has logged the read.
            assert ctx.recovery_context is prior_recovery
            assert prior_recovery.successful_steps == [
                ("read", {"file_path": "README.md"})
            ]
            last_message = ctx.session.messages[-1]
            assert last_message.role == Role.TOOL
            assert last_message.content == "file contents"
            assert any(event.type == "verification" for event in emitted)
      
        414
        
        415
        
        416
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
            temp_dir: Path,
        ) -> None:
            """A successful ``bash`` listing during recovery keeps the existing recovery
            context attached and records the command as a successful step.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Recovery already in progress: one failed read of a missing chapter.
            missing_chapter = {"file_path": "chapters/04-data-types.html"}
            prior_recovery = RecoveryContext(original_tool="read", original_args=missing_chapter)
            prior_recovery.add_attempt(
                "read",
                {"file_path": "chapters/04-data-types.html"},
                "File not found",
            )

            ctx = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=prior_recovery,
                auto_recover=False,
            )
            batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
            listing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "ls chapters"})
            fake_executor = FakeExecutor(
                [tool_outcome(tool_call=listing_call, output="01-introduction.html", is_error=False)]
            )
            turn_summary = TurnSummary(final_response="")

            await batch_runner.execute_batch(
                tool_calls=[listing_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert ctx.recovery_context is prior_recovery
            assert prior_recovery.successful_steps == [
                ("bash", {"command": "ls chapters"})
            ]
      
        482
        
        483
        
        484
        @pytest.mark.asyncio
        async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``patch`` call during recovery leaves the context with no
            recovery context attached.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Recovery already in progress: one failed read of a missing chapter.
            missing_chapter = {"file_path": "chapters/04-data-types.html"}
            prior_recovery = RecoveryContext(original_tool="read", original_args=missing_chapter)
            prior_recovery.add_attempt(
                "read",
                {"file_path": "chapters/04-data-types.html"},
                "File not found",
            )

            ctx = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=prior_recovery,
                auto_recover=False,
            )
            batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

            single_hunk = {"old_start": 1, "old_lines": 1, "new_start": 1, "new_lines": 1, "lines": ["-a", "+b"]}
            patch_call = ToolCall(
                id="patch-1",
                name="patch",
                arguments={"file_path": "index.html", "hunks": [single_hunk]},
            )
            fake_executor = FakeExecutor(
                [tool_outcome(tool_call=patch_call, output="Patched index.html", is_error=False)]
            )
            turn_summary = TurnSummary(final_response="")

            await batch_runner.execute_batch(
                tool_calls=[patch_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert ctx.recovery_context is None
      
        550
        
        551
        
        552
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_duplicate_observation_nudge(
            temp_dir: Path,
        ) -> None:
            """A duplicate ``read`` outcome queues exactly one persistent steering message.

            The implementation plan attached to the definition of done lists
            ``chapters/04-variables.html``, which this test never creates on disk.
            The queued message is expected to tell the model to reuse the earlier
            observation and to write that file; nothing may reach the ephemeral
            steering callback.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            chapters_dir = temp_dir / "chapters"
            index_path = temp_dir / "index.html"
            missing_chapter = chapters_dir / "04-variables.html"
            chapter_one_heading = "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            # Chapter files that do exist on disk, in the order the plan lists them.
            chapter_markup = {
                "01-introduction.html": "<h1>Intro</h1>\n",
                "02-setup.html": "<h1>Setup</h1>\n",
                "03-basics.html": "<h1>Basics</h1>\n",
            }

            # Prior conversation: a glob observation, one chapter read with its
            # result, and a pending read of the index.
            glob_observation = (
                "Observation [glob]: Result: "
                f"{temp_dir}/chapters/01-introduction.html\n"
                f"{temp_dir}/chapters/02-setup.html\n"
                f"{temp_dir}/chapters/03-basics.html"
            )
            history = [
                Message(role=Role.TOOL, content=glob_observation, tool_results=[]),
                Message(
                    role=Role.ASSISTANT,
                    content="I already inspected the first chapter title.",
                    tool_calls=[
                        ToolCall(
                            id="read-ch1",
                            name="read",
                            arguments={"file_path": str(chapters_dir / "01-introduction.html")},
                        )
                    ],
                ),
                Message.tool_result_message(
                    tool_call_id="read-ch1",
                    display_content=chapter_one_heading,
                    result_content=chapter_one_heading,
                ),
                Message(
                    role=Role.ASSISTANT,
                    content="I should update the index now.",
                    tool_calls=[
                        ToolCall(
                            id="read-index",
                            name="read",
                            arguments={"file_path": str(index_path)},
                        )
                    ],
                ),
            ]
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # The workspace is populated only after the context has been built.
            chapters_dir.mkdir()
            index_path.write_text("<ul></ul>\n")
            for filename, markup in chapter_markup.items():
                (chapters_dir / filename).write_text(markup)

            planned_paths = [
                index_path,
                *(chapters_dir / filename for filename in chapter_markup),
                missing_chapter,
            ]
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{path}`" for path in planned_paths)
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context.session.current_task = (
                f"Update {index_path} with the right chapter links."
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            repeated_read = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            duplicate_message = (
                "[Skipped - duplicate action: Already read "
                f"{index_path} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=repeated_read,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=repeated_read.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Fix the chapter links")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items.append("Create the remaining chapter files")
            await runner.execute_batch(
                tool_calls=[repeated_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            nudge = persistent_messages[0]
            expected_fragments = (
                "Reuse the earlier observation instead of repeating it.",
                "A declared output artifact is still missing.",
                "Resume by creating `04-variables.html` now.",
                f"Prefer one `write` call for `{missing_chapter}` instead of more rereads.",
            )
            for fragment in expected_fragments:
                assert fragment in nudge
            assert ephemeral_messages == []
      
        700
        
        701
        
        702
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
            temp_dir: Path,
        ) -> None:
            """A stale ``TodoWrite`` after a successful ``write`` keeps the file todo completed.

            The batch holds a ``write`` for ``03-first-website.html`` followed by a
            ``TodoWrite`` that still reports both todos as pending. Afterwards the
            chapter-3 todo must be in ``completed_items`` and absent from
            ``pending_items``, while the chapter-4 todo remains pending.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            def pending_todos() -> list[dict[str, str]]:
                # Build fresh dicts on every call so the DoD seed, the tool-call
                # arguments and the outcome metadata never share objects.
                return [
                    {
                        "content": "Create 03-first-website.html",
                        "active_form": "Creating 03-first-website.html",
                        "status": "pending",
                    },
                    {
                        "content": "Create 04-configuration-basics.html",
                        "active_form": "Creating 04-configuration-basics.html",
                        "status": "pending",
                    },
                ]

            finished_todo = "Create 03-first-website.html"
            remaining_todo = "Create 04-configuration-basics.html"

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            sync_todos_to_definition_of_done(dod, pending_todos())

            chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
            chapter_path.parent.mkdir(parents=True)
            write_call = ToolCall(
                id="write-ch3",
                name="write",
                arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
            )
            stale_todo_call = ToolCall(
                id="todo-stale",
                name="TodoWrite",
                arguments={"todos": pending_todos()},
            )
            write_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_path}",
                is_error=False,
            )
            stale_todo_outcome = tool_outcome(
                tool_call=stale_todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": pending_todos()},
            )
            executor = FakeExecutor([write_outcome, stale_todo_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call, stale_todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert finished_todo in dod.completed_items
            assert finished_todo not in dod.pending_items
            assert remaining_todo in dod.pending_items
      
        819
        
        820
        
        821
        @pytest.mark.asyncio
        async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
            temp_dir: Path,
        ) -> None:
            """Run a successful ``glob`` over the chapter directory and inspect the fallout.

            NOTE(review): despite the test name, the assertions below require that
            *no* steering message is queued (persistent or ephemeral) and that the
            single recorded tool-result message does not contain
            ``Verified chapter inventory:``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Workspace: two chapter files plus an index with an empty list.
            chapters = temp_dir / "chapters"
            chapters.mkdir()
            chapter_markup = {
                "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
                "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
            }
            for filename, markup in chapter_markup.items():
                (chapters / filename).write_text(markup)
            index_path = temp_dir / "index.html"
            index_path.write_text("<ul></ul>\n")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = (
                f"Update {index_path} so the chapter links match the sibling files."
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters), "pattern": "*.html"},
            )
            # The fake glob output lists the chapter files one per line.
            glob_listing = "\n".join(str(chapters / filename) for filename in chapter_markup)
            executor = FakeExecutor(
                [tool_outcome(tool_call=glob_call, output=glob_listing, is_error=False)]
            )

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Fix the chapter links")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages == []
            assert ephemeral_messages == []
            recorded = summary.tool_result_messages
            assert len(recorded) == 1
            assert "Verified chapter inventory:" not in recorded[0].content
      
        905
        
        906
        
        907
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
            temp_dir: Path,
        ) -> None:
            """Run a successful ``edit`` of the index table of contents and inspect the fallout.

            ``index.html`` is written with the corrected chapter list up front, so
            the file already holds the post-edit content when the fake ``edit``
            outcome is processed.

            NOTE(review): the test name suggests a completion marker, but the only
            checks here are that no tool-result message contains
            ``Semantic verification preview:`` and that neither steering callback
            received a message.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            task = (
                "Update index.html so every chapter link and title matches the real HTML files in chapters/."
            )

            chapters = temp_dir / "chapters"
            chapters.mkdir()
            chapter_markup = {
                "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
                "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
            }
            for filename, markup in chapter_markup.items():
                (chapters / filename).write_text(markup)

            index_path = temp_dir / "index.html"
            # Outdated list the edit call claims to replace.
            old_block = (
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
                '    <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
                "</ul>\n"
            )
            # Corrected list whose links and titles match the chapter files above.
            new_block = (
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                "</ul>\n"
            )
            index_path.write_text(new_block)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = task
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_path),
                    "old_string": old_block,
                    "new_string": new_block,
                },
            )
            edit_outcome = tool_outcome(
                tool_call=edit_call,
                output=f"Successfully edited {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([edit_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done(task)
            await runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not any(
                "Semantic verification preview:" in message.content
                for message in summary.tool_result_messages
            )
            assert persistent_messages == []
            assert ephemeral_messages == []
      
        1007
        
        1008
        
        1009
        @pytest.mark.asyncio
      
        1010
        async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
      
        1011
            temp_dir: Path,
      
        1012
        ) -> None:
      
        1013
            async def assess_confidence(
      
        1014
                tool_name: str,
      
        1015
                tool_args: dict,
      
        1016
                context: str,
      
        1017
            ) -> ConfidenceAssessment:
      
        1018
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        1019
        
        1020
            async def verify_action(
      
        1021
                tool_name: str,
      
        1022
                tool_args: dict,
      
        1023
                result: str,
      
        1024
                expected: str = "",
      
        1025
            ) -> ActionVerification:
      
        1026
                raise AssertionError("Verification should not run for this scenario")
      
        1027
        
        1028
            chapters = temp_dir / "chapters"
      
        1029
            chapters.mkdir()
      
        1030
            (chapters / "01-introduction.html").write_text(
      
        1031
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
      
        1032
            )
      
        1033
            (chapters / "02-setup.html").write_text(
      
        1034
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
      
        1035
            )
      
        1036
            index_path = temp_dir / "index.html"
      
        1037
            index_path.write_text(
      
        1038
                "<h2>Table of Contents</h2>\n"
      
        1039
                '<ul class="chapter-list">\n'
      
        1040
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
      
        1041
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
      
        1042
                "</ul>\n"
      
        1043
            )
      
        1044
        
        1045
            prompt = (
      
        1046
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
      
        1047
                "for the structure and cadence of the guide. We are going to make an all "
      
        1048
                "new equally thorough guide on how to use the nginx tool."
      
        1049
            )
      
        1050
        
        1051
            context = build_context(
      
        1052
                temp_dir=temp_dir,
      
        1053
                messages=[],
      
        1054
                safeguards=FakeSafeguards(),
      
        1055
                assess_confidence=assess_confidence,
      
        1056
                verify_action=verify_action,
      
        1057
                auto_recover=False,
      
        1058
            )
      
        1059
            context.session.current_task = prompt  # type: ignore[attr-defined]
      
        1060
            persistent_messages: list[str] = []
      
        1061
            ephemeral_messages: list[str] = []
      
        1062
            context.queue_steering_message_callback = persistent_messages.append
      
        1063
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        1064
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        1065
            tool_call = ToolCall(
      
        1066
                id="read-index",
      
        1067
                name="read",
      
        1068
                arguments={"file_path": str(index_path)},
      
        1069
            )
      
        1070
            executor = FakeExecutor(
      
        1071
                [
      
        1072
                    tool_outcome(
      
        1073
                        tool_call=tool_call,
      
        1074
                        output=index_path.read_text(),
      
        1075
                        is_error=False,
      
        1076
                    )
      
        1077
                ]
      
        1078
            )
      
        1079
        
        1080
            summary = TurnSummary(final_response="")
      
        1081
            await runner.execute_batch(
      
        1082
                tool_calls=[tool_call],
      
        1083
                tool_source="assistant",
      
        1084
                pending_tool_calls_seen=set(),
      
        1085
                emit=_noop_emit,
      
        1086
                summary=summary,
      
        1087
                dod=create_definition_of_done(prompt),
      
        1088
                executor=executor,  # type: ignore[arg-type]
      
        1089
                on_confirmation=None,
      
        1090
                on_user_question=None,
      
        1091
                emit_confirmation=None,
      
        1092
                consecutive_errors=0,
      
        1093
            )
      
        1094
        
        1095
            assert persistent_messages == []
      
        1096
            assert ephemeral_messages == []
      
        1097
            assert all(
      
        1098
                "Semantic verification preview:" not in message.content
      
        1099
                for message in summary.tool_result_messages
      
        1100
            )
      
        1101
        
        1102
        
        1103
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
            temp_dir: Path,
        ) -> None:
            """Check steering after a successful read of a reference chapter.

            Three pending todos are synced into the definition of done and the
            implementation plan lists ``chapters/`` before ``index.html``. After
            one successful ``read`` batch the test asserts that:

            * the discovery todo appears in ``dod.completed_items``;
            * a persistent steering message names the directory-structure todo
              as the next pending item and asks to create ``chapters/``;
            * no persistent message mentions ``01-introduction.html``;
            * no ephemeral steering message was queued.
            """

            async def confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                """Fail the test if a confidence assessment is requested."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                """Fail the test if action verification is requested."""
                raise AssertionError("Verification should not run for this scenario")

            # Reference chapter that the fake tool call "reads".
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            # Planned outputs: only named in the plan, never created on disk here.
            guide_dir = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            index_file = guide_dir / "index.html"
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{chapters_dir}/`\n"
                f"- `{index_file}`\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=confidence_must_not_run,
                verify_action=verification_must_not_run,
                auto_recover=False,
            )
            # Capture every steering message the runner queues.
            persistent_steering: list[str] = []
            ephemeral_steering: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_steering.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            discovery_todo = (
                "Examine the existing Fortran guide structure to understand the cadence and format"
            )
            definition = create_definition_of_done("Create an equally thorough nginx guide.")
            definition.implementation_plan = str(plan_file)
            sync_todos_to_definition_of_done(
                definition,
                [
                    {
                        "content": todo,
                        "active_form": f"Working on: {todo}",
                        "status": "pending",
                    }
                    for todo in (
                        discovery_todo,
                        "Create the nginx directory structure",
                        "Create the nginx index.html file",
                    )
                ],
            )

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            scripted_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert discovery_todo in definition.completed_items
            for expected_fragment in (
                "Continue with the next pending item: `Create the nginx directory structure`",
                "Resume by creating `chapters/` now.",
            ):
                assert any(expected_fragment in queued for queued in persistent_steering)
            assert not any("01-introduction.html" in queued for queued in persistent_steering)
            assert ephemeral_steering == []
      
        1221
        
        1222
        
        1223
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
            temp_dir: Path,
        ) -> None:
            """Check that ``chapters/`` is requested even when the plan lists the index first.

            The implementation plan names ``index.html`` before ``chapters/`` and
            the todos are synced with ``project_root=temp_dir``. After one
            successful ``read`` batch the test asserts that:

            * at least one persistent steering message was queued;
            * one names the directory-structure todo as the next pending item;
            * one asks to create ``chapters/``;
            * none contains ``Next step: create `index.html`.``;
            * no ephemeral steering message was queued.
            """

            async def confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                """Fail the test if a confidence assessment is requested."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                """Fail the test if action verification is requested."""
                raise AssertionError("Verification should not run for this scenario")

            # Reference chapter that the fake tool call "reads".
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            # Planned outputs, with the index file deliberately listed first.
            guide_dir = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            index_file = guide_dir / "index.html"
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{index_file}`\n"
                f"- `{chapters_dir}/`\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=confidence_must_not_run,
                verify_action=verification_must_not_run,
                auto_recover=False,
            )
            # Capture every steering message the runner queues.
            persistent_steering: list[str] = []
            ephemeral_steering: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_steering.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create an equally thorough nginx guide.")
            definition.implementation_plan = str(plan_file)
            sync_todos_to_definition_of_done(
                definition,
                [
                    {
                        "content": todo,
                        "active_form": f"Working on: {todo}",
                        "status": "pending",
                    }
                    for todo in (
                        "Examine the existing Fortran guide structure to understand the cadence and format",
                        "Create the nginx directory structure",
                        "Create the nginx index.html file",
                    )
                ],
                project_root=temp_dir,
            )

            read_call = ToolCall(
                id="read-reference-index-first",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            scripted_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_steering
            for expected_fragment in (
                "Continue with the next pending item: `Create the nginx directory structure`",
                "Resume by creating `chapters/` now.",
            ):
                assert any(expected_fragment in queued for queued in persistent_steering)
            assert not any(
                "Next step: create `index.html`." in queued for queued in persistent_steering
            )
            assert ephemeral_steering == []
      
        1342
        
        1343
        
        1344
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
            temp_dir: Path,
        ) -> None:
            """Check steering when a reference read is skipped as a duplicate.

            The context already holds a tool message with the reference file's
            content, the discovery todo is synced as ``completed``, and the
            executor returns a ``DUPLICATE`` outcome. The test asserts that:

            * exactly one persistent steering message was queued;
            * it tells the model to reuse the earlier observation;
            * it names the directory-structure todo as the next pending item;
            * it does not contain ``Update ```;
            * no ephemeral steering message was queued.
            """

            async def confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                """Fail the test if a confidence assessment is requested."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                """Fail the test if action verification is requested."""
                raise AssertionError("Verification should not run for this scenario")

            # Reference file whose content is already present in the history.
            reference_html = "<h1>Fortran Beginner's Guide</h1>\n"
            reference_file = temp_dir / "fortran" / "index.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            prior_messages = [
                Message(
                    role=Role.TOOL,
                    content=f"Observation [read]: Result: {reference_html}",
                )
            ]
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=prior_messages,
                safeguards=FakeSafeguards(),
                assess_confidence=confidence_must_not_run,
                verify_action=verification_must_not_run,
                auto_recover=False,
            )
            task_prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )
            runtime_context.session.current_task = task_prompt

            # Capture every steering message the runner queues.
            persistent_steering: list[str] = []
            ephemeral_steering: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_steering.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done(task_prompt)
            sync_todos_to_definition_of_done(
                definition,
                [
                    {
                        "content": todo,
                        "active_form": f"Working on: {todo}",
                        "status": status,
                    }
                    for todo, status in (
                        (
                            "Examine the existing Fortran guide structure to understand the cadence and format",
                            "completed",
                        ),
                        ("Create the nginx directory structure", "pending"),
                        ("Create the nginx index.html file", "pending"),
                    )
                ],
            )

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            skipped_notice = (
                f"[Skipped - duplicate action: Already read {reference_file} "
                "recently without any intervening changes; reuse the earlier "
                "read result instead of rereading]"
            )
            # The same notice is used for every textual field of the outcome.
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=skipped_notice,
                    result_content=skipped_notice,
                ),
                event_content=skipped_notice,
                is_error=False,
                result_output=skipped_notice,
            )
            fake_executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_steering) == 1
            steering = persistent_steering[0]
            assert "Reuse the earlier observation instead of repeating it." in steering
            assert (
                "Continue with the next pending item: `Create the nginx directory structure`"
                in steering
            )
            assert "Update `" not in steering
            assert ephemeral_steering == []
      
        1466
        
        1467
        
        1468
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check that steering names a missing planned file over the next todo.

            The plan lists the guide directory, ``chapters/``, ``index.html`` and
            two chapter files. This test creates ``chapters/`` and the first
            chapter (also recorded in ``dod.touched_files``) but never writes
            ``index.html``. After one successful ``read`` batch it asserts that:

            * at least one persistent steering message was queued;
            * one confirms progress on the discovery todo;
            * one asks to create ``index.html``;
            * none continues with the generic "create each chapter file" todo;
            * no ephemeral steering message was queued.
            """

            async def confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                """Fail the test if a confidence assessment is requested."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                """Fail the test if action verification is requested."""
                raise AssertionError("Verification should not run for this scenario")

            # Partially built guide: chapters/ and chapter one exist, index.html does not.
            guides_dir = temp_dir / "Loader" / "guides"
            guide_dir = guides_dir / "nginx"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            first_chapter = chapters_dir / "01-introduction.html"
            first_chapter.write_text("<html></html>\n")
            index_file = guide_dir / "index.html"

            # Reference chapter that the fake tool call "reads".
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = guides_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True, exist_ok=True)
            reference_file.write_text(reference_html)

            # Directories carry a trailing slash inside the backticks.
            planned_entries = (
                f"{guide_dir}/",
                f"{chapters_dir}/",
                str(index_file),
                str(first_chapter),
                str(chapters_dir / "02-installation.html"),
            )
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{entry}`" for entry in planned_entries)
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=confidence_must_not_run,
                verify_action=verification_must_not_run,
                auto_recover=False,
            )
            # Capture every steering message the runner queues.
            persistent_steering: list[str] = []
            ephemeral_steering: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_steering.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_file)
            definition.touched_files.append(str(first_chapter))
            sync_todos_to_definition_of_done(
                definition,
                [
                    {
                        "content": todo,
                        "active_form": f"Working on: {todo}",
                        "status": "pending",
                    }
                    for todo in (
                        "Examine the existing Fortran guide structure to understand the format and cadence",
                        "Create each chapter file with appropriate content",
                        "Ensure all files follow the same structure and style as the Fortran guide",
                    )
                ],
            )

            read_call = ToolCall(
                id="read-reference-chapter",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            observation = f"Observation [read]: Result: {reference_html}"
            # The same observation is used for every textual field of the outcome.
            executed_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.EXECUTED,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=observation,
                    result_content=observation,
                ),
                event_content=observation,
                is_error=False,
                result_output=observation,
            )
            fake_executor = FakeExecutor([executed_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_steering
            for expected_fragment in (
                "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`",
                "Resume by creating `index.html` now.",
            ):
                assert any(expected_fragment in queued for queued in persistent_steering)
            assert all(
                "Continue with the next pending item: `Create each chapter file with appropriate content`"
                not in queued
                for queued in persistent_steering
            )
            assert ephemeral_steering == []
      
        1602
        
        1603
        
        1604
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
            temp_dir: Path,
        ) -> None:
            """Duplicate read with the plan already built steers to the verify todo.

            Every path listed in the implementation plan exists on disk and the
            fake executor reports the read as a duplicate.  Exactly one persistent
            steering message is expected: it must contain the pending verification
            item and must not mention the chapter-07 creation todo, which is not
            part of the plan.  No ephemeral steering message may be queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if this hook is ever awaited.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if this hook is ever awaited.
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the guide tree so each planned artifact is already present.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            for directory in (guide_root, chapters):
                directory.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            seeded_files = (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            )
            for target, markup in seeded_files:
                target.write_text(markup)

            # The plan lists only the directories and files created above.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            # Capture both steering channels so they can be inspected afterwards.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Verify all guide files are linked and complete",
                "Complete the requested work",
            ]

            duplicate_read = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{chapter_one} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skipped_outcome = ToolExecutionOutcome(
                tool_call=duplicate_read,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=duplicate_read.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )

            await runner.execute_batch(
                tool_calls=[duplicate_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=FakeExecutor([skipped_outcome]),  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            handoff = persistent_messages[0]
            assert "Verify all guide files are linked and complete" in handoff
            assert "Create 07-performance-tuning.html" not in handoff
            assert ephemeral_messages == []
      
        1718
        
        1719
        
        1720
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Duplicate read with the plan built and a verify command set hands off.

            All planned artifacts exist, the definition of done carries a
            verification command, and the only creation todo left is for a file
            that is not in the plan.  The single persistent steering message must
            state that the planned artifacts exist and direct the agent towards
            verification or final confirmation, without mentioning that todo.
            No ephemeral steering message may be queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if this hook is ever awaited.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if this hook is ever awaited.
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the guide tree so each planned artifact is already present.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            for directory in (guide_root, chapters):
                directory.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            seeded_files = (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            )
            for target, markup in seeded_files:
                target.write_text(markup)

            # The plan lists only the directories and files created above.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            # Capture both steering channels so they can be inspected afterwards.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.verification_commands = [f"ls -la {guide_root}"]
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Complete the requested work",
            ]

            duplicate_read = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{chapter_one} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skipped_outcome = ToolExecutionOutcome(
                tool_call=duplicate_read,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=duplicate_read.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )

            await runner.execute_batch(
                tool_calls=[duplicate_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=FakeExecutor([skipped_outcome]),  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            handoff = persistent_messages[0]
            assert "All explicitly planned artifacts already exist." in handoff
            assert (
                "Move to verification or final confirmation using the files already on disk."
                in handoff
            )
            assert "Create 07-performance-tuning.html" not in handoff
            assert ephemeral_messages == []
      
        1838
        
        1839
        
        1840
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
            temp_dir: Path,
        ) -> None:
            """Duplicate read with the plan built ignores todos for existing files.

            The pending items still include creation todos for the two chapter
            files that are already on disk.  The single persistent steering message
            must state that the planned artifacts exist and direct the agent
            towards verification or final confirmation, without repeating either
            creation todo.  No ephemeral steering message may be queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if this hook is ever awaited.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if this hook is ever awaited.
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the guide tree so each planned artifact is already present.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            for directory in (guide_root, chapters):
                directory.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            seeded_files = (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            )
            for target, markup in seeded_files:
                target.write_text(markup)

            # The plan lists only the directories and files created above.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            # Capture both steering channels so they can be inspected afterwards.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.verification_commands = [f"ls -la {guide_root}"]
            # Creation todos that refer to files already written above.
            dod.pending_items = [
                "Create 01-getting-started.html",
                "Creating 02-installation.html",
                "Complete the requested work",
            ]

            duplicate_read = ToolCall(
                id="read-dup-built-stale",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{chapter_one} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skipped_outcome = ToolExecutionOutcome(
                tool_call=duplicate_read,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=duplicate_read.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )

            await runner.execute_batch(
                tool_calls=[duplicate_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=FakeExecutor([skipped_outcome]),  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            handoff = persistent_messages[0]
            assert "All explicitly planned artifacts already exist." in handoff
            assert (
                "Move to verification or final confirmation using the files already on disk."
                in handoff
            )
            assert "Create 01-getting-started.html" not in handoff
            assert "Creating 02-installation.html" not in handoff
            assert ephemeral_messages == []
      
        1960
        
        1961
        
        1962
        @pytest.mark.asyncio
        async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
            temp_dir: Path,
        ) -> None:
            """A successful reference read steers the agent to the creation todo.

            Two pending todos are synced into the definition of done: an
            "examine" item and a "create index.html" item.  After the reference
            file is read without error, some persistent steering message must
            point at the creation todo and some must tell the agent to stop
            gathering reference material.  No ephemeral steering message may be
            queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if this hook is ever awaited.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if this hook is ever awaited.
                raise AssertionError("Verification should not run for this scenario")

            # The same markup is written to disk and returned as the read output.
            reference_markup = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_markup)

            # Capture both steering channels so they can be inspected afterwards.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            pending_todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            reference_read = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_outcome = tool_outcome(
                tool_call=reference_read,
                output=reference_markup,
                is_error=False,
            )

            await runner.execute_batch(
                tool_calls=[reference_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=FakeExecutor([read_outcome]),  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert any(
                "Continue with the next pending item: `Create the nginx index.html file`"
                in queued
                for queued in persistent_messages
            )
            assert any(
                "stop gathering more reference material and perform the change now" in queued
                for queued in persistent_messages
            )
            assert ephemeral_messages == []
      
        2054
        
        2055
        
        2056
        @pytest.mark.asyncio
      
        2057
        async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
      
        2058
            temp_dir: Path,
      
        2059
        ) -> None:
      
        2060
            async def assess_confidence(
      
        2061
                tool_name: str,
      
        2062
                tool_args: dict,
      
        2063
                context: str,
      
        2064
            ) -> ConfidenceAssessment:
      
        2065
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        2066
        
        2067
            async def verify_action(
      
        2068
                tool_name: str,
      
        2069
                tool_args: dict,
      
        2070
                result: str,
      
        2071
                expected: str = "",
      
        2072
            ) -> ActionVerification:
      
        2073
                raise AssertionError("Verification should not run for this scenario")
      
        2074
        
        2075
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
      
        2076
            reference.parent.mkdir(parents=True)
      
        2077
            reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")
      
        2078
        
        2079
            context = build_context(
      
        2080
                temp_dir=temp_dir,
      
        2081
                messages=[],
      
        2082
                safeguards=FakeSafeguards(),
      
        2083
                assess_confidence=assess_confidence,
      
        2084
                verify_action=verify_action,
      
        2085
                auto_recover=False,
      
        2086
            )
      
        2087
            persistent_messages: list[str] = []
      
        2088
            ephemeral_messages: list[str] = []
      
        2089
            context.queue_steering_message_callback = persistent_messages.append
      
        2090
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        2091
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        2092
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        2093
            sync_todos_to_definition_of_done(
      
        2094
                dod,
      
        2095
                [
      
        2096
                    {
      
        2097
                        "content": "First, examine the existing fortran guide structure and content",
      
        2098
                        "active_form": "Working on: First, examine the existing fortran guide structure and content",
      
        2099
                        "status": "pending",
      
        2100
                    },
      
        2101
                    {
      
        2102
                        "content": "Create the nginx directory structure",
      
        2103
                        "active_form": "Working on: Create the nginx directory structure",
      
        2104
                        "status": "pending",
      
        2105
                    },
      
        2106
                ],
      
        2107
            )
      
        2108
            tool_call = ToolCall(
      
        2109
                id="read-reference",
      
        2110
                name="read",
      
        2111
                arguments={"file_path": str(reference)},
      
        2112
            )
      
        2113
            executor = FakeExecutor(
      
        2114
                [
      
        2115
                    tool_outcome(
      
        2116
                        tool_call=tool_call,
      
        2117
                        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
      
        2118
                        is_error=False,
      
        2119
                    )
      
        2120
                ]
      
        2121
            )
      
        2122
        
        2123
            summary = TurnSummary(final_response="")
      
        2124
            await runner.execute_batch(
      
        2125
                tool_calls=[tool_call],
      
        2126
                tool_source="assistant",
      
        2127
                pending_tool_calls_seen=set(),
      
        2128
                emit=_noop_emit,
      
        2129
                summary=summary,
      
        2130
                dod=dod,
      
        2131
                executor=executor,  # type: ignore[arg-type]
      
        2132
                on_confirmation=None,
      
        2133
                on_user_question=None,
      
        2134
                emit_confirmation=None,
      
        2135
                consecutive_errors=0,
      
        2136
            )
      
        2137
        
        2138
            assert persistent_messages
      
        2139
            assert any(
      
        2140
                "Continue with the next pending item: `Create the nginx directory structure`"
      
        2141
                in message
      
        2142
                for message in persistent_messages
      
        2143
            )
      
        2144
            assert ephemeral_messages == []
      
        2145
        
        2146
        
        2147
        @pytest.mark.asyncio
        async def test_tool_batch_runner_missing_artifact_nudge_stays_quiet_after_setup_mkdir(
            temp_dir: Path,
        ) -> None:
            """A successful setup ``mkdir -p`` batch must not queue any steering message.

            The implementation plan lists the chapters directory and ``index.html``;
            the batch consists of a single ``bash`` call creating the chapters
            directory. Neither the persistent nor the ephemeral queue may receive
            anything afterwards.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"

            # Plan document naming the artifacts the runner could nudge about.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )

            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todos = [
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Creating the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Develop the main index.html file with proper structure",
                    "active_form": "Developing the main index.html file with proper structure",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            tool_call = ToolCall(
                id="mkdir-nginx",
                name="bash",
                arguments={"command": f"mkdir -p {chapters}"},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Neither channel may have been used.
            assert persistent_messages == []
            assert ephemeral_messages == []
      
        2245
        
        2246
        
        2247
        @pytest.mark.asyncio
        async def test_tool_batch_runner_first_file_handoff_stays_persistent(
            temp_dir: Path,
        ) -> None:
            """Writing ``index.html`` queues a persistent handoff to the first chapter.

            With no prior conversation messages, the handoff naming
            ``01-introduction.html`` must arrive on the persistent steering queue
            and the ephemeral queue must stay empty.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = nginx_root / "index.html"
            first_chapter = chapters / "01-introduction.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{first_chapter}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )

            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todos = [
                {
                    "content": "Create the main index.html file with proper structure",
                    "active_form": "Creating the main index.html file with proper structure",
                    "status": "pending",
                },
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Creating each chapter file with appropriate content",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            tool_call = ToolCall(
                id="write-index",
                name="write",
                arguments={
                    "file_path": str(index_path),
                    "content": "<html></html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages
            message = persistent_messages[-1]
            assert "Confirmed progress:" in message
            assert "Next step: create `01-introduction.html`." in message

            # The handoff is expected to quote the resolved path of the next file.
            resolved_target = first_chapter.resolve(strict=False)
            expected_write_hint = (
                f"Prefer one `write(file_path=..., content=...)` call for `{resolved_target}` now."
            )
            assert expected_write_hint in message
            assert "Do not reread reference material or spend the next turn on bookkeeping." in message
            assert ephemeral_messages == []
      
        2360
        
        2361
        
        2362
        @pytest.mark.asyncio
        async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt(
            temp_dir: Path,
        ) -> None:
            """After an empty-response recovery prompt the handoff goes out as ephemeral.

            The conversation already holds a ``[EMPTY ASSISTANT RESPONSE]`` user
            message. Writing ``index.html`` must then leave the persistent queue
            empty and put a "Resume by creating ..." message on the ephemeral
            queue.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = nginx_root / "index.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapters / '01-introduction.html'}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            # Prior recovery prompt that the runner sees as conversation history.
            recovery_prompt = Message(
                role=Role.USER,
                content=(
                    "[EMPTY ASSISTANT RESPONSE]\n"
                    "Respond with that concrete mutation tool call now. Do not return an empty response."
                ),
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[recovery_prompt],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )

            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todos = [
                {
                    "content": "Create the main index.html file with proper structure",
                    "active_form": "Creating the main index.html file with proper structure",
                    "status": "pending",
                },
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Creating each chapter file with appropriate content",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            tool_call = ToolCall(
                id="write-index-recovered",
                name="write",
                arguments={
                    "file_path": str(index_path),
                    "content": "<html></html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Only the ephemeral channel may carry the handoff.
            assert persistent_messages == []
            assert ephemeral_messages
            message = ephemeral_messages[-1]
            assert "Resume by creating `01-introduction.html` now." in message
      
        2477
        
        2478
        
        2479
        @pytest.mark.asyncio
        async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
            temp_dir: Path,
        ) -> None:
            """A duplicate read nudges toward the missing planned chapter, not review.

            ``index.html`` and the first chapter exist on disk while the planned
            ``06-ssl-configuration.html`` does not. Even though the first pending
            todo is a consistency review, the duplicate-observation nudge must
            name the missing chapter and must not point at the review item.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            final_chapter = chapters / "06-ssl-configuration.html"  # planned, never written
            chapter_one.write_text("<h1>One</h1>\n")
            index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{final_chapter}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )

            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todos = [
                {
                    "content": "Ensure all files are properly linked and formatted consistently",
                    "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                    "status": "pending",
                },
                {
                    "content": "Create the final chapter (06-ssl-configuration.html)",
                    "active_form": "Working on: Create the final chapter (06-ssl-configuration.html)",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            # The helper itself must already favour the missing artifact.
            assert tool_batches_should_prioritize_missing_artifact(
                dod=dod,
                next_pending=dod.pending_items[0],
                missing_artifact=(final_chapter, False),
                project_root=temp_dir,
            )

            tool_call = ToolCall(
                id="dup-read",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            runner._queue_duplicate_observation_nudge(tool_call, dod=dod)  # type: ignore[attr-defined]

            assert persistent_messages
            message = persistent_messages[-1]
            assert "06-ssl-configuration.html" in message
            assert "Do not switch into review or consistency-check mode" in message
            review_handoff = (
                "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
            )
            assert review_handoff not in message
      
        2573
        
        2574
        
        2575
        @pytest.mark.asyncio
      
        2576
        async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
      
        2577
            temp_dir: Path,
      
        2578
        ) -> None:
      
        2579
            async def assess_confidence(
      
        2580
                tool_name: str,
      
        2581
                tool_args: dict,
      
        2582
                context: str,
      
        2583
            ) -> ConfidenceAssessment:
      
        2584
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        2585
        
        2586
            async def verify_action(
      
        2587
                tool_name: str,
      
        2588
                tool_args: dict,
      
        2589
                result: str,
      
        2590
                expected: str = "",
      
        2591
            ) -> ActionVerification:
      
        2592
                raise AssertionError("Verification should not run for this scenario")
      
        2593
        
        2594
            guide_root = temp_dir / "guides" / "nginx"
      
        2595
            chapters = guide_root / "chapters"
      
        2596
            chapters.mkdir(parents=True)
      
        2597
            index_path = guide_root / "index.html"
      
        2598
            chapter_one = chapters / "01-getting-started.html"
      
        2599
            chapter_two = chapters / "02-installation.html"
      
        2600
            index_path.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")
      
        2601
            chapter_one.write_text("<h1>One</h1>\n")
      
        2602
            chapter_two.write_text("<h1>Two</h1>\n")
      
        2603
        
        2604
            implementation_plan = temp_dir / "implementation.md"
      
        2605
            implementation_plan.write_text(
      
        2606
                "\n".join(
      
        2607
                    [
      
        2608
                        "# Implementation Plan",
      
        2609
                        "",
      
        2610
                        "## File Changes",
      
        2611
                        f"- `{chapters}/`",
      
        2612
                        f"- `{index_path}`",
      
        2613
                        f"- `{chapter_one}`",
      
        2614
                        f"- `{chapter_two}`",
      
        2615
                        "",
      
        2616
                    ]
      
        2617
                )
      
        2618
            )
      
        2619
        
        2620
            context = build_context(
      
        2621
                temp_dir=temp_dir,
      
        2622
                messages=[],
      
        2623
                safeguards=FakeSafeguards(),
      
        2624
                assess_confidence=assess_confidence,
      
        2625
                verify_action=verify_action,
      
        2626
                auto_recover=False,
      
        2627
            )
      
        2628
            persistent_messages: list[str] = []
      
        2629
            ephemeral_messages: list[str] = []
      
        2630
            context.queue_steering_message_callback = persistent_messages.append
      
        2631
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        2632
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        2633
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        2634
            dod.implementation_plan = str(implementation_plan)
      
        2635
            sync_todos_to_definition_of_done(
      
        2636
                dod,
      
        2637
                [
      
        2638
                    {
      
        2639
                        "content": "Create the guide files",
      
        2640
                        "active_form": "Working on: Create the guide files",
      
        2641
                        "status": "completed",
      
        2642
                    },
      
        2643
                    {
      
        2644
                        "content": "Ensure all files are properly linked and formatted consistently",
      
        2645
                        "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
      
        2646
                        "status": "pending",
      
        2647
                    },
      
        2648
                ],
      
        2649
            )
      
        2650
            tool_call = ToolCall(
      
        2651
                id="write-final",
      
        2652
                name="write",
      
        2653
                arguments={
      
        2654
                    "file_path": str(chapter_two),
      
        2655
                    "content": "<h1>Two</h1>\n",
      
        2656
                },
      
        2657
            )
      
        2658
            executor = FakeExecutor(
      
        2659
                [
      
        2660
                    tool_outcome(
      
        2661
                        tool_call=tool_call,
      
        2662
                        output=f"Successfully wrote {chapter_two}",
      
        2663
                        is_error=False,
      
        2664
                    )
      
        2665
                ]
      
        2666
            )
      
        2667
        
        2668
            summary = TurnSummary(final_response="")
      
        2669
            await runner.execute_batch(
      
        2670
                tool_calls=[tool_call],
      
        2671
                tool_source="assistant",
      
        2672
                pending_tool_calls_seen=set(),
      
        2673
                emit=_noop_emit,
      
        2674
                summary=summary,
      
        2675
                dod=dod,
      
        2676
                executor=executor,  # type: ignore[arg-type]
      
        2677
                on_confirmation=None,
      
        2678
                on_user_question=None,
      
        2679
                emit_confirmation=None,
      
        2680
                consecutive_errors=0,
      
        2681
            )
      
        2682
        
        2683
            assert any(
      
        2684
                "All explicitly planned artifacts now exist." in message
      
        2685
                for message in persistent_messages
      
        2686
            )
      
        2687
            assert any(
      
        2688
                "Ensure all files are properly linked and formatted consistently" in message
      
        2689
                for message in persistent_messages
      
        2690
            )
      
        2691
            assert any(
      
        2692
                "Move to verification once no specific mismatch remains." in message
      
        2693
                for message in persistent_messages
      
        2694
            )
      
        2695
        
        2696
        
        2697
        @pytest.mark.asyncio
        async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """A successful write hands off to the next planned file that is still missing.

            Scenario: the plan lists ``index.html`` plus two chapter files, only
            ``index.html`` exists on disk, and the batch consists of a single
            successful ``write`` of ``index.html``. The last persistent steering
            message must name the first chapter as the next step, recommend one
            ``write`` call for its resolved path, and must not mention refreshing
            ``TodoWrite``.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call here means the runner took a path this scenario forbids.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call here means the runner took a path this scenario forbids.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: directories and index.html exist, chapters do not.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            index_path.write_text("<html></html>\n")

            # Implementation plan listing every artifact under "File Changes".
            planned_entries = [f"{guide_root}/", index_path, chapter_one, chapter_two]
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{entry}`" for entry in planned_entries]
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)

            # All three todos are still pending when the write happens.
            pending_steps = (
                "Create the main index.html file with proper structure",
                "Create each chapter file in sequence, following the established pattern",
                "Ensure all files are properly linked and formatted consistently",
            )
            todos = [
                {
                    "content": step,
                    "active_form": f"Working on: {step}",
                    "status": "pending",
                }
                for step in pending_steps
            ]
            sync_todos_to_definition_of_done(dod, todos)

            write_index = ToolCall(
                id="write-index",
                name="write",
                arguments={"file_path": str(index_path), "content": "<html></html>\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=write_index,
                output=f"Successfully wrote {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_index],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages
            handoff = persistent_messages[-1]
            expected_write_hint = (
                f"Prefer one `write(file_path=..., content=...)` call for `{chapter_one.resolve(strict=False)}` now."
            )
            assert "Next step: create `01-getting-started.html`." in handoff
            assert expected_write_hint in handoff
            assert "refresh `TodoWrite`" not in handoff
            assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
      
        2808
        
        2809
        
        2810
        @pytest.mark.asyncio
        async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
            temp_dir: Path,
        ) -> None:
            """A partially built seven-chapter plan must not be reported as complete.

            Scenario: the plan lists ``index.html`` and seven chapter files, of which
            ``index.html`` and chapters 01-05 exist on disk. The batch is a single
            successful ``write`` of chapter 05. Some ephemeral steering message must
            ask to resume with chapter 06, and none may state that all planned
            artifacts exist.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call here means the runner took a path this scenario forbids.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call here means the runner took a path this scenario forbids.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")

            chapter_names = (
                "01-getting-started.html",
                "02-installation.html",
                "03-first-website.html",
                "04-configuration-basics.html",
                "05-advanced-configurations.html",
                "06-performance-tuning.html",
                "07-security-best-practices.html",
            )
            chapter_paths = [chapters / name for name in chapter_names]
            # Chapters 01-04 get a stem-based heading; chapter 05 matches the
            # content of the write under test. Chapters 06 and 07 stay missing.
            for existing_chapter in chapter_paths[:4]:
                existing_chapter.write_text(f"<h1>{existing_chapter.stem}</h1>\n")
            fifth_chapter = chapter_paths[4]
            fifth_chapter.write_text("<h1>Advanced configurations</h1>\n")

            # Implementation plan listing every artifact under "File Changes".
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
            ]
            plan_lines.extend(f"- `{path}`" for path in chapter_paths)
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todos = [
                {
                    "content": "Create the nginx guide artifacts",
                    "active_form": "Creating nginx guide artifacts",
                    "status": "pending",
                },
                {
                    "content": "Verify all guide files are linked and complete",
                    "active_form": "Verifying guide linkage and completeness",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            write_fifth = ToolCall(
                id="write-chapter-05",
                name="write",
                arguments={
                    "file_path": str(fifth_chapter),
                    "content": "<h1>Advanced configurations</h1>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=write_fifth,
                output=f"Successfully wrote {fifth_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_fifth],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            resume_hint = "Resume by creating `06-performance-tuning.html` now."
            completion_claim = "All explicitly planned artifacts now exist."
            assert any(resume_hint in queued for queued in ephemeral_messages)
            assert all(completion_claim not in queued for queued in ephemeral_messages)
      
        2936
        
        2937
        
        2938
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
            temp_dir: Path,
        ) -> None:
            """With most planned files done, the missing-artifact nudge is the compact form.

            Scenario: the plan lists ``index.html`` and five chapter files.
            ``index.html`` and chapters 01-04 exist on disk and are recorded in
            ``dod.touched_files``; chapter 05 is missing. The batch is a single
            successful ``write`` that rewrites chapter 04. The last ephemeral
            steering message must ask to resume with chapter 05, forbid TodoWrite,
            verification and rereads until it exists, and must not mention
            refreshing ``TodoWrite``.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call here means the runner took a path this scenario forbids.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call here means the runner took a path this scenario forbids.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_names = (
                "01-introduction.html",
                "02-installation.html",
                "03-configuration.html",
                "04-basic-usage.html",
                "05-advanced-features.html",
            )
            chapter_paths = [chapters / name for name in chapter_names]

            # Everything except the fifth chapter is already on disk.
            already_written = [index_path, *chapter_paths[:4]]
            for existing in already_written:
                existing.write_text("<html></html>\n")

            # Implementation plan listing every artifact under "File Changes".
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
            ]
            plan_lines.extend(f"- `{path}`" for path in chapter_paths)
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            # Record the prior progress on the definition of done before syncing todos.
            dod.touched_files.extend([str(existing) for existing in already_written])
            dod.completed_items.extend(
                [
                    "Create the nginx directory structure",
                    "Create the main index.html file with proper structure",
                ]
            )
            todos = [
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Creating each chapter file with appropriate content",
                    "status": "pending",
                }
            ]
            sync_todos_to_definition_of_done(dod, todos)

            fourth_chapter = chapter_paths[3]
            rewrite_fourth = ToolCall(
                id="write-chapter-04",
                name="write",
                arguments={
                    "file_path": str(fourth_chapter),
                    "content": "<html>updated</html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=rewrite_fourth,
                output=f"Successfully wrote {fourth_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[rewrite_fourth],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert ephemeral_messages
            nudge = ephemeral_messages[-1]
            assert "Resume by creating `05-advanced-features.html` now." in nudge
            assert "No TodoWrite, no verification, no rereads until that artifact exists." in nudge
            assert "refresh `TodoWrite`" not in nudge
      
        3058
        
        3059
        
        3060
        @pytest.mark.asyncio
      
        3061
        async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
      
        3062
            temp_dir: Path,
      
        3063
        ) -> None:
      
        3064
            async def assess_confidence(
      
        3065
                tool_name: str,
      
        3066
                tool_args: dict,
      
        3067
                context: str,
      
        3068
            ) -> ConfidenceAssessment:
      
        3069
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        3070
        
        3071
            async def verify_action(
      
        3072
                tool_name: str,
      
        3073
                tool_args: dict,
      
        3074
                result: str,
      
        3075
                expected: str = "",
      
        3076
            ) -> ActionVerification:
      
        3077
                raise AssertionError("Verification should not run in this scenario")
      
        3078
        
        3079
            guide_root = temp_dir / "guides" / "nginx"
      
        3080
            chapters = guide_root / "chapters"
      
        3081
            guide_root.mkdir(parents=True)
      
        3082
            chapters.mkdir()
      
        3083
            index_path = guide_root / "index.html"
      
        3084
            index_path.write_text("<html></html>\n")
      
        3085
            chapter_one = chapters / "01-getting-started.html"
      
        3086
            chapter_two = chapters / "02-installation.html"
      
        3087
            chapter_one.write_text("<h1>One</h1>\n")
      
        3088
        
        3089
            implementation_plan = temp_dir / "implementation.md"
      
        3090
            implementation_plan.write_text(
      
        3091
                "\n".join(
      
        3092
                    [
      
        3093
                        "# Implementation Plan",
      
        3094
                        "",
      
        3095
                        "## File Changes",
      
        3096
                        f"- `{guide_root}/`",
      
        3097
                        f"- `{chapters}/`",
      
        3098
                        f"- `{index_path}`",
      
        3099
                        f"- `{chapter_one}`",
      
        3100
                        f"- `{chapter_two}`",
      
        3101
                        "",
      
        3102
                    ]
      
        3103
                )
      
        3104
            )
      
        3105
        
        3106
            context = build_context(
      
        3107
                temp_dir=temp_dir,
      
        3108
                messages=[],
      
        3109
                safeguards=FakeSafeguards(),
      
        3110
                assess_confidence=assess_confidence,
      
        3111
                verify_action=verify_action,
      
        3112
                auto_recover=False,
      
        3113
            )
      
        3114
            persistent_messages: list[str] = []
      
        3115
            ephemeral_messages: list[str] = []
      
        3116
            context.queue_steering_message_callback = persistent_messages.append
      
        3117
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        3118
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        3119
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        3120
            dod.implementation_plan = str(implementation_plan)
      
        3121
            sync_todos_to_definition_of_done(
      
        3122
                dod,
      
        3123
                [
      
        3124
                    {
      
        3125
                        "content": "Create 01-getting-started.html",
      
        3126
                        "active_form": "Creating 01-getting-started.html",
      
        3127
                        "status": "completed",
      
        3128
                    },
      
        3129
                    {
      
        3130
                        "content": "Create 02-installation.html",
      
        3131
                        "active_form": "Creating 02-installation.html",
      
        3132
                        "status": "pending",
      
        3133
                    },
      
        3134
                ],
      
        3135
            )
      
        3136
            dod.touched_files.extend([str(index_path), str(chapter_one)])
      
        3137
        
        3138
            tool_call = ToolCall(
      
        3139
                id="todo-only",
      
        3140
                name="TodoWrite",
      
        3141
                arguments={
      
        3142
                    "todos": [
      
        3143
                        {
      
        3144
                            "content": "Create 01-getting-started.html",
      
        3145
                            "active_form": "Creating 01-getting-started.html",
      
        3146
                            "status": "completed",
      
        3147
                        },
      
        3148
                        {
      
        3149
                            "content": "Create 02-installation.html",
      
        3150
                            "active_form": "Creating 02-installation.html",
      
        3151
                            "status": "pending",
      
        3152
                        },
      
        3153
                    ]
      
        3154
                },
      
        3155
            )
      
        3156
            executor = FakeExecutor(
      
        3157
                [
      
        3158
                    tool_outcome(
      
        3159
                        tool_call=tool_call,
      
        3160
                        output="Todos updated",
      
        3161
                        is_error=False,
      
        3162
                        metadata={
      
        3163
                            "new_todos": [
      
        3164
                                {
      
        3165
                                    "content": "Create 01-getting-started.html",
      
        3166
                                    "active_form": "Creating 01-getting-started.html",
      
        3167
                                    "status": "completed",
      
        3168
                                },
      
        3169
                                {
      
        3170
                                    "content": "Create 02-installation.html",
      
        3171
                                    "active_form": "Creating 02-installation.html",
      
        3172
                                    "status": "pending",
      
        3173
                                },
      
        3174
                            ]
      
        3175
                        },
      
        3176
                    )
      
        3177
                ]
      
        3178
            )
      
        3179
        
        3180
            summary = TurnSummary(final_response="")
      
        3181
            await runner.execute_batch(
      
        3182
                tool_calls=[tool_call],
      
        3183
                tool_source="assistant",
      
        3184
                pending_tool_calls_seen=set(),
      
        3185
                emit=_noop_emit,
      
        3186
                summary=summary,
      
        3187
                dod=dod,
      
        3188
                executor=executor,  # type: ignore[arg-type]
      
        3189
                on_confirmation=None,
      
        3190
                on_user_question=None,
      
        3191
                emit_confirmation=None,
      
        3192
                consecutive_errors=0,
      
        3193
            )
      
        3194
        
        3195
            assert persistent_messages
      
        3196
            message = persistent_messages[-1]
      
        3197
            assert "Todo tracking is updated. A declared output artifact is still missing." in message
      
        3198
            assert "Resume by creating `02-installation.html` now." in message
      
        3199
            assert "refresh `TodoWrite`" in message
      
        3200
            assert "Do not spend the next turn on TodoWrite alone" in message
      
        3201
            assert ephemeral_messages == []
      
        3202
        
        3203
        
        3204
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Run a TodoWrite-only batch when every file named in the plan already exists.

            The index page and both chapter files are written to disk before the batch
            runs. The last queued steering message must announce that all planned
            artifacts exist, name the verification todo, and must not mention the
            "Fortran guide structure" todo.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner asks for a confidence assessment.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # --- On-disk guide layout: every declared artifact is present. ---
            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            index_page = guide_root / "index.html"
            first_chapter = chapters_dir / "01-getting-started.html"
            second_chapter = chapters_dir / "02-installation.html"

            guide_root.mkdir(parents=True)
            chapters_dir.mkdir()
            for artifact, body in (
                (index_page, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(body)

            # --- Implementation plan listing the directories and files above. ---
            plan_path = temp_dir / "implementation.md"
            declared_entries = [
                f"- `{guide_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_page}`",
                f"- `{first_chapter}`",
                f"- `{second_chapter}`",
            ]
            plan_path.write_text(
                "\n".join(
                    ["# Implementation Plan", "", "## File Changes", *declared_entries, ""]
                )
            )

            todo_contents = (
                "First, examine the existing Fortran guide structure to understand the format and content organization",
                "Verify all guide files are linked and complete",
            )

            def pending_todos() -> list[dict[str, str]]:
                # Build a brand-new list on each call so the definition of done, the
                # tool-call arguments and the outcome metadata never share objects.
                return [
                    {
                        "content": text,
                        "active_form": f"Working on: {text}",
                        "status": "pending",
                    }
                    for text in todo_contents
                ]

            # --- Runtime wiring. ---
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            steering: list[str] = []
            runtime_context.queue_steering_message_callback = steering.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(dod, pending_todos(), project_root=temp_dir)

            # --- The batch under test: a single TodoWrite call. ---
            todo_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": pending_todos()},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": pending_todos()},
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            handoff = steering[-1]
            for expected_fragment in (
                "Todo tracking is updated. All explicitly planned artifacts now exist.",
                "Verify all guide files are linked and complete",
                "Move to verification once no specific mismatch remains.",
                "reopen reference materials",
            ):
                assert expected_fragment in handoff
            assert "Fortran guide structure" not in handoff
      
        3346
        
        3347
        
        3348
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
            temp_dir: Path,
        ) -> None:
            """Run a TodoWrite-only batch while a chapter linked from the index is absent.

            The guide root, the chapters directory and ``index.html`` exist, and the
            index links to ``chapters/01-introduction.html``, which is never written.
            The last queued steering message must point at the pending todo and at
            creating ``01-introduction.html``.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner asks for a confidence assessment.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # --- On-disk guide layout: roots and index only, no chapter file. ---
            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            index_page = guide_root / "index.html"

            guide_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_markup = [
                "<!DOCTYPE html>",
                "<html>",
                "<body>",
                '<a href="chapters/01-introduction.html">Introduction</a>',
                "</body>",
                "</html>",
            ]
            index_page.write_text("\n".join(index_markup) + "\n")

            # --- Implementation plan naming the two directories and the index. ---
            plan_path = temp_dir / "implementation.md"
            declared_entries = [
                f"- `{guide_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_page}`",
            ]
            plan_path.write_text(
                "\n".join(
                    ["# Implementation Plan", "", "## File Changes", *declared_entries, ""]
                )
            )

            todo_rows = (
                (
                    "Examine the existing Fortran guide structure",
                    "Examining the existing Fortran guide structure",
                    "completed",
                ),
                (
                    "Create the nginx directory structure",
                    "Creating the nginx directory structure",
                    "completed",
                ),
                (
                    "Write the introduction chapter",
                    "Writing the introduction chapter",
                    "pending",
                ),
            )

            def current_todos() -> list[dict[str, str]]:
                # Build a brand-new list on each call so the definition of done, the
                # tool-call arguments and the outcome metadata never share objects.
                return [
                    {"content": content, "active_form": active_form, "status": status}
                    for content, active_form, status in todo_rows
                ]

            # --- Runtime wiring. ---
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            steering: list[str] = []
            runtime_context.queue_steering_message_callback = steering.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.touched_files.append(str(index_page))
            sync_todos_to_definition_of_done(dod, current_todos(), project_root=temp_dir)

            # --- The batch under test: a single TodoWrite call. ---
            todo_call = ToolCall(
                id="todo-next-mutation",
                name="TodoWrite",
                arguments={"todos": current_todos()},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": current_todos()},
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            nudge = steering[-1]
            for expected_fragment in (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Continue with the next pending item: `Write the introduction chapter`.",
                "Resume by creating `01-introduction.html` now.",
                "It is the next missing declared output under `chapters/`.",
                "Prefer one `write` call for `",
                "01-introduction.html` instead of more rereads.",
                "Do not spend the next turn on TodoWrite alone",
            ):
                assert expected_fragment in nudge
      
        3513
        
        3514
        
        3515
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
            temp_dir: Path,
        ) -> None:
            """Run a TodoWrite-only batch when the index links to chapters that do not exist.

            ``index.html`` links to ``chapters/introduction.html`` and
            ``chapters/installation.html``; neither file is written. The last queued
            steering message must name ``introduction.html`` as the file to create.

            The tool-call arguments spell the todo key ``activeForm`` while the
            executor metadata spells it ``active_form``.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner asks for a confidence assessment.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # --- On-disk guide layout: roots and index only, no chapter files. ---
            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            index_page = guide_root / "index.html"

            guide_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_markup = (
                "<html>",
                '<a href="chapters/introduction.html">Introduction</a>',
                '<a href="chapters/installation.html">Installation</a>',
                "</html>",
            )
            index_page.write_text("".join(f"{line}\n" for line in index_markup))

            # --- Implementation plan naming the two directories and the index. ---
            plan_path = temp_dir / "implementation.md"
            declared_entries = [
                f"- `{guide_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_page}`",
            ]
            plan_path.write_text(
                "\n".join(
                    ["# Implementation Plan", "", "## File Changes", *declared_entries, ""]
                )
            )

            # --- Definition of done is populated directly, without a todo sync. ---
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.pending_items = [
                "Write the introduction chapter",
                "Complete the requested work",
            ]
            dod.touched_files.append(str(index_page))

            # --- Runtime wiring. ---
            steering: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            def intro_todo(active_form_key: str) -> dict[str, str]:
                # Same single pending todo, with the caller choosing how the
                # active-form key is spelled.
                return {
                    "content": "Write the introduction chapter",
                    active_form_key: "Writing the introduction chapter",
                    "status": "pending",
                }

            # --- The batch under test: a single TodoWrite call. ---
            todo_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": [intro_todo("activeForm")]},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [intro_todo("active_form")]},
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            nudge = steering[-1]
            for expected_fragment in (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Continue with the next pending item: `Write the introduction chapter`.",
                "Resume by creating `introduction.html` now.",
                "It is the next missing declared output under `chapters/`.",
                "Prefer one `write` call for `",
                "introduction.html` instead of more rereads.",
                "Do not spend the next turn on TodoWrite alone",
            ):
                assert expected_fragment in nudge
      
        3642
        
        3643
        
        3644
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """TodoWrite follow-up names the concrete chapter file that is still missing.

            The index page links to two chapter files but only the first one is
            written to disk, so the steering message queued after the todo update
            is expected to point at ``02-installation.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: reaching confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: reaching action verification fails the test outright.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk fixture: index and chapter one exist, chapter two does not.
            nginx_root = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = nginx_root / "index.html"
            first_chapter = chapters_dir / "01-introduction.html"
            index_markup = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_markup) + "\n")
            first_chapter.write_text("<html></html>\n")

            # The plan declares the guide root, the chapters directory and the index.
            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            todo_text = "Creating Chapter 2: Installation and Setup"
            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_file)
            definition.pending_items = [todo_text, "Complete the requested work"]
            definition.touched_files.extend([str(index_file), str(first_chapter)])

            steering_log: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            todo_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={
                    "todos": [
                        {
                            "content": todo_text,
                            "activeForm": todo_text,
                            "status": "pending",
                        }
                    ]
                },
            )
            canned_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": todo_text,
                            "active_form": todo_text,
                            "status": "pending",
                        }
                    ]
                },
            )
            executor = FakeExecutor([canned_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            missing_chapter = (chapters_dir / "02-installation.html").resolve(strict=False)
            expected_fragments = (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Continue with the next pending item: `Creating Chapter 2: Installation and Setup`.",
                "Resume by creating `02-installation.html` now.",
                f"Prefer one `write` call for `{missing_chapter}` instead of more rereads.",
                "Make your next response the concrete mutation tool call itself",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        3775
        
        3776
        
        3777
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
            temp_dir: Path,
        ) -> None:
            """TodoWrite follow-up borrows a filename from a previously read sibling guide.

            The message history contains a ``read`` of a chapter file in another
            guide's ``chapters/`` directory; the queued steering message is
            expected to suggest the same filename for the empty nginx chapters
            directory.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: reaching confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: reaching action verification fails the test outright.
                raise AssertionError("Verification should not run in this scenario")

            # Reference guide whose chapter file appears in the message history.
            fortran_chapters = temp_dir / "fortran" / "chapters"
            fortran_chapters.mkdir(parents=True)
            inspected_file = fortran_chapters / "01-introduction.html"
            inspected_file.write_text("<h1>Introduction</h1>\n")

            # Target guide: index exists, chapters directory is empty.
            nginx_root = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = nginx_root / "index.html"
            index_file.write_text("<html></html>\n")

            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            todo_content = "Write the introduction chapter"
            todo_active = "Writing the introduction chapter"
            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_file)
            definition.pending_items = [todo_content, "Complete the requested work"]
            definition.touched_files.append(str(index_file))

            steering_log: list[str] = []
            earlier_read = Message(
                role=Role.ASSISTANT,
                content="",
                tool_calls=[
                    ToolCall(
                        id="read-ref-1",
                        name="read",
                        arguments={"file_path": str(inspected_file)},
                    )
                ],
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[earlier_read],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            todo_call = ToolCall(
                id="todo-observed-1",
                name="TodoWrite",
                arguments={
                    "todos": [
                        {
                            "content": todo_content,
                            "activeForm": todo_active,
                            "status": "pending",
                        }
                    ]
                },
            )
            canned_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": todo_content,
                            "active_form": todo_active,
                            "status": "pending",
                        }
                    ]
                },
            )
            executor = FakeExecutor([canned_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            expected_fragments = (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Continue with the next pending item: `Write the introduction chapter`.",
                "Resume by creating `01-introduction.html` now.",
                "It mirrors the observed filename pattern from another `chapters/` directory "
                "you already inspected.",
                "01-introduction.html` instead of more rereads.",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        3912
        
        3913
        
        3914
        @pytest.mark.asyncio
        async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
            temp_dir: Path,
        ) -> None:
            """A working note while a planned file is absent re-queues the resume step.

            The implementation plan lists two chapter files but only the first is
            written to disk; after a ``notepad_write_working`` call the queued
            steering message is expected to push toward creating
            ``02-installation.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: reaching confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: reaching action verification fails the test outright.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk fixture: the second chapter path is planned but never written.
            nginx_root = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = nginx_root / "index.html"
            written_chapter = chapters_dir / "01-getting-started.html"
            absent_chapter = chapters_dir / "02-installation.html"
            index_file.write_text("<html></html>\n")
            written_chapter.write_text("<h1>One</h1>\n")

            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{written_chapter}`",
                f"- `{absent_chapter}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_file)
            todo_state = [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "completed",
                },
                {
                    "content": "Create 02-installation.html",
                    "active_form": "Creating 02-installation.html",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(definition, todo_state, project_root=temp_dir)
            definition.touched_files.extend([str(index_file), str(written_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Creating the second chapter file: Installation"},
            )
            canned_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            expected_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
                "Make your next response the concrete mutation tool call itself",
                "refresh `TodoWrite`",
                "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        4028
        
        4029
        
        4030
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
            temp_dir: Path,
        ) -> None:
            """A working note keeps steering toward a leading discovery-style pending item.

            Nothing from the plan exists on disk and the first pending item is an
            "examine the existing guide" step, so the queued message is expected to
            ask for evidence gathering instead of telling the agent to create
            ``index.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: reaching confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: reaching action verification fails the test outright.
                raise AssertionError("Verification should not run in this scenario")

            # Only the plan file is written; the planned guide paths are not created.
            nginx_root = temp_dir / "guides" / "nginx"
            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{nginx_root / 'chapters'}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_file)
            outstanding_steps = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create the nginx directory structure",
                "Develop the main index.html file for the nginx guide",
            ]
            definition.pending_items.extend(outstanding_steps)

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
            )
            canned_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            expected_fragments = (
                "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`.",
                "one concrete evidence-gathering tool call",
            )
            for fragment in expected_fragments:
                assert fragment in latest
            # The creation nudge must be absent while discovery is still pending.
            assert "Resume by creating `index.html` now." not in latest
      
        4122
        
        4123
        
        4124
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
            temp_dir: Path,
        ) -> None:
            """A missing linked chapter outranks a leftover discovery pending item.

            The index and first chapter are already written and recorded as
            touched, while the index links to chapters that do not exist. The
            queued message is expected to resume at ``02-installation.html`` and
            not to repeat the "examine the existing guide" pending item.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: reaching confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: reaching action verification fails the test outright.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk fixture: index links three chapters, only the first exists.
            nginx_root = temp_dir / "guides" / "nginx"
            chapter_folder = nginx_root / "chapters"
            chapter_folder.mkdir(parents=True)
            index_file = nginx_root / "index.html"
            written_chapter = chapter_folder / "01-introduction.html"
            index_links = [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-configuration.html">Configuration</a>',
            ]
            index_file.write_text("\n".join(index_links))
            written_chapter.write_text("<h1>Introduction</h1>\n")

            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{chapter_folder}/`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_file)
            outstanding_steps = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create chapter files following the established pattern",
            ]
            definition.pending_items.extend(outstanding_steps)
            definition.touched_files.extend([str(index_file), str(written_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Created index and first chapter; next is chapter 2"},
            )
            canned_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            expected_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
            )
            for fragment in expected_fragments:
                assert fragment in latest
            # The stale discovery item must not be offered as the next step.
            assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in latest
      
        4229
        
        4230
        
        4231
        @pytest.mark.asyncio
        async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
            temp_dir: Path,
        ) -> None:
            """A glob that only lists the reference guide's directories queues no steering message.

            The fake glob result contains just the fortran root and its chapters
            directory, so after the batch runs the steering queue must be empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test loudly if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test loudly if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # Reference guide that exists on disk: only directories, no content files.
            guides_root = temp_dir / "Loader" / "guides"
            fortran_root = guides_root / "fortran"
            chapters_dir = fortran_root / "chapters"
            chapters_dir.mkdir(parents=True)

            # Implementation plan declaring the nginx outputs that do not exist yet.
            nginx_root = guides_root / "nginx"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{nginx_root / 'chapters'}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            outstanding_items = [
                "First, examine the existing fortran guide structure and content",
                "Create the nginx directory structure",
                "Develop the main index.html file for nginx guide",
            ]
            dod.pending_items.extend(outstanding_items)

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"pattern": "**", "path": str(fortran_root)},
            )
            glob_outcome = tool_outcome(
                tool_call=glob_call,
                output=f"{fortran_root}\n{chapters_dir}",
                is_error=False,
            )
            fake_executor = FakeExecutor([glob_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages == []
      
        4320
        
        4321
        
        4322
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
            temp_dir: Path,
        ) -> None:
            """A blocked no-op edit of an index whose TOC matches the chapters queues nothing.

            The index links and link texts on disk agree with the chapter files, and
            the edit passes identical old/new strings, so the executor reports it as
            blocked. The steering queue must stay empty afterwards.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test loudly if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test loudly if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            prompt = "".join(
                [
                    "Have a look at ~/Loader/guides/fortran/index.html, then ",
                    "~/Loader/guides/fortran/chapters. The table of contents links in ",
                    "index.html are inaccurate and the href’s are wrong. Let’s update the ",
                    "links and their link texts to be correct.",
                ]
            )

            # Chapter files whose headings match the link texts used in the index.
            chapters = temp_dir / "chapters"
            chapters.mkdir()
            chapter_pages = (
                ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
                ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
            )
            for file_name, heading in chapter_pages:
                (chapters / file_name).write_text(heading)

            current_block = "".join(
                [
                    "<h2>Table of Contents</h2>\n",
                    '        <ul class="chapter-list">\n',
                    '            <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
                    '            <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
                    "        </ul>\n",
                ]
            )
            index_path = temp_dir / "index.html"
            index_path.write_text(current_block)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = prompt  # type: ignore[attr-defined]
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            # The edit replaces the TOC block with itself, i.e. it changes nothing.
            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_path),
                    "old_string": current_block,
                    "new_string": current_block,
                },
            )
            blocked_output = (
                "[Blocked - old_string and new_string are identical - no change "
                "would occur] Suggestion: Provide different old and new strings"
            )
            blocked_outcome = tool_outcome(
                tool_call=noop_edit,
                output=blocked_output,
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
            fake_executor = FakeExecutor([blocked_outcome])

            turn_summary = TurnSummary(final_response="")
            dod = create_definition_of_done(prompt)
            await runner.execute_batch(
                tool_calls=[noop_edit],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages == []
      
        4415
        
        4416
        
        4417
        def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
            temp_dir: Path,
        ) -> None:
            """The blocked no-op edit nudge names the repair target taken from the conversation.

            An assistant "Repair focus" message points at one chapter file; a blocked
            identical-string edit on that same file must queue a nudge that mentions
            the file and the expected guidance phrases.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test loudly if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test loudly if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            chapters_dir = temp_dir / "guide" / "chapters"
            repair_target = chapters_dir / "04-basic-usage.html"
            missing_chapter = chapters_dir / "05-advanced-topics.html"

            repair_focus = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
                ),
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[repair_focus],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(repair_target),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
            runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output)

            assert queued
            nudge = queued[0]
            expected_fragments = (
                str(repair_target),
                "no on-disk change",
                "replace the surrounding block",
                "Do not reopen unrelated reference materials",
            )
            for fragment in expected_fragments:
                assert fragment in nudge
      
        4475
        
        4476
        
        4477
        async def _noop_emit(event: AgentEvent) -> None:
            """Discard *event*; used as the ``emit`` callback by tests that ignore events."""
      
        4479
        
        4480
        
        4481
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``write`` moves the definition of done into a planned verification.

            After the batch, the definition of done must report a "planned" result
            with attempt 1 active, and the last workflow timeline entry must carry a
            matching "planned" observation.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test loudly if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test loudly if the runner ever asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(readme_path), "content": "updated\n"},
            )
            write_outcome = tool_outcome(
                tool_call=write_call,
                output="wrote file",
                is_error=False,
            )
            fake_executor = FakeExecutor([write_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Update README and verify it still works.")
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done.
            assert dod.last_verification_result == "planned"
            assert dod.verification_commands
            assert "Collect verification evidence" in dod.pending_items
            assert dod.active_verification_attempt_id == "verification-attempt-1"
            assert dod.active_verification_attempt_number == 1

            # State recorded on the most recent workflow timeline entry.
            latest_entry = summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_planned"
            assert latest_entry.policy_outcome == "planned"
            first_observation = latest_entry.verification_observations[0]
            assert first_observation.status == "planned"
            assert first_observation.attempt_id == "verification-attempt-1"
            assert first_observation.attempt_number == 1
      
        4552
        
        4553
        
        4554
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
            temp_dir: Path,
        ) -> None:
            """A ``mkdir -p`` of a planned directory does not start verification planning.

            After the batch, the definition of done must have no verification result,
            no "Collect verification evidence" item, and no "verification_planned"
            entry in the workflow timeline.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test loudly if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test loudly if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            # Plan that lists the chapters directory and the index as file changes.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            mkdir_call = ToolCall(
                id="mkdir-1",
                name="bash",
                arguments={"command": f"mkdir -p {chapters}"},
            )
            mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
            fake_executor = FakeExecutor([mkdir_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
            dod.implementation_plan = str(implementation_plan)
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert dod.last_verification_result is None
            assert "Collect verification evidence" not in dod.pending_items
            planned_entries = [
                entry
                for entry in summary.workflow_timeline
                if entry.reason_code == "verification_planned"
            ]
            assert not planned_entries
      
        4632
        
        4633
        
        4634
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A new ``write`` after a passed verification marks that verification stale.

            The definition of done starts with attempt 1 passed and its evidence
            recorded. After the batch, the result must be "stale", the evidence
            cleared, attempt 2 active, and the last timeline entry must carry a
            "stale" observation for attempt 1.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test loudly if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test loudly if the runner ever asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(readme_path), "content": "updated\n"},
            )
            write_outcome = tool_outcome(
                tool_call=write_call,
                output="wrote file",
                is_error=False,
            )
            fake_executor = FakeExecutor([write_outcome])
            summary = TurnSummary(final_response="")

            # Seed a definition of done whose first verification attempt already passed.
            verification_command = "uv run pytest -q"
            dod = create_definition_of_done("Update README and verify it still works.")
            dod.verification_commands = [verification_command]
            dod.last_verification_result = "passed"
            dod.verification_attempt_counter = 1
            dod.active_verification_attempt_id = "verification-attempt-1"
            dod.active_verification_attempt_number = 1
            passed_evidence = VerificationEvidence(
                command=verification_command,
                passed=True,
                stdout="401 passed",
                kind="test",
            )
            dod.evidence = [passed_evidence]
            dod.completed_items.append("Collect verification evidence")
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done.
            assert dod.last_verification_result == "stale"
            assert dod.evidence == []
            assert "Collect verification evidence" in dod.pending_items
            assert "Collect verification evidence" not in dod.completed_items
            assert dod.active_verification_attempt_id == "verification-attempt-2"
            assert dod.active_verification_attempt_number == 2

            # State recorded on the most recent workflow timeline entry.
            latest_entry = summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_stale"
            assert latest_entry.policy_outcome == "stale"
            first_observation = latest_entry.verification_observations[0]
            assert first_observation.status == "stale"
            assert first_observation.attempt_id == "verification-attempt-1"
            assert first_observation.attempt_number == 1
            assert first_observation.supersedes_attempt_id == "verification-attempt-2"
            assert first_observation.command == "uv run pytest -q"
      
        4728
        
        4729
        
        4730
        def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
            """Check the blocked active-repair nudge against the seeded repair focus.

            A single assistant "Repair focus" message names the repair target and a
            missing chapter file. After `_queue_blocked_active_repair_nudge` runs,
            the queued steering text must mention both paths and the
            "do not reopen" guidance.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "index.html"
            repair_focus = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '01-getting-started.html'}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[repair_focus],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            batch_runner._queue_blocked_active_repair_nudge(
                "[Blocked - active repair scope: verification already identified the repair target.]"
            )

            assert steering_messages
            nudge = steering_messages[0]
            missing_chapter = temp_dir / "guide" / "chapters" / "01-getting-started.html"
            assert str(repair_target) in nudge
            assert str(missing_chapter) in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        4776
        
        4777
        
        4778
        def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
            temp_dir: Path,
        ) -> None:
            """Check the blocked repair-mutation nudge lists the paths from the repair focus.

            The seeded "Repair focus" message names a chapter page and the
            stylesheet it references. The steering text queued by
            `_queue_blocked_active_repair_mutation_nudge` must contain both paths
            and the "before widening the change set" wording.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            repair_target = guide_dir / "chapters" / "05-advanced-configurations.html"
            stylesheet = guide_dir / "styles.css"
            repair_focus = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[repair_focus],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            batch_runner._queue_blocked_active_repair_mutation_nudge(
                "[Blocked - active repair mutation scope: verification already identified the repair target.]"
            )

            assert steering_messages
            nudge = steering_messages[0]
            for allowed_path in (repair_target, stylesheet):
                assert str(allowed_path) in nudge
            assert "before widening the change set" in nudge
      
        4827
        
        4828
        
        4829
        def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check the late-reference-drift nudge names the planned file still missing.

            The implementation plan lists an index page and three chapters, but only
            the index and the first two chapters are written to disk. The queued
            steering text must mention `03-first-website.html` and the
            "older reference materials" wording.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            store = DefinitionOfDoneStore(temp_dir)
            dod = create_definition_of_done("Create a multi-file guide from a reference")

            # Plan declares four artifacts; only three of them get created below.
            plan_text = (
                "# File Changes\n"
                "- `guide/index.html`\n"
                "- `guide/chapters/01-getting-started.html`\n"
                "- `guide/chapters/02-installation.html`\n"
                "- `guide/chapters/03-first-website.html`\n"
            )
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(plan_text)
            dod.implementation_plan = str(plan_path)

            chapters_dir = temp_dir / "guide" / "chapters"
            chapters_dir.mkdir(parents=True, exist_ok=True)
            (temp_dir / "guide" / "index.html").write_text("index")
            (chapters_dir / "01-getting-started.html").write_text("one")
            (chapters_dir / "02-installation.html").write_text("two")
            batch_runner = ToolBatchRunner(runtime_context, store)

            batch_runner._queue_blocked_late_reference_drift_nudge(
                "[Blocked - late reference drift: several planned artifacts already exist.]",
                dod=dod,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "03-first-website.html" in nudge
            assert "older reference materials" in nudge
      
        4881
        
        4882
        
        4883
        def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """Check the completed-artifact-scope nudge points at the pending verification todo.

            Every path listed in the implementation plan exists on disk, and the
            definition of done carries one pending "Verify ..." todo. The queued
            steering text must state that all planned artifacts exist, quote the
            pending todo, and include the "do not reopen" guidance.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # Lay out the complete guide tree on disk.
            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact.write_text(body)

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}`",
                f"- `{chapters}`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file guide from a reference")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            pending_verification_todo = {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(
                dod,
                [pending_verification_todo],
                project_root=temp_dir,
            )

            batch_runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
                dod=dod,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "Verify all guide files are linked and complete" in nudge
            assert "Do not reopen earlier reference materials." in nudge
      
        4963
        
        4964
        
        4965
        def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
            temp_dir: Path,
        ) -> None:
            """Check the HTML declared-target nudge repeats the closest declared target.

            A blocked `write` call for a chapter page is passed in together with a
            block message that ends by listing `chapters/02-installation.html` as
            the closest declared local target. The queued steering text must
            contain the written file's path, that target in backticks, and the
            "same file now" wording.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            blocked_write = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": str(temp_dir / "guide" / "chapters" / "01-introduction.html")},
            )
            block_reason = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
                "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
                "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
            )

            batch_runner._queue_blocked_html_declared_target_nudge(blocked_write, block_reason)

            assert steering_messages
            nudge = steering_messages[0]
            assert str(temp_dir / "guide" / "chapters" / "01-introduction.html") in nudge
            assert "`chapters/02-installation.html`" in nudge
            assert "same file now" in nudge
      
        5013
        
        5014
        
        5015
        @pytest.mark.asyncio
        async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
            temp_dir: Path,
        ) -> None:
            """Check that a blocked empty-path write is answered with a concrete next file.

            The plan lists an index page and two chapters; only the index and the
            first chapter exist. A `write` call with an empty `file_path` is run
            through `execute_batch` with a fake executor that reports it as
            BLOCKED. The queued steering text must name `02-installation.html`,
            and the context's recovery record must carry the block message.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # Guide tree with the second chapter deliberately left unwritten.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            index_path.write_text("<html></html>\n")
            chapter_one.write_text("<h1>Intro</h1>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            empty_path_write = ToolCall(
                id="write-2",
                name="write",
                arguments={"file_path": "", "content": "<html></html>\n"},
            )
            blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
            blocked_outcome = ToolExecutionOutcome(
                tool_call=empty_path_write,
                state=ToolExecutionState.BLOCKED,
                message=Message.tool_result_message(
                    tool_call_id=empty_path_write.id,
                    display_content=blocked_message,
                    result_content=blocked_message,
                    is_error=True,
                ),
                event_content=blocked_message,
                is_error=True,
                result_output=blocked_message,
            )
            fake_executor = FakeExecutor([blocked_outcome])

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.extend([str(index_path), str(chapter_one)])
            dod.pending_items.append("Creating Chapter 2: Installation and Setup")

            await batch_runner.execute_batch(
                tool_calls=[empty_path_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "did not provide a valid `file_path`" in nudge
            assert "Resume by creating `02-installation.html` now." in nudge
            expected_write_hint = f"Prefer one `write` call for `{chapter_two}` instead of more rereads."
            assert expected_write_hint in nudge
            assert runtime_context.recovery_context is not None
            assert runtime_context.recovery_context.attempts[-1].error == blocked_message