loader Public

Watch 0 Fork 0 Star 0
Python · 177638 bytes Raw Blame History
  
        1
        """Tests for tool-batch execution on RuntimeContext."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        from types import SimpleNamespace
      
        7
        
        8
        import pytest
      
        9
        
        10
        from loader.llm.base import Message, Role, ToolCall
      
        11
        from loader.runtime.context import RuntimeContext
      
        12
        from loader.runtime.dod import (
      
        13
            DefinitionOfDoneStore,
      
        14
            VerificationEvidence,
      
        15
            create_definition_of_done,
      
        16
        )
      
        17
        from loader.runtime.events import AgentEvent, TurnSummary
      
        18
        from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
      
        19
        from loader.runtime.permissions import (
      
        20
            PermissionMode,
      
        21
            build_permission_policy,
      
        22
            load_permission_rules,
      
        23
        )
      
        24
        from loader.runtime.reasoning_types import (
      
        25
            ActionVerification,
      
        26
            ConfidenceAssessment,
      
        27
            ConfidenceLevel,
      
        28
        )
      
        29
        from loader.runtime.recovery import RecoveryContext
      
        30
        from loader.runtime.tool_batches import (
      
        31
            ToolBatchRunner,
      
        32
        )
      
        33
        from loader.runtime.tool_batches import (
      
        34
            _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
      
        35
        )
      
        36
        from loader.runtime.workflow import sync_todos_to_definition_of_done
      
        37
        from loader.tools.base import ToolResult as RegistryToolResult
      
        38
        from loader.tools.base import create_default_registry
      
        39
        from tests.helpers.runtime_harness import ScriptedBackend
      
        40
        
        41
        
        42
        class FakeSession:
            """In-memory session double that stores messages and timeline entries in lists."""

            def __init__(self, messages: list[Message]) -> None:
                # Take a shallow copy so later appends never touch the caller's list.
                self.messages = [*messages]
                self.workflow_timeline = []

            def append(self, message: Message) -> None:
                """Add ``message`` to the end of the recorded conversation."""
                self.messages.append(message)

            def append_workflow_timeline_entry(self, entry) -> None:
                """Add ``entry`` to the end of the recorded workflow timeline."""
                self.workflow_timeline.append(entry)
      
        52
        
        53
        
        54
        class FakeCodeFilter:
            """Code-filter double whose ``reset`` does nothing."""

            def reset(self) -> None:
                """No-op: this double holds no state to clear."""
      
        57
        
        58
        
        59
        class FakeSafeguards:
            """Safeguards double: content passes through unchanged and steering is off.

            The only configurable behaviour is the value returned by
            ``detect_loop``, supplied through ``detect_loop_result``.
            """

            def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
                self._detect_loop_result = detect_loop_result
                self.code_filter = FakeCodeFilter()
                # Bare placeholder objects; they expose no behaviour of their own.
                self.action_tracker = object()
                self.validator = object()

            def filter_stream_chunk(self, content: str) -> str:
                """Return the streamed chunk exactly as received."""
                return content

            def filter_complete_content(self, content: str) -> str:
                """Return the complete content exactly as received."""
                return content

            def should_steer(self) -> bool:
                """Always report that no steering is needed."""
                return False

            def get_steering_message(self) -> str | None:
                """Always return ``None``: there is never a steering message."""

            def record_response(self, content: str) -> None:
                """Ignore the response; nothing is recorded."""

            def detect_text_loop(self, content: str) -> tuple[bool, str]:
                """Always report that no text loop was detected."""
                return (False, "")

            def detect_loop(self) -> tuple[bool, str]:
                """Return the loop-detection result configured at construction."""
                return self._detect_loop_result
      
        86
        
        87
        
        88
        class FakeExecutor:
            """Executor double that replays pre-queued outcomes in first-in, first-out order."""

            def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
                self.calls: list[ToolCall] = []
                # Private copy so popping outcomes never mutates the caller's list.
                self._outcomes = [*outcomes]

            async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
                """Record ``tool_call`` and hand back the next queued outcome.

                Extra keyword arguments are accepted and ignored.

                Raises:
                    AssertionError: if no outcome is left in the queue. The call
                        is still recorded in ``calls`` before the error is raised.
                """
                self.calls.append(tool_call)
                if self._outcomes:
                    return self._outcomes.pop(0)
                raise AssertionError("No fake tool outcome queued")
      
        98
        
        99
        
        100
        def build_context(
            *,
            temp_dir: Path,
            messages: list[Message],
            safeguards: FakeSafeguards,
            assess_confidence,
            verify_action,
            recovery_context: RecoveryContext | None = None,
            confidence_scoring: bool = False,
            verification: bool = False,
            auto_recover: bool = True,
            min_confidence_for_action: int = 3,
        ) -> RuntimeContext:
            """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` for tool-batch tests.

            Args:
                temp_dir: Directory used as project root, workspace root and the
                    location permission rules are loaded from.
                messages: Initial messages wrapped in a ``FakeSession``.
                safeguards: Safeguards double installed on the context.
                assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
                verify_action: Callable exposed as ``reasoning.verify_action``.
                recovery_context: Pre-existing recovery context, if any.
                confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
                verification: Value for ``config.reasoning.verification``.
                auto_recover: Value for ``config.auto_recover``.
                min_confidence_for_action: Value for
                    ``config.reasoning.min_confidence_for_action``.

            Returns:
                The constructed ``RuntimeContext`` in ``"execute"`` workflow mode
                with a ``WORKSPACE_WRITE`` permission policy.
            """
            tool_registry = create_default_registry(temp_dir)
            tool_registry.configure_workspace_root(temp_dir)

            permission_status = load_permission_rules(temp_dir)
            permission_policy = build_permission_policy(
                active_mode=PermissionMode.WORKSPACE_WRITE,
                workspace_root=temp_dir,
                tool_requirements=tool_registry.get_tool_requirements(),
                rules=permission_status.rules,
            )

            # Plain namespaces stand in for the real config and reasoning objects.
            reasoning_config = SimpleNamespace(
                rollback=False,
                show_rollback_plan=False,
                completion_check=True,
                max_continuation_prompts=5,
                self_critique=False,
                confidence_scoring=confidence_scoring,
                min_confidence_for_action=min_confidence_for_action,
                verification=verification,
            )
            runtime_config = SimpleNamespace(
                force_react=False,
                max_recovery_attempts=2,
                auto_recover=auto_recover,
                reasoning=reasoning_config,
            )
            reasoning_services = SimpleNamespace(
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )

            return RuntimeContext(
                project_root=temp_dir,
                backend=ScriptedBackend(),
                registry=tool_registry,
                session=FakeSession(messages),  # type: ignore[arg-type]
                config=runtime_config,
                capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
                project_context=None,
                permission_policy=permission_policy,
                permission_config_status=permission_status,
                workflow_mode="execute",
                safeguards=safeguards,
                reasoning=reasoning_services,
                recovery_context=recovery_context,
            )
      
        155
        
        156
        
        157
        def tool_outcome(
            *,
            tool_call: ToolCall,
            output: str,
            is_error: bool,
            state: ToolExecutionState = ToolExecutionState.EXECUTED,
            metadata: dict[str, object] | None = None,
        ) -> ToolExecutionOutcome:
            """Build a ``ToolExecutionOutcome`` whose every text field is ``output``.

            Args:
                tool_call: The call the outcome belongs to; its ``id`` is used for
                    the tool-result message.
                output: Text used for the message content, the event content, the
                    result output and the registry result.
                is_error: Error flag copied onto the message, the outcome and the
                    registry result.
                state: Execution state stored on the outcome.
                metadata: Registry-result metadata; a falsy value becomes ``{}``.
            """
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=output,
                result_content=output,
                is_error=is_error,
            )
            registry_result = RegistryToolResult(
                output=output,
                is_error=is_error,
                metadata=metadata if metadata else {},
            )
            return ToolExecutionOutcome(
                tool_call=tool_call,
                state=state,
                message=result_message,
                event_content=output,
                is_error=is_error,
                result_output=output,
                registry_result=registry_result,
            )
      
        183
        
        184
        
        185
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
            """A LOW confidence assessment must skip the tool call and leave a warning message."""
            seen: dict[str, str] = {}
            emitted: list[AgentEvent] = []

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Capture the context string so the test can inspect what the runner passed in.
                seen["context"] = context
                return ConfidenceAssessment(
                    action=f"{tool_name} with {tool_args}",
                    tool_name=tool_name,
                    tool_args=tool_args,
                    level=ConfidenceLevel.LOW,
                    reasoning="Need to inspect the target first.",
                    risks=["Unknown target file"],
                )

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for skipped actions")

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            runtime = build_context(
                temp_dir=temp_dir,
                messages=[
                    Message(role=Role.USER, content="Please inspect the project."),
                    Message(role=Role.ASSISTANT, content="I will read the file next."),
                ],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                confidence_scoring=True,
                min_confidence_for_action=3,
            )
            batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
            read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            # The queued outcome must stay unused: the confidence gate should block execution.
            fake_executor = FakeExecutor([tool_outcome(tool_call=read_call, output="unused", is_error=False)])

            batch_result = await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert batch_result.actions_taken == []
            assert fake_executor.calls == []
            assert "Please inspect the project." in seen["context"]
            last_message = runtime.session.messages[-1]
            assert last_message.role == Role.USER
            assert "[LOW CONFIDENCE WARNING]" in last_message.content
            assert any(event.type == "confidence" for event in emitted)
      
        244
        
        245
        
        246
        @pytest.mark.asyncio
        async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
            """A failed tool call with auto-recover on must set a recovery context and emit a recovery event."""
            emitted: list[AgentEvent] = []

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for failed actions")

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            runtime = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=True,
            )
            batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
            failing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
            failed_outcome = tool_outcome(tool_call=failing_call, output="command failed", is_error=True)
            fake_executor = FakeExecutor([failed_outcome])
            turn_summary = TurnSummary(final_response="")

            await batch_runner.execute_batch(
                tool_calls=[failing_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=turn_summary,
                dod=create_definition_of_done("Run tests"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert runtime.recovery_context is not None
            assert turn_summary.tool_result_messages
            # The last session message must be the same tool result the summary recorded.
            assert runtime.session.messages[-1] == turn_summary.tool_result_messages[-1]
            assert any(event.type == "recovery" for event in emitted)
      
        289
        
        290
        
        291
        @pytest.mark.asyncio
        async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
            """The ``tool_result`` event must carry the registry result's metadata."""
            emitted: list[AgentEvent] = []

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            runtime = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
            server_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "python -m http.server 8000", "background": True},
            )
            job_metadata = {
                "job_id": "bash-1",
                "status": "running",
                "background": True,
            }
            started_outcome = tool_outcome(
                tool_call=server_call,
                output="Started bash job bash-1",
                is_error=False,
                metadata=job_metadata,
            )
            fake_executor = FakeExecutor([started_outcome])

            await batch_runner.execute_batch(
                tool_calls=[server_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Launch a preview server"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Only the first tool_result event is inspected.
            result_event = next(event for event in emitted if event.type == "tool_result")
            assert result_event.tool_metadata == job_metadata
      
        349
        
        350
        
        351
        @pytest.mark.asyncio
        async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
            """With verification on, a successful read is verified and recorded on the recovery context."""
            verified_results: list[str] = []
            emitted: list[AgentEvent] = []

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                # Record what the runner asked to verify, then report a failed verification.
                verified_results.append(result)
                return ActionVerification(
                    tool_name=tool_name,
                    tool_args=tool_args,
                    expected_outcome="Success",
                    actual_result=result,
                    verified=False,
                    discrepancies=["File contents did not match"],
                    needs_correction=True,
                    correction_suggestion="Read the file before editing again.",
                )

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            existing_recovery = RecoveryContext(
                original_tool="edit",
                original_args={"file_path": "README.md"},
            )
            runtime = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                verification=True,
            )
            batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
            read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            read_outcome = tool_outcome(tool_call=read_call, output="file contents", is_error=False)
            fake_executor = FakeExecutor([read_outcome])

            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert verified_results == ["file contents"]
            assert runtime.recovery_context is existing_recovery
            assert existing_recovery.successful_steps == [
                ("read", {"file_path": "README.md"}),
            ]
            last_message = runtime.session.messages[-1]
            assert last_message.role == Role.TOOL
            assert last_message.content == "file contents"
            assert any(event.type == "verification" for event in emitted)
      
        414
        
        415
        
        416
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
            temp_dir: Path,
        ) -> None:
            """A successful ``ls`` during recovery keeps the recovery context and records the step."""

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Start from a recovery context that already has one failed read attempt.
            missing_file_args = {"file_path": "chapters/04-data-types.html"}
            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args=missing_file_args,
            )
            existing_recovery.add_attempt(
                "read",
                {"file_path": "chapters/04-data-types.html"},
                "File not found",
            )

            runtime = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
            listing_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "ls chapters"},
            )
            listing_outcome = tool_outcome(
                tool_call=listing_call,
                output="01-introduction.html",
                is_error=False,
            )
            fake_executor = FakeExecutor([listing_outcome])
            turn_summary = TurnSummary(final_response="")

            await batch_runner.execute_batch(
                tool_calls=[listing_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert runtime.recovery_context is existing_recovery
            assert existing_recovery.successful_steps == [
                ("bash", {"command": "ls chapters"}),
            ]
      
        482
        
        483
        
        484
        @pytest.mark.asyncio
        async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``patch`` during recovery must reset the recovery context to ``None``."""

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Start from a recovery context that already has one failed read attempt.
            missing_file_args = {"file_path": "chapters/04-data-types.html"}
            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args=missing_file_args,
            )
            existing_recovery.add_attempt(
                "read",
                {"file_path": "chapters/04-data-types.html"},
                "File not found",
            )

            runtime = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
            single_line_hunk = {
                "old_start": 1,
                "old_lines": 1,
                "new_start": 1,
                "new_lines": 1,
                "lines": ["-a", "+b"],
            }
            patch_call = ToolCall(
                id="patch-1",
                name="patch",
                arguments={
                    "file_path": "index.html",
                    "hunks": [single_line_hunk],
                },
            )
            patch_outcome = tool_outcome(
                tool_call=patch_call,
                output="Patched index.html",
                is_error=False,
            )
            fake_executor = FakeExecutor([patch_outcome])
            turn_summary = TurnSummary(final_response="")

            await batch_runner.execute_batch(
                tool_calls=[patch_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert runtime.recovery_context is None
      
        550
        
        551
        
        552
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_duplicate_observation_nudge(
            temp_dir: Path,
        ) -> None:
            """Check the steering nudge queued after a duplicate ``read`` outcome.

            The message history already holds a glob observation, a completed read
            of chapter 1 and an assistant read of ``index.html``.  The fake executor
            then reports a ``DUPLICATE`` outcome for another read of ``index.html``.
            The definition of done points at an implementation plan that lists
            ``04-variables.html``, a file this test never creates on disk.

            Expects exactly one persistent steering message containing the reuse
            hint plus the prompt to write the missing chapter, and no ephemeral
            steering messages.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call into confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call into action verification fails the test outright.
                raise AssertionError("Verification should not run for this scenario")

            chapters_dir = temp_dir / "chapters"
            index_path = temp_dir / "index.html"
            existing_chapters = (
                "01-introduction.html",
                "02-setup.html",
                "03-basics.html",
            )
            missing_chapter = chapters_dir / "04-variables.html"

            # Prior conversation: glob listing, a read of chapter 1 with its result,
            # and an assistant read of the index that has no result message yet.
            glob_listing = "\n".join(
                f"{temp_dir}/chapters/{chapter_name}" for chapter_name in existing_chapters
            )
            chapter_one_heading = "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            history = [
                Message(
                    role=Role.TOOL,
                    content="Observation [glob]: Result: " + glob_listing,
                    tool_results=[],
                ),
                Message(
                    role=Role.ASSISTANT,
                    content="I already inspected the first chapter title.",
                    tool_calls=[
                        ToolCall(
                            id="read-ch1",
                            name="read",
                            arguments={"file_path": str(chapters_dir / existing_chapters[0])},
                        )
                    ],
                ),
                Message.tool_result_message(
                    tool_call_id="read-ch1",
                    display_content=chapter_one_heading,
                    result_content=chapter_one_heading,
                ),
                Message(
                    role=Role.ASSISTANT,
                    content="I should update the index now.",
                    tool_calls=[
                        ToolCall(
                            id="read-index",
                            name="read",
                            arguments={"file_path": str(index_path)},
                        )
                    ],
                ),
            ]

            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
                auto_recover=False,
            )

            # On-disk fixture: the index and the first three chapters only.
            chapters_dir.mkdir()
            index_path.write_text("<ul></ul>\n")
            for chapter_name, heading in zip(existing_chapters, ("Intro", "Setup", "Basics")):
                (chapters_dir / chapter_name).write_text(f"<h1>{heading}</h1>\n")

            # The plan declares every existing file plus the chapter that is missing.
            plan_path = temp_dir / "implementation.md"
            planned_files = [
                index_path,
                *(chapters_dir / chapter_name for chapter_name in existing_chapters),
                missing_chapter,
            ]
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{planned}`" for planned in planned_files)
            plan_path.write_text("\n".join(plan_lines))

            context.session.current_task = (
                f"Update {index_path} with the right chapter links."
            )
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            repeat_read = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{index_path} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=repeat_read,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=repeat_read.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Fix the chapter links")
            dod.implementation_plan = str(plan_path)
            dod.pending_items.append("Create the remaining chapter files")

            await runner.execute_batch(
                tool_calls=[repeat_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_nudges) == 1
            nudge = persistent_nudges[0]
            expected_fragments = (
                "Reuse the earlier observation instead of repeating it.",
                "A declared output artifact is still missing.",
                "Resume by creating `04-variables.html` now.",
                f"Prefer one `write` call for `{missing_chapter}` instead of more rereads.",
            )
            for fragment in expected_fragments:
                assert fragment in nudge
            assert ephemeral_nudges == []
      
        700
        
        701
        
        702
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
            temp_dir: Path,
        ) -> None:
            """Check that a stale ``TodoWrite`` does not reopen a finished file todo.

            The batch holds a successful ``write`` of ``03-first-website.html``
            followed by a ``TodoWrite`` whose payload still lists both chapter todos
            as pending.  Both outcomes are canned by the fake executor; this test
            only creates the chapter's parent directory, not the file itself.

            Expects the chapter-3 todo in ``dod.completed_items`` and absent from
            ``dod.pending_items``, while the chapter-4 todo stays pending.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call into confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call into action verification fails the test outright.
                raise AssertionError("Verification should not run for this scenario")

            todo_subjects = ("03-first-website.html", "04-configuration-basics.html")

            def pending_todos() -> list[dict[str, str]]:
                # Fresh dicts on every call so the three consumers never share objects.
                return [
                    {
                        "content": f"Create {subject}",
                        "active_form": f"Creating {subject}",
                        "status": "pending",
                    }
                    for subject in todo_subjects
                ]

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            sync_todos_to_definition_of_done(dod, pending_todos())

            chapter_path = temp_dir.joinpath(
                "guides", "nginx", "chapters", "03-first-website.html"
            )
            chapter_path.parent.mkdir(parents=True)

            write_call = ToolCall(
                id="write-ch3",
                name="write",
                arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
            )
            stale_todo_call = ToolCall(
                id="todo-stale",
                name="TodoWrite",
                arguments={"todos": pending_todos()},
            )
            write_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_path}",
                is_error=False,
            )
            stale_todo_outcome = tool_outcome(
                tool_call=stale_todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": pending_todos()},
            )
            executor = FakeExecutor([write_outcome, stale_todo_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call, stale_todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            finished_todo = "Create 03-first-website.html"
            remaining_todo = "Create 04-configuration-basics.html"
            assert finished_todo in dod.completed_items
            assert finished_todo not in dod.pending_items
            assert remaining_todo in dod.pending_items
      
        819
        
        820
        
        821
        @pytest.mark.asyncio
        async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
            temp_dir: Path,
        ) -> None:
            """Run a successful ``glob`` over ``chapters/`` and inspect the fallout.

            Two chapter files and an ``index.html`` exist on disk, and the fake
            executor returns the two chapter paths as the glob output.

            Expects no persistent or ephemeral steering messages, exactly one tool
            result message, and no ``"Verified chapter inventory:"`` text in it.

            NOTE(review): the test name says the inventory is "proactively queued",
            but the assertions pin that nothing is queued or appended. Confirm
            which of the two reflects the intended behaviour.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call into confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call into action verification fails the test outright.
                raise AssertionError("Verification should not run for this scenario")

            chapters_dir = temp_dir / "chapters"
            index_path = temp_dir / "index.html"
            intro_path = chapters_dir / "01-introduction.html"
            setup_path = chapters_dir / "02-setup.html"

            # The on-disk fixture is created before the context is built.
            chapters_dir.mkdir()
            intro_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
            setup_path.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
            index_path.write_text("<ul></ul>\n")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
                auto_recover=False,
            )
            context.session.current_task = (
                f"Update {index_path} so the chapter links match the sibling files."
            )
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters_dir), "pattern": "*.html"},
            )
            glob_outcome = tool_outcome(
                tool_call=glob_call,
                output="\n".join(str(found) for found in (intro_path, setup_path)),
                is_error=False,
            )
            executor = FakeExecutor([glob_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_nudges == []
            assert ephemeral_nudges == []
            recorded_results = summary.tool_result_messages
            assert len(recorded_results) == 1
            assert "Verified chapter inventory:" not in recorded_results[0].content
      
        905
        
        906
        
        907
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
            temp_dir: Path,
        ) -> None:
            """Run a successful ``edit`` of ``index.html`` and inspect the fallout.

            Two chapter files exist on disk and ``index.html`` already contains the
            corrected chapter list, i.e. the state after the edit.  The fake
            executor reports the edit as successful.

            Expects no tool result message containing
            ``"Semantic verification preview:"`` and no persistent or ephemeral
            steering messages.

            NOTE(review): the test name says completion is "marked", but the
            assertions only pin the absence of previews and nudges. Confirm that
            this is the intended contract.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call into confidence scoring fails the test outright.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call into action verification fails the test outright.
                raise AssertionError("Verification should not run for this scenario")

            task = (
                "Update index.html so every chapter link and title matches the real HTML files in chapters/."
            )

            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            (chapters_dir / "01-introduction.html").write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            (chapters_dir / "02-setup.html").write_text(
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
            )

            index_path = temp_dir / "index.html"
            stale_toc = (
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
                '    <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
                "</ul>\n"
            )
            fresh_toc = (
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                "</ul>\n"
            )
            # The file on disk already holds the post-edit content.
            index_path.write_text(fresh_toc)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
                auto_recover=False,
            )
            context.session.current_task = task
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_path),
                    "old_string": stale_toc,
                    "new_string": fresh_toc,
                },
            )
            edit_outcome = tool_outcome(
                tool_call=edit_call,
                output=f"Successfully edited {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([edit_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done(task),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not any(
                "Semantic verification preview:" in recorded.content
                for recorded in summary.tool_result_messages
            )
            assert persistent_nudges == []
            assert ephemeral_nudges == []
      
        1007
        
        1008
        
        1009
        @pytest.mark.asyncio
      
        1010
        async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
      
        1011
            temp_dir: Path,
      
        1012
        ) -> None:
      
        1013
            async def assess_confidence(
      
        1014
                tool_name: str,
      
        1015
                tool_args: dict,
      
        1016
                context: str,
      
        1017
            ) -> ConfidenceAssessment:
      
        1018
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        1019
        
        1020
            async def verify_action(
      
        1021
                tool_name: str,
      
        1022
                tool_args: dict,
      
        1023
                result: str,
      
        1024
                expected: str = "",
      
        1025
            ) -> ActionVerification:
      
        1026
                raise AssertionError("Verification should not run for this scenario")
      
        1027
        
        1028
            chapters = temp_dir / "chapters"
      
        1029
            chapters.mkdir()
      
        1030
            (chapters / "01-introduction.html").write_text(
      
        1031
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
      
        1032
            )
      
        1033
            (chapters / "02-setup.html").write_text(
      
        1034
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
      
        1035
            )
      
        1036
            index_path = temp_dir / "index.html"
      
        1037
            index_path.write_text(
      
        1038
                "<h2>Table of Contents</h2>\n"
      
        1039
                '<ul class="chapter-list">\n'
      
        1040
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
      
        1041
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
      
        1042
                "</ul>\n"
      
        1043
            )
      
        1044
        
        1045
            prompt = (
      
        1046
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
      
        1047
                "for the structure and cadence of the guide. We are going to make an all "
      
        1048
                "new equally thorough guide on how to use the nginx tool."
      
        1049
            )
      
        1050
        
        1051
            context = build_context(
      
        1052
                temp_dir=temp_dir,
      
        1053
                messages=[],
      
        1054
                safeguards=FakeSafeguards(),
      
        1055
                assess_confidence=assess_confidence,
      
        1056
                verify_action=verify_action,
      
        1057
                auto_recover=False,
      
        1058
            )
      
        1059
            context.session.current_task = prompt  # type: ignore[attr-defined]
      
        1060
            persistent_messages: list[str] = []
      
        1061
            ephemeral_messages: list[str] = []
      
        1062
            context.queue_steering_message_callback = persistent_messages.append
      
        1063
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        1064
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        1065
            tool_call = ToolCall(
      
        1066
                id="read-index",
      
        1067
                name="read",
      
        1068
                arguments={"file_path": str(index_path)},
      
        1069
            )
      
        1070
            executor = FakeExecutor(
      
        1071
                [
      
        1072
                    tool_outcome(
      
        1073
                        tool_call=tool_call,
      
        1074
                        output=index_path.read_text(),
      
        1075
                        is_error=False,
      
        1076
                    )
      
        1077
                ]
      
        1078
            )
      
        1079
        
        1080
            summary = TurnSummary(final_response="")
      
        1081
            await runner.execute_batch(
      
        1082
                tool_calls=[tool_call],
      
        1083
                tool_source="assistant",
      
        1084
                pending_tool_calls_seen=set(),
      
        1085
                emit=_noop_emit,
      
        1086
                summary=summary,
      
        1087
                dod=create_definition_of_done(prompt),
      
        1088
                executor=executor,  # type: ignore[arg-type]
      
        1089
                on_confirmation=None,
      
        1090
                on_user_question=None,
      
        1091
                emit_confirmation=None,
      
        1092
                consecutive_errors=0,
      
        1093
            )
      
        1094
        
        1095
            assert persistent_messages == []
      
        1096
            assert ephemeral_messages == []
      
        1097
            assert all(
      
        1098
                "Semantic verification preview:" not in message.content
      
        1099
                for message in summary.tool_result_messages
      
        1100
            )
      
        1101
        
        1102
        
        1103
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
            temp_dir: Path,
        ) -> None:
            """Check steering after a successful read of a reference chapter.

            Three todos start as pending and the implementation plan lists the
            nginx ``chapters/`` directory before ``index.html``. After the read
            batch runs, the test expects the first todo to be recorded as
            completed, a persistent steering message that names the next pending
            todo and tells the agent to create ``chapters/``, no mention of the
            reference file name in any persistent message, and no ephemeral
            steering at all.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Any call here means confidence scoring ran when it should not have.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here means action verification ran when it should not have.
                raise AssertionError("Verification should not run for this scenario")

            # Reference material that the tool call "reads".
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir.joinpath("fortran", "chapters", "01-introduction.html")
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            # Planned output locations; none of them are created on disk.
            nginx_root = temp_dir.joinpath("Loader", "guides", "nginx")
            chapters = nginx_root / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both kinds of steering messages for the assertions below.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            todo_titles = (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "Create the nginx directory structure",
                "Create the nginx index.html file",
            )
            todos = [
                {"content": title, "active_form": f"Working on: {title}", "status": "pending"}
                for title in todo_titles
            ]
            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_path)
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            canned_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            expected_next_item = (
                "Continue with the next pending item: `Create the nginx directory structure`"
            )
            expected_resume = "Resume by creating `chapters/` now."

            assert todo_titles[0] in dod.completed_items
            assert any(expected_next_item in note for note in persistent_messages)
            assert any(expected_resume in note for note in persistent_messages)
            assert not any("01-introduction.html" in note for note in persistent_messages)
            assert ephemeral_messages == []
      
        1221
        
        1222
        
        1223
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
            temp_dir: Path,
        ) -> None:
            """Check that steering targets ``chapters/`` even when the plan lists ``index.html`` first.

            The setup differs from the discovery-progress scenario in two ways:
            the implementation plan names ``index.html`` ahead of ``chapters/``,
            and the todos are synced with ``project_root=temp_dir``. The
            persistent steering must still name the directory-structure todo,
            tell the agent to create ``chapters/``, and never say
            "Next step: create `index.html`.".
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Any call here means confidence scoring ran when it should not have.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here means action verification ran when it should not have.
                raise AssertionError("Verification should not run for this scenario")

            # Reference material that the tool call "reads".
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir.joinpath("fortran", "chapters", "01-introduction.html")
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            # The plan deliberately lists the index file before the chapters directory.
            nginx_root = temp_dir.joinpath("Loader", "guides", "nginx")
            chapters = nginx_root / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{chapters}/`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both kinds of steering messages for the assertions below.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            todo_titles = (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "Create the nginx directory structure",
                "Create the nginx index.html file",
            )
            todos = [
                {"content": title, "active_form": f"Working on: {title}", "status": "pending"}
                for title in todo_titles
            ]
            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_path)
            sync_todos_to_definition_of_done(dod, todos, project_root=temp_dir)

            read_call = ToolCall(
                id="read-reference-index-first",
                name="read",
                arguments={"file_path": str(reference)},
            )
            canned_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            expected_next_item = (
                "Continue with the next pending item: `Create the nginx directory structure`"
            )
            expected_resume = "Resume by creating `chapters/` now."
            forbidden_hint = "Next step: create `index.html`."

            assert persistent_messages
            assert any(expected_next_item in note for note in persistent_messages)
            assert any(expected_resume in note for note in persistent_messages)
            assert not any(forbidden_hint in note for note in persistent_messages)
            assert ephemeral_messages == []
      
        1342
        
        1343
        
        1344
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
            temp_dir: Path,
        ) -> None:
            """Check steering when the executor reports a read as a duplicate.

            The conversation already holds a tool observation of the reference
            page, the first todo is already completed, and the fake executor
            returns a DUPLICATE outcome. Exactly one persistent steering message
            is expected: it tells the agent to reuse the earlier observation,
            names the next pending todo, and contains no "Update `" hint. No
            ephemeral steering is expected.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Any call here means confidence scoring ran when it should not have.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here means action verification ran when it should not have.
                raise AssertionError("Verification should not run for this scenario")

            prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )

            reference_html = "<h1>Fortran Beginner's Guide</h1>\n"
            reference = temp_dir.joinpath("fortran", "index.html")
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            # Seed the history with a prior observation of the same page.
            prior_observation = Message(
                role=Role.TOOL,
                content=f"Observation [read]: Result: {reference_html}",
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[prior_observation],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = prompt
            # Capture both kinds of steering messages for the assertions below.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The discovery todo is already done; the two creation todos are pending.
            todo_states = (
                (
                    "Examine the existing Fortran guide structure to understand the cadence and format",
                    "completed",
                ),
                ("Create the nginx directory structure", "pending"),
                ("Create the nginx index.html file", "pending"),
            )
            todos = [
                {"content": title, "active_form": f"Working on: {title}", "status": status}
                for title, status in todo_states
            ]
            dod = create_definition_of_done(prompt)
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(reference)},
            )
            duplicate_message = (
                f"[Skipped - duplicate action: Already read {reference} recently "
                "without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skipped_result = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=duplicate_message,
                result_content=duplicate_message,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=skipped_result,
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            steering = persistent_messages[0]
            assert "Reuse the earlier observation instead of repeating it." in steering
            assert (
                "Continue with the next pending item: `Create the nginx directory structure`"
                in steering
            )
            assert "Update `" not in steering
            assert ephemeral_messages == []
      
        1466
        
        1467
        
        1468
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check that steering names a concrete missing file rather than the next todo.

            The nginx guide already has ``chapters/01-introduction.html`` on
            disk, and that file is recorded in ``dod.touched_files``. The plan
            also lists ``index.html``, which is never created. After a successful
            read of the Fortran reference chapter, the persistent steering must
            confirm progress on the examine todo and say
            "Resume by creating `index.html` now.". It must not say "Continue
            with the next pending item" for the chapter-creation todo.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Any call here means confidence scoring ran when it should not have.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here means action verification ran when it should not have.
                raise AssertionError("Verification should not run for this scenario")

            guides = temp_dir.joinpath("Loader", "guides")

            # Partially built nginx guide: chapter one exists, index.html does not.
            guide_root = guides / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            chapter_one = chapters / "01-introduction.html"
            chapter_one.write_text("<html></html>\n")
            index_path = guide_root / "index.html"

            # Fortran reference chapter that the tool call "reads".
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = guides.joinpath("fortran", "chapters", "01-introduction.html")
            reference.parent.mkdir(parents=True, exist_ok=True)
            reference.write_text(reference_html)

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapters / '02-installation.html'}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both kinds of steering messages for the assertions below.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            todo_titles = (
                "Examine the existing Fortran guide structure to understand the format and cadence",
                "Create each chapter file with appropriate content",
                "Ensure all files follow the same structure and style as the Fortran guide",
            )
            todos = [
                {"content": title, "active_form": f"Working on: {title}", "status": "pending"}
                for title in todo_titles
            ]
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.touched_files.append(str(chapter_one))
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-reference-chapter",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_output = f"Observation [read]: Result: {reference_html}"
            read_result = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=read_output,
                result_content=read_output,
            )
            executed_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.EXECUTED,
                message=read_result,
                event_content=read_output,
                is_error=False,
                result_output=read_output,
            )
            executor = FakeExecutor([executed_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            expected_progress = (
                "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
            )
            expected_resume = "Resume by creating `index.html` now."
            unwanted_next_item = (
                "Continue with the next pending item: `Create each chapter file with appropriate content`"
            )

            assert persistent_messages
            assert any(expected_progress in note for note in persistent_messages)
            assert any(expected_resume in note for note in persistent_messages)
            assert all(unwanted_next_item not in note for note in persistent_messages)
            assert ephemeral_messages == []
      
        1602
        
        1603
        
        1604
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
            temp_dir: Path,
        ) -> None:
            """Duplicate read with all planned files on disk must not steer toward an unplanned file.

            The implementation plan lists only artifacts that this test creates up
            front, while the pending items also mention ``07-performance-tuning.html``,
            which the plan never names. The test expects exactly one persistent
            steering message that contains the verification item and omits the
            unplanned creation item, and no ephemeral steering at all.
            """

            # Both hooks fail loudly: neither is expected to be invoked here.
            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Create every artifact that the implementation plan refers to.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, html in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            planned_entries = (
                f"{guide_root}/",
                f"{chapters}/",
                f"{index_path}",
                f"{chapter_one}",
                f"{chapter_two}",
            )
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in planned_entries),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Verify all guide files are linked and complete",
                "Complete the requested work",
            ]

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {chapter_one} "
                "recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            steering = persistent_messages[0]
            assert "Verify all guide files are linked and complete" in steering
            assert "Create 07-performance-tuning.html" not in steering
            assert ephemeral_messages == []
      
        1718
        
        1719
        
        1720
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Duplicate read with all planned files on disk must hand off to verification.

            Every artifact named in the implementation plan is created before the
            batch runs, and the definition of done carries a verification command.
            The test expects exactly one persistent steering message stating that
            the planned artifacts exist and that work should move to verification
            or final confirmation, without mentioning the unplanned
            ``07-performance-tuning.html`` item, and no ephemeral steering.
            """

            # Both hooks fail loudly: neither is expected to be invoked here.
            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Create every artifact that the implementation plan refers to.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, html in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            planned_entries = (
                f"{guide_root}/",
                f"{chapters}/",
                f"{index_path}",
                f"{chapter_one}",
                f"{chapter_two}",
            )
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in planned_entries),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Complete the requested work",
            ]

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {chapter_one} "
                "recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            steering = persistent_messages[0]
            assert "All explicitly planned artifacts already exist." in steering
            assert (
                "Move to verification or final confirmation using the files already on disk."
                in steering
            )
            assert "Create 07-performance-tuning.html" not in steering
            assert ephemeral_messages == []
      
        1838
        
        1839
        
        1840
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
            temp_dir: Path,
        ) -> None:
            """Duplicate read must not resurface creation todos for files that already exist.

            The pending items still say "Create 01-getting-started.html" and
            "Creating 02-installation.html" even though this test writes both
            files before the batch runs. The test expects exactly one persistent
            steering message that announces the planned artifacts exist and hands
            off to verification or final confirmation, with neither stale todo
            quoted in it, and no ephemeral steering.
            """

            # Both hooks fail loudly: neither is expected to be invoked here.
            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Create every artifact that the implementation plan refers to.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, html in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            planned_entries = (
                f"{guide_root}/",
                f"{chapters}/",
                f"{index_path}",
                f"{chapter_one}",
                f"{chapter_two}",
            )
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in planned_entries),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            # The first two items refer to files written above, i.e. stale todos.
            dod.pending_items = [
                "Create 01-getting-started.html",
                "Creating 02-installation.html",
                "Complete the requested work",
            ]

            read_call = ToolCall(
                id="read-dup-built-stale",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {chapter_one} "
                "recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            steering = persistent_messages[0]
            assert "All explicitly planned artifacts already exist." in steering
            assert (
                "Move to verification or final confirmation using the files already on disk."
                in steering
            )
            assert "Create 01-getting-started.html" not in steering
            assert "Creating 02-installation.html" not in steering
            assert ephemeral_messages == []
      
        1960
        
        1961
        
        1962
        @pytest.mark.asyncio
        async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
            temp_dir: Path,
        ) -> None:
            """A successful reference read must steer toward the pending creation todo.

            Two todos are synced into the definition of done: an "examine the
            reference guide" item followed by "Create the nginx index.html file".
            After a non-error ``read`` of the reference chapter, the test expects
            the persistent steering queue to name the creation todo as the next
            pending item and to tell the agent to stop gathering reference
            material, while the ephemeral queue stays empty.
            """

            # Both hooks fail loudly: neither is expected to be invoked here.
            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # The same markup is written to disk and returned as the read output.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            todo_titles = (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "Create the nginx index.html file",
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": title,
                        "active_form": f"Working on: {title}",
                        "status": "pending",
                    }
                    for title in todo_titles
                ],
            )

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            next_item_hint = "Continue with the next pending item: `Create the nginx index.html file`"
            stop_reading_hint = "stop gathering more reference material and perform the change now"
            assert any(next_item_hint in queued for queued in persistent_messages)
            assert any(stop_reading_hint in queued for queued in persistent_messages)
            assert ephemeral_messages == []
      
        2054
        
        2055
        
        2056
        @pytest.mark.asyncio
      
        2057
        async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
      
        2058
            temp_dir: Path,
      
        2059
        ) -> None:
      
        2060
            async def assess_confidence(
      
        2061
                tool_name: str,
      
        2062
                tool_args: dict,
      
        2063
                context: str,
      
        2064
            ) -> ConfidenceAssessment:
      
        2065
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        2066
        
        2067
            async def verify_action(
      
        2068
                tool_name: str,
      
        2069
                tool_args: dict,
      
        2070
                result: str,
      
        2071
                expected: str = "",
      
        2072
            ) -> ActionVerification:
      
        2073
                raise AssertionError("Verification should not run for this scenario")
      
        2074
        
        2075
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
      
        2076
            reference.parent.mkdir(parents=True)
      
        2077
            reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")
      
        2078
        
        2079
            context = build_context(
      
        2080
                temp_dir=temp_dir,
      
        2081
                messages=[],
      
        2082
                safeguards=FakeSafeguards(),
      
        2083
                assess_confidence=assess_confidence,
      
        2084
                verify_action=verify_action,
      
        2085
                auto_recover=False,
      
        2086
            )
      
        2087
            persistent_messages: list[str] = []
      
        2088
            ephemeral_messages: list[str] = []
      
        2089
            context.queue_steering_message_callback = persistent_messages.append
      
        2090
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        2091
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        2092
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        2093
            sync_todos_to_definition_of_done(
      
        2094
                dod,
      
        2095
                [
      
        2096
                    {
      
        2097
                        "content": "First, examine the existing fortran guide structure and content",
      
        2098
                        "active_form": "Working on: First, examine the existing fortran guide structure and content",
      
        2099
                        "status": "pending",
      
        2100
                    },
      
        2101
                    {
      
        2102
                        "content": "Create the nginx directory structure",
      
        2103
                        "active_form": "Working on: Create the nginx directory structure",
      
        2104
                        "status": "pending",
      
        2105
                    },
      
        2106
                ],
      
        2107
            )
      
        2108
            tool_call = ToolCall(
      
        2109
                id="read-reference",
      
        2110
                name="read",
      
        2111
                arguments={"file_path": str(reference)},
      
        2112
            )
      
        2113
            executor = FakeExecutor(
      
        2114
                [
      
        2115
                    tool_outcome(
      
        2116
                        tool_call=tool_call,
      
        2117
                        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
      
        2118
                        is_error=False,
      
        2119
                    )
      
        2120
                ]
      
        2121
            )
      
        2122
        
        2123
            summary = TurnSummary(final_response="")
      
        2124
            await runner.execute_batch(
      
        2125
                tool_calls=[tool_call],
      
        2126
                tool_source="assistant",
      
        2127
                pending_tool_calls_seen=set(),
      
        2128
                emit=_noop_emit,
      
        2129
                summary=summary,
      
        2130
                dod=dod,
      
        2131
                executor=executor,  # type: ignore[arg-type]
      
        2132
                on_confirmation=None,
      
        2133
                on_user_question=None,
      
        2134
                emit_confirmation=None,
      
        2135
                consecutive_errors=0,
      
        2136
            )
      
        2137
        
        2138
            assert persistent_messages
      
        2139
            assert any(
      
        2140
                "Continue with the next pending item: `Create the nginx directory structure`"
      
        2141
                in message
      
        2142
                for message in persistent_messages
      
        2143
            )
      
        2144
            assert ephemeral_messages == []
      
        2145
        
        2146
        
        2147
        @pytest.mark.asyncio
        async def test_tool_batch_runner_missing_artifact_nudge_stays_quiet_after_setup_mkdir(
            temp_dir: Path,
        ) -> None:
            """Run one successful ``mkdir -p`` bash call and expect no steering messages.

            The implementation plan lists the chapters directory and ``index.html``
            under ``temp_dir / "Loader" / "guides" / "nginx"``; the test itself
            creates neither on disk. After the batch, both the persistent and the
            ephemeral steering queues must still be empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire stub: raises as soon as it is awaited.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire stub: raises as soon as it is awaited.
                raise AssertionError("Verification should not run for this scenario")

            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            planned_index = guide_root / "index.html"

            # Same text the plan file always had: heading, blank line, file list,
            # trailing newline.
            plan_path = temp_dir / "implementation.md"
            plan_text = (
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{chapters_dir}/`\n"
                f"- `{planned_index}`\n"
            )
            plan_path.write_text(plan_text)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture whatever the runner queues on either steering channel.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append

            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)

            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Create the nginx directory structure",
                        "Creating the nginx directory structure",
                    ),
                    (
                        "Develop the main index.html file with proper structure",
                        "Developing the main index.html file with proper structure",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            mkdir_call = ToolCall(
                id="mkdir-nginx",
                name="bash",
                arguments={"command": f"mkdir -p {chapters_dir}"},
            )
            scripted_outcome = tool_outcome(
                tool_call=mkdir_call,
                output="",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent == []
            assert ephemeral == []
      
        2245
        
        2246
        
        2247
        @pytest.mark.asyncio
        async def test_tool_batch_runner_first_file_handoff_stays_persistent(
            temp_dir: Path,
        ) -> None:
            """Write ``index.html`` and expect a persistent handoff to the first chapter.

            The chapters directory exists up front, and the plan lists the
            directory, ``index.html`` and ``chapters/01-introduction.html``. After a
            successful ``write`` of the index, the last persistent steering message
            must contain the handoff phrases checked below, and the ephemeral queue
            must be empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire stub: raises as soon as it is awaited.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire stub: raises as soon as it is awaited.
                raise AssertionError("Verification should not run for this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_path = guide_root / "index.html"
            intro_path = chapters_dir / "01-introduction.html"

            plan_path = temp_dir / "implementation.md"
            plan_text = (
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{chapters_dir}/`\n"
                f"- `{index_path}`\n"
                f"- `{intro_path}`\n"
            )
            plan_path.write_text(plan_text)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture whatever the runner queues on either steering channel.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append

            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)

            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Create the main index.html file with proper structure",
                        "Creating the main index.html file with proper structure",
                    ),
                    (
                        "Create each chapter file with appropriate content",
                        "Creating each chapter file with appropriate content",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            write_call = ToolCall(
                id="write-index",
                name="write",
                arguments={
                    "file_path": str(index_path),
                    "content": "<html></html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent
            handoff = persistent[-1]
            assert "Confirmed progress:" in handoff
            assert "Next step: create `01-introduction.html`." in handoff

            # The message is expected to quote the resolved path of the next file.
            resolved_intro = intro_path.resolve(strict=False)
            write_hint = (
                f"Prefer one `write(file_path=..., content=...)` call for `{resolved_intro}` now."
            )
            assert write_hint in handoff
            assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
            assert ephemeral == []
      
        2360
        
        2361
        
        2362
        @pytest.mark.asyncio
        async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt(
            temp_dir: Path,
        ) -> None:
            """Write ``index.html`` after a recovery prompt and expect an ephemeral nudge.

            The conversation already holds a user message starting with
            ``[EMPTY ASSISTANT RESPONSE]``. After a successful ``write`` of the
            index, nothing may be queued persistently; the last ephemeral message
            must ask to resume by creating ``01-introduction.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire stub: raises as soon as it is awaited.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire stub: raises as soon as it is awaited.
                raise AssertionError("Verification should not run for this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_path = guide_root / "index.html"
            intro_path = chapters_dir / "01-introduction.html"

            plan_path = temp_dir / "implementation.md"
            plan_text = (
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{chapters_dir}/`\n"
                f"- `{index_path}`\n"
                f"- `{intro_path}`\n"
            )
            plan_path.write_text(plan_text)

            # Prior user turn seeded into the context before the batch runs.
            recovery_prompt = Message(
                role=Role.USER,
                content=(
                    "[EMPTY ASSISTANT RESPONSE]\n"
                    "Respond with that concrete mutation tool call now. Do not return an empty response."
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[recovery_prompt],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture whatever the runner queues on either steering channel.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append

            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)

            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Create the main index.html file with proper structure",
                        "Creating the main index.html file with proper structure",
                    ),
                    (
                        "Create each chapter file with appropriate content",
                        "Creating each chapter file with appropriate content",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            write_call = ToolCall(
                id="write-index-recovered",
                name="write",
                arguments={
                    "file_path": str(index_path),
                    "content": "<html></html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent == []
            assert ephemeral
            nudge = ephemeral[-1]
            assert "Resume by creating `01-introduction.html` now." in nudge
      
        2477
        
        2478
        
        2479
        @pytest.mark.asyncio
        async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
            temp_dir: Path,
        ) -> None:
            """Expect the duplicate-read nudge to name the still-missing planned chapter.

            ``index.html`` and the first chapter exist on disk, while the planned
            ``06-ssl-configuration.html`` does not. The first pending todo is a
            review-style item. The nudge queued for a repeated ``read`` of the index
            must mention the missing chapter and must not steer toward that review
            item.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire stub: raises as soon as it is awaited.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire stub: raises as soon as it is awaited.
                raise AssertionError("Verification should not run for this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_path = guide_root / "index.html"
            first_chapter = chapters_dir / "01-getting-started.html"
            missing_chapter = chapters_dir / "06-ssl-configuration.html"

            # Only the first chapter and the index are written; the last planned
            # chapter is deliberately left absent.
            first_chapter.write_text("<h1>One</h1>\n")
            index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

            plan_path = temp_dir / "implementation.md"
            plan_text = (
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{index_path}`\n"
                f"- `{first_chapter}`\n"
                f"- `{missing_chapter}`\n"
            )
            plan_path.write_text(plan_text)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture whatever the runner queues on either steering channel.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append

            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)

            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Ensure all files are properly linked and formatted consistently",
                        "Working on: Ensure all files are properly linked and formatted consistently",
                    ),
                    (
                        "Create the final chapter (06-ssl-configuration.html)",
                        "Working on: Create the final chapter (06-ssl-configuration.html)",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            # Check the helper's decision directly before exercising the runner.
            should_prioritize = tool_batches_should_prioritize_missing_artifact(
                dod=dod,
                next_pending=dod.pending_items[0],
                missing_artifact=(missing_chapter, False),
                project_root=temp_dir,
            )
            assert should_prioritize

            repeat_read = ToolCall(
                id="dup-read",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            runner._queue_duplicate_observation_nudge(repeat_read, dod=dod)  # type: ignore[attr-defined]

            assert persistent
            nudge = persistent[-1]
            assert "06-ssl-configuration.html" in nudge
            assert "Do not switch into review or consistency-check mode" in nudge

            review_handoff = (
                "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
            )
            assert review_handoff not in nudge
      
        2573
        
        2574
        
        2575
        @pytest.mark.asyncio
      
        2576
        async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
      
        2577
            temp_dir: Path,
      
        2578
        ) -> None:
      
        2579
            async def assess_confidence(
      
        2580
                tool_name: str,
      
        2581
                tool_args: dict,
      
        2582
                context: str,
      
        2583
            ) -> ConfidenceAssessment:
      
        2584
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        2585
        
        2586
            async def verify_action(
      
        2587
                tool_name: str,
      
        2588
                tool_args: dict,
      
        2589
                result: str,
      
        2590
                expected: str = "",
      
        2591
            ) -> ActionVerification:
      
        2592
                raise AssertionError("Verification should not run for this scenario")
      
        2593
        
        2594
            guide_root = temp_dir / "guides" / "nginx"
      
        2595
            chapters = guide_root / "chapters"
      
        2596
            chapters.mkdir(parents=True)
      
        2597
            index_path = guide_root / "index.html"
      
        2598
            chapter_one = chapters / "01-getting-started.html"
      
        2599
            chapter_two = chapters / "02-installation.html"
      
        2600
            index_path.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")
      
        2601
            chapter_one.write_text("<h1>One</h1>\n")
      
        2602
            chapter_two.write_text("<h1>Two</h1>\n")
      
        2603
        
        2604
            implementation_plan = temp_dir / "implementation.md"
      
        2605
            implementation_plan.write_text(
      
        2606
                "\n".join(
      
        2607
                    [
      
        2608
                        "# Implementation Plan",
      
        2609
                        "",
      
        2610
                        "## File Changes",
      
        2611
                        f"- `{chapters}/`",
      
        2612
                        f"- `{index_path}`",
      
        2613
                        f"- `{chapter_one}`",
      
        2614
                        f"- `{chapter_two}`",
      
        2615
                        "",
      
        2616
                    ]
      
        2617
                )
      
        2618
            )
      
        2619
        
        2620
            context = build_context(
      
        2621
                temp_dir=temp_dir,
      
        2622
                messages=[],
      
        2623
                safeguards=FakeSafeguards(),
      
        2624
                assess_confidence=assess_confidence,
      
        2625
                verify_action=verify_action,
      
        2626
                auto_recover=False,
      
        2627
            )
      
        2628
            persistent_messages: list[str] = []
      
        2629
            ephemeral_messages: list[str] = []
      
        2630
            context.queue_steering_message_callback = persistent_messages.append
      
        2631
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        2632
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        2633
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        2634
            dod.implementation_plan = str(implementation_plan)
      
        2635
            sync_todos_to_definition_of_done(
      
        2636
                dod,
      
        2637
                [
      
        2638
                    {
      
        2639
                        "content": "Create the guide files",
      
        2640
                        "active_form": "Working on: Create the guide files",
      
        2641
                        "status": "completed",
      
        2642
                    },
      
        2643
                    {
      
        2644
                        "content": "Ensure all files are properly linked and formatted consistently",
      
        2645
                        "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
      
        2646
                        "status": "pending",
      
        2647
                    },
      
        2648
                ],
      
        2649
            )
      
        2650
            tool_call = ToolCall(
      
        2651
                id="write-final",
      
        2652
                name="write",
      
        2653
                arguments={
      
        2654
                    "file_path": str(chapter_two),
      
        2655
                    "content": "<h1>Two</h1>\n",
      
        2656
                },
      
        2657
            )
      
        2658
            executor = FakeExecutor(
      
        2659
                [
      
        2660
                    tool_outcome(
      
        2661
                        tool_call=tool_call,
      
        2662
                        output=f"Successfully wrote {chapter_two}",
      
        2663
                        is_error=False,
      
        2664
                    )
      
        2665
                ]
      
        2666
            )
      
        2667
        
        2668
            summary = TurnSummary(final_response="")
      
        2669
            await runner.execute_batch(
      
        2670
                tool_calls=[tool_call],
      
        2671
                tool_source="assistant",
      
        2672
                pending_tool_calls_seen=set(),
      
        2673
                emit=_noop_emit,
      
        2674
                summary=summary,
      
        2675
                dod=dod,
      
        2676
                executor=executor,  # type: ignore[arg-type]
      
        2677
                on_confirmation=None,
      
        2678
                on_user_question=None,
      
        2679
                emit_confirmation=None,
      
        2680
                consecutive_errors=0,
      
        2681
            )
      
        2682
        
        2683
            assert any(
      
        2684
                "All explicitly planned artifacts now exist." in message
      
        2685
                for message in persistent_messages
      
        2686
            )
      
        2687
            assert any(
      
        2688
                "Ensure all files are properly linked and formatted consistently" in message
      
        2689
                for message in persistent_messages
      
        2690
            )
      
        2691
            assert any(
      
        2692
                "Move to verification once no specific mismatch remains." in message
      
        2693
                for message in persistent_messages
      
        2694
            )
      
        2695
        
        2696
        
        2697
        @pytest.mark.asyncio
        async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check the handoff after writing ``index.html`` names the next planned file.

            The implementation plan lists the index plus two chapter files, but only
            the index exists on disk. The last persistent steering message must point
            at ``01-getting-started.html`` and must not ask for a ``TodoWrite`` refresh.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # This hook is expected to stay untouched for the scenario under test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # This hook is expected to stay untouched for the scenario under test.
                raise AssertionError("Verification should not run in this scenario")

            # Lay out the guide tree: only the index is written, both chapters stay missing.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapters_dir.mkdir()
            site_index = nginx_dir / "index.html"
            site_index.write_text("<html></html>\n")
            first_chapter = chapters_dir / "01-getting-started.html"
            second_chapter = chapters_dir / "02-installation.html"

            # The plan file enumerates every artifact under "## File Changes".
            planned_entries = (f"{nginx_dir}/", site_index, first_chapter, second_chapter)
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in planned_entries),
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_notes: list[str] = []
            ephemeral_notes: list[str] = []
            context.queue_steering_message_callback = persistent_notes.append
            context.queue_ephemeral_steering_message_callback = ephemeral_notes.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            pending_steps = (
                "Create the main index.html file with proper structure",
                "Create each chapter file in sequence, following the established pattern",
                "Ensure all files are properly linked and formatted consistently",
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": step,
                        "active_form": f"Working on: {step}",
                        "status": "pending",
                    }
                    for step in pending_steps
                ],
            )

            # The fake executor reports a successful write of the index file.
            write_index = ToolCall(
                id="write-index",
                name="write",
                arguments={"file_path": str(site_index), "content": "<html></html>\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=write_index,
                output=f"Successfully wrote {site_index}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_index],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_notes
            handoff = persistent_notes[-1]
            required_fragments = (
                "Next step: create `01-getting-started.html`.",
                f"Prefer one `write(file_path=..., content=...)` call for `{first_chapter.resolve(strict=False)}` now.",
                "Do not reread reference material or spend the next turn on bookkeeping.",
            )
            for fragment in required_fragments:
                assert fragment in handoff
            assert "refresh `TodoWrite`" not in handoff
      
        2808
        
        2809
        
        2810
        @pytest.mark.asyncio
        async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
            temp_dir: Path,
        ) -> None:
            """Check a seven-chapter plan is not reported complete after chapter five.

            The index and the first five chapters exist on disk when the write of the
            fifth chapter is reported. The ephemeral steering messages must point at
            ``06-performance-tuning.html`` and must not state that all planned
            artifacts exist.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # This hook is expected to stay untouched for the scenario under test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # This hook is expected to stay untouched for the scenario under test.
                raise AssertionError("Verification should not run in this scenario")

            nginx_dir = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapters_dir.mkdir()
            site_index = nginx_dir / "index.html"
            site_index.write_text("<html></html>\n")

            chapter_names = (
                "01-getting-started.html",
                "02-installation.html",
                "03-first-website.html",
                "04-configuration-basics.html",
                "05-advanced-configurations.html",
                "06-performance-tuning.html",
                "07-security-best-practices.html",
            )
            chapter_files = [chapters_dir / name for name in chapter_names]

            # Chapters 1-4 get a stem-based heading, chapter 5 gets its final content;
            # chapters 6 and 7 are left missing.
            *early_chapters, fifth_chapter = chapter_files[:5]
            for early in early_chapters:
                early.write_text(f"<h1>{early.stem}</h1>\n")
            fifth_chapter.write_text("<h1>Advanced configurations</h1>\n")

            planned_entries = (
                f"{nginx_dir}/",
                f"{chapters_dir}/",
                site_index,
                *chapter_files,
            )
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in planned_entries),
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_notes: list[str] = []
            ephemeral_notes: list[str] = []
            context.queue_steering_message_callback = persistent_notes.append
            context.queue_ephemeral_steering_message_callback = ephemeral_notes.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(plan_file)
            todo_items = [
                {
                    "content": "Create the nginx guide artifacts",
                    "active_form": "Creating nginx guide artifacts",
                    "status": "pending",
                },
                {
                    "content": "Verify all guide files are linked and complete",
                    "active_form": "Verifying guide linkage and completeness",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todo_items)

            # The fake executor reports a successful write of the fifth chapter.
            write_fifth = ToolCall(
                id="write-chapter-05",
                name="write",
                arguments={
                    "file_path": str(fifth_chapter),
                    "content": "<h1>Advanced configurations</h1>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=write_fifth,
                output=f"Successfully wrote {fifth_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_fifth],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            resume_hint = "Resume by creating `06-performance-tuning.html` now."
            completion_claim = "All explicitly planned artifacts now exist."
            assert any(resume_hint in note for note in ephemeral_notes)
            assert all(completion_claim not in note for note in ephemeral_notes)
      
        2936
        
        2937
        
        2938
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
            temp_dir: Path,
        ) -> None:
            """Check the short resume nudge is queued when one planned chapter is left.

            The index and four of the five planned chapters exist on disk and are
            recorded in ``dod.touched_files``; the reported tool call rewrites chapter
            four. The last ephemeral steering message must tell the agent to resume
            with ``05-advanced-features.html`` and must not ask for a ``TodoWrite``
            refresh.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # This hook is expected to stay untouched for the scenario under test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # This hook is expected to stay untouched for the scenario under test.
                raise AssertionError("Verification should not run in this scenario")

            nginx_dir = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapters_dir.mkdir()
            site_index = nginx_dir / "index.html"
            chapter_names = (
                "01-introduction.html",
                "02-installation.html",
                "03-configuration.html",
                "04-basic-usage.html",
                "05-advanced-features.html",
            )
            chapter_files = [chapters_dir / name for name in chapter_names]

            # Everything except the fifth chapter already exists on disk.
            files_on_disk = [site_index, *chapter_files[:4]]
            for existing in files_on_disk:
                existing.write_text("<html></html>\n")

            planned_entries = (
                f"{nginx_dir}/",
                f"{chapters_dir}/",
                site_index,
                *chapter_files,
            )
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in planned_entries),
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_notes: list[str] = []
            ephemeral_notes: list[str] = []
            context.queue_steering_message_callback = persistent_notes.append
            context.queue_ephemeral_steering_message_callback = ephemeral_notes.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            # Seed the definition of done with prior progress before syncing todos.
            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.touched_files.extend([str(existing) for existing in files_on_disk])
            already_done = [
                "Create the nginx directory structure",
                "Create the main index.html file with proper structure",
            ]
            dod.completed_items.extend(already_done)
            remaining_todo = {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Creating each chapter file with appropriate content",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(dod, [remaining_todo])

            # The fake executor reports a successful rewrite of the fourth chapter.
            fourth_chapter = chapter_files[3]
            rewrite_fourth = ToolCall(
                id="write-chapter-04",
                name="write",
                arguments={
                    "file_path": str(fourth_chapter),
                    "content": "<html>updated</html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=rewrite_fourth,
                output=f"Successfully wrote {fourth_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[rewrite_fourth],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert ephemeral_notes
            nudge = ephemeral_notes[-1]
            required_fragments = (
                "Resume by creating `05-advanced-features.html` now.",
                "No TodoWrite, no verification, no rereads until that artifact exists.",
            )
            for fragment in required_fragments:
                assert fragment in nudge
            assert "refresh `TodoWrite`" not in nudge
      
        3058
        
        3059
        
        3060
        @pytest.mark.asyncio
      
        3061
        async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
      
        3062
            temp_dir: Path,
      
        3063
        ) -> None:
      
        3064
            async def assess_confidence(
      
        3065
                tool_name: str,
      
        3066
                tool_args: dict,
      
        3067
                context: str,
      
        3068
            ) -> ConfidenceAssessment:
      
        3069
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        3070
        
        3071
            async def verify_action(
      
        3072
                tool_name: str,
      
        3073
                tool_args: dict,
      
        3074
                result: str,
      
        3075
                expected: str = "",
      
        3076
            ) -> ActionVerification:
      
        3077
                raise AssertionError("Verification should not run in this scenario")
      
        3078
        
        3079
            guide_root = temp_dir / "guides" / "nginx"
      
        3080
            chapters = guide_root / "chapters"
      
        3081
            guide_root.mkdir(parents=True)
      
        3082
            chapters.mkdir()
      
        3083
            index_path = guide_root / "index.html"
      
        3084
            index_path.write_text("<html></html>\n")
      
        3085
            chapter_one = chapters / "01-getting-started.html"
      
        3086
            chapter_two = chapters / "02-installation.html"
      
        3087
            chapter_one.write_text("<h1>One</h1>\n")
      
        3088
        
        3089
            implementation_plan = temp_dir / "implementation.md"
      
        3090
            implementation_plan.write_text(
      
        3091
                "\n".join(
      
        3092
                    [
      
        3093
                        "# Implementation Plan",
      
        3094
                        "",
      
        3095
                        "## File Changes",
      
        3096
                        f"- `{guide_root}/`",
      
        3097
                        f"- `{chapters}/`",
      
        3098
                        f"- `{index_path}`",
      
        3099
                        f"- `{chapter_one}`",
      
        3100
                        f"- `{chapter_two}`",
      
        3101
                        "",
      
        3102
                    ]
      
        3103
                )
      
        3104
            )
      
        3105
        
        3106
            context = build_context(
      
        3107
                temp_dir=temp_dir,
      
        3108
                messages=[],
      
        3109
                safeguards=FakeSafeguards(),
      
        3110
                assess_confidence=assess_confidence,
      
        3111
                verify_action=verify_action,
      
        3112
                auto_recover=False,
      
        3113
            )
      
        3114
            persistent_messages: list[str] = []
      
        3115
            ephemeral_messages: list[str] = []
      
        3116
            context.queue_steering_message_callback = persistent_messages.append
      
        3117
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        3118
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        3119
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        3120
            dod.implementation_plan = str(implementation_plan)
      
        3121
            sync_todos_to_definition_of_done(
      
        3122
                dod,
      
        3123
                [
      
        3124
                    {
      
        3125
                        "content": "Create 01-getting-started.html",
      
        3126
                        "active_form": "Creating 01-getting-started.html",
      
        3127
                        "status": "completed",
      
        3128
                    },
      
        3129
                    {
      
        3130
                        "content": "Create 02-installation.html",
      
        3131
                        "active_form": "Creating 02-installation.html",
      
        3132
                        "status": "pending",
      
        3133
                    },
      
        3134
                ],
      
        3135
            )
      
        3136
            dod.touched_files.extend([str(index_path), str(chapter_one)])
      
        3137
        
        3138
            tool_call = ToolCall(
      
        3139
                id="todo-only",
      
        3140
                name="TodoWrite",
      
        3141
                arguments={
      
        3142
                    "todos": [
      
        3143
                        {
      
        3144
                            "content": "Create 01-getting-started.html",
      
        3145
                            "active_form": "Creating 01-getting-started.html",
      
        3146
                            "status": "completed",
      
        3147
                        },
      
        3148
                        {
      
        3149
                            "content": "Create 02-installation.html",
      
        3150
                            "active_form": "Creating 02-installation.html",
      
        3151
                            "status": "pending",
      
        3152
                        },
      
        3153
                    ]
      
        3154
                },
      
        3155
            )
      
        3156
            executor = FakeExecutor(
      
        3157
                [
      
        3158
                    tool_outcome(
      
        3159
                        tool_call=tool_call,
      
        3160
                        output="Todos updated",
      
        3161
                        is_error=False,
      
        3162
                        metadata={
      
        3163
                            "new_todos": [
      
        3164
                                {
      
        3165
                                    "content": "Create 01-getting-started.html",
      
        3166
                                    "active_form": "Creating 01-getting-started.html",
      
        3167
                                    "status": "completed",
      
        3168
                                },
      
        3169
                                {
      
        3170
                                    "content": "Create 02-installation.html",
      
        3171
                                    "active_form": "Creating 02-installation.html",
      
        3172
                                    "status": "pending",
      
        3173
                                },
      
        3174
                            ]
      
        3175
                        },
      
        3176
                    )
      
        3177
                ]
      
        3178
            )
      
        3179
        
        3180
            summary = TurnSummary(final_response="")
      
        3181
            await runner.execute_batch(
      
        3182
                tool_calls=[tool_call],
      
        3183
                tool_source="assistant",
      
        3184
                pending_tool_calls_seen=set(),
      
        3185
                emit=_noop_emit,
      
        3186
                summary=summary,
      
        3187
                dod=dod,
      
        3188
                executor=executor,  # type: ignore[arg-type]
      
        3189
                on_confirmation=None,
      
        3190
                on_user_question=None,
      
        3191
                emit_confirmation=None,
      
        3192
                consecutive_errors=0,
      
        3193
            )
      
        3194
        
        3195
            assert persistent_messages
      
        3196
            message = persistent_messages[-1]
      
        3197
            assert "Todo tracking is updated. A declared output artifact is still missing." in message
      
        3198
            assert "Resume by creating `02-installation.html` now." in message
      
        3199
            assert "refresh `TodoWrite`" in message
      
        3200
            assert "Do not spend the next turn on TodoWrite alone" in message
      
        3201
            assert ephemeral_messages == []
      
        3202
        
        3203
        
        3204
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite-only batch run after every planned file exists queues a verification handoff.

            The implementation plan lists the guide root, the chapters directory, the
            index page and two chapters, all of which are created on disk up front.
            After the batch runs, the last queued steering message must point at the
            verification todo and must not mention the Fortran reference-reading todo.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            def fresh_todos() -> list[dict[str, str]]:
                # Each caller receives its own list and dicts, mirroring separate literals.
                return [
                    {
                        "content": content,
                        "active_form": f"Working on: {content}",
                        "status": "pending",
                    }
                    for content in (
                        "First, examine the existing Fortran guide structure to understand the format and content organization",
                        "Verify all guide files are linked and complete",
                    )
                ]

            # Lay out the complete guide tree so that no planned artifact is missing.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, markup in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(markup)

            planned_targets = (
                f"{guide_root}/",
                f"{chapters}/",
                index_path,
                chapter_one,
                chapter_two,
            )
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{target}`" for target in planned_targets),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

            tool_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": fresh_todos()},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": fresh_todos()},
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            for expected_fragment in (
                "Todo tracking is updated. All explicitly planned artifacts now exist.",
                "Verify all guide files are linked and complete",
                "Move to verification once no specific mismatch remains.",
                "reopen reference materials",
            ):
                assert expected_fragment in message
            assert "Fortran guide structure" not in message
      
        3346
        
        3347
        
        3348
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite-only batch with a chapter still missing queues a steer to the next write.

            The output directories and ``index.html`` exist, and the index links to
            ``chapters/01-introduction.html``, which is never created. The last queued
            steering message must name the pending todo and the missing chapter file.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            def fresh_todos() -> list[dict[str, str]]:
                # Each caller receives its own list and dicts, mirroring separate literals.
                rows = (
                    (
                        "Examine the existing Fortran guide structure",
                        "Examining the existing Fortran guide structure",
                        "completed",
                    ),
                    (
                        "Create the nginx directory structure",
                        "Creating the nginx directory structure",
                        "completed",
                    ),
                    (
                        "Write the introduction chapter",
                        "Writing the introduction chapter",
                        "pending",
                    ),
                )
                return [
                    {"content": content, "active_form": active_form, "status": status}
                    for content, active_form, status in rows
                ]

            # Output roots and the index page exist; the linked chapter does not.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_lines = [
                "<!DOCTYPE html>",
                "<html>",
                "<body>",
                '<a href="chapters/01-introduction.html">Introduction</a>',
                "</body>",
                "</html>",
                "",
            ]
            index_path.write_text("\n".join(index_lines))

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{target}`" for target in (f"{guide_root}/", f"{chapters}/", index_path)),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.append(str(index_path))
            sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

            tool_call = ToolCall(
                id="todo-next-mutation",
                name="TodoWrite",
                arguments={"todos": fresh_todos()},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": fresh_todos()},
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            for expected_fragment in (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Continue with the next pending item: `Write the introduction chapter`.",
                "Resume by creating `01-introduction.html` now.",
                "Prefer one `write` call for `",
                "01-introduction.html` instead of more rereads.",
                "Do not spend the next turn on TodoWrite alone",
            ):
                assert expected_fragment in message
      
        3512
        
        3513
        
        3514
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite-only batch steers toward the pending ``index.html``, not a chapter file.

            Only the (empty) chapters directory exists; the plan lists that directory
            and ``index.html``, which is never written. The last queued steering
            message must name the index todo and the resolved index path, and must not
            mention ``01-introduction.html``.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            def fresh_todos() -> list[dict[str, str]]:
                # Each call yields new list/dict objects with identical contents.
                rows = (
                    (
                        "Examine the existing Fortran guide structure to understand the format and depth",
                        "Examining the existing Fortran guide structure",
                        "completed",
                    ),
                    (
                        "Create the new nginx guide directory structure",
                        "Creating the new nginx guide directory structure",
                        "completed",
                    ),
                    (
                        "Create a new index.html for the nginx guide",
                        "Creating a new index.html for the nginx guide",
                        "pending",
                    ),
                    (
                        "Create the first chapter for the nginx guide",
                        "Creating the first chapter for the nginx guide",
                        "pending",
                    ),
                )
                return [
                    {"content": content, "active_form": active_form, "status": status}
                    for content, active_form, status in rows
                ]

            # Only the chapters directory is created; index.html stays absent.
            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

            queued_messages: list[str] = []
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
                auto_recover=False,
            )
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            # The tool call arguments and the outcome metadata share one list object.
            todos = fresh_todos()
            tool_call = ToolCall(
                id="todo-index-before-chapter",
                name="TodoWrite",
                arguments={"todos": todos},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todos},
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            for expected_fragment in (
                "Continue with the next pending item: `Create a new index.html for the nginx guide`.",
                "Resume by creating `index.html` now.",
                f"Prefer one `write` call for `{index_path.resolve(strict=False)}`",
            ):
                assert expected_fragment in message
            assert "01-introduction.html" not in message
      
        3651
        
        3652
        
        3653
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
            temp_dir: Path,
        ) -> None:
            """Check the steering message queued after a TodoWrite-only batch.

            The fixture writes an ``index.html`` that links to two chapter files,
            neither of which is created on disk, and an implementation plan that
            lists the guide root, the chapters directory and the index. After the
            batch runs, the last queued steering message must point at
            ``introduction.html`` as the file to create next.
            """

            # Both hooks fail loudly: this scenario must never reach them.
            async def refuse_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def refuse_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: guide root, empty chapters directory, and an index
            # whose links reference chapter files that do not exist.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text(
                "<html>\n"
                '<a href="chapters/introduction.html">Introduction</a>\n'
                '<a href="chapters/installation.html">Installation</a>\n'
                "</html>\n"
            )

            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
            )

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.pending_items = [
                "Write the introduction chapter",
                "Complete the requested work",
            ]
            dod.touched_files.append(str(index_path))

            steering_log: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=refuse_confidence,
                verify_action=refuse_verification,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The tool-call arguments use "activeForm" while the outcome metadata
            # uses "active_form"; the two payloads are kept as separate literals.
            requested_todo = {
                "content": "Write the introduction chapter",
                "activeForm": "Writing the introduction chapter",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            }
            todo_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            todo_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([todo_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            expected_fragments = (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Continue with the next pending item: `Write the introduction chapter`.",
                "Resume by creating `introduction.html` now.",
                "Prefer one `write` call for `",
                "introduction.html` instead of more rereads.",
                "Do not spend the next turn on TodoWrite alone",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        3779
        
        3780
        
        3781
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """Check the steering message when one linked chapter already exists.

            The fixture writes ``index.html`` linking to two numbered chapters and
            creates only ``01-introduction.html``. After a TodoWrite-only batch for
            the "Chapter 2" item, the last queued steering message must name
            ``02-installation.html`` and its resolved path as the next write target.
            """

            # Both hooks fail loudly: this scenario must never reach them.
            async def refuse_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def refuse_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            index_path.write_text(
                "<html>\n"
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>\n'
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>\n'
                "</html>\n"
            )
            # Only the first linked chapter is written; the second stays absent.
            chapter_one.write_text("<html></html>\n")

            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
            )

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.pending_items = [
                "Creating Chapter 2: Installation and Setup",
                "Complete the requested work",
            ]
            dod.touched_files.extend([str(index_path), str(chapter_one)])

            steering_log: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=refuse_confidence,
                verify_action=refuse_verification,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The tool-call arguments use "activeForm" while the outcome metadata
            # uses "active_form"; the two payloads are kept as separate literals.
            requested_todo = {
                "content": "Creating Chapter 2: Installation and Setup",
                "activeForm": "Creating Chapter 2: Installation and Setup",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Creating Chapter 2: Installation and Setup",
                "active_form": "Creating Chapter 2: Installation and Setup",
                "status": "pending",
            }
            todo_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            todo_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([todo_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            next_chapter = (chapters / "02-installation.html").resolve(strict=False)
            expected_fragments = (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Continue with the next pending item: `Creating Chapter 2: Installation and Setup`.",
                "Resume by creating `02-installation.html` now.",
                f"Prefer one `write` call for `{next_chapter}` instead of more rereads.",
                "Make your next response the concrete mutation tool call itself",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        3912
        
        3913
        
        3914
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
            temp_dir: Path,
        ) -> None:
            """Check that the steering message borrows a filename from a previously read file.

            The fixture creates ``fortran/chapters/01-introduction.html`` and seeds
            the conversation with an assistant ``read`` tool call for it. The nginx
            guide has an index with no links and an empty chapters directory. After
            a TodoWrite-only batch, the last queued steering message must propose
            ``01-introduction.html`` and mention the observed ``chapters/`` pattern.
            """

            # Both hooks fail loudly: this scenario must never reach them.
            async def refuse_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def refuse_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # Reference guide containing the already-inspected sibling file.
            reference_chapters = temp_dir / "fortran" / "chapters"
            reference_chapters.mkdir(parents=True)
            reference_intro = reference_chapters / "01-introduction.html"
            reference_intro.write_text("<h1>Introduction</h1>\n")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")

            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
            )

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.pending_items = [
                "Write the introduction chapter",
                "Complete the requested work",
            ]
            dod.touched_files.append(str(index_path))

            steering_log: list[str] = []
            # Conversation history holding the earlier read of the reference chapter.
            prior_read = Message(
                role=Role.ASSISTANT,
                content="",
                tool_calls=[
                    ToolCall(
                        id="read-ref-1",
                        name="read",
                        arguments={"file_path": str(reference_intro)},
                    )
                ],
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[prior_read],
                safeguards=FakeSafeguards(),
                assess_confidence=refuse_confidence,
                verify_action=refuse_verification,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The tool-call arguments use "activeForm" while the outcome metadata
            # uses "active_form"; the two payloads are kept as separate literals.
            requested_todo = {
                "content": "Write the introduction chapter",
                "activeForm": "Writing the introduction chapter",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            }
            todo_call = ToolCall(
                id="todo-observed-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            todo_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([todo_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            expected_fragments = (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Continue with the next pending item: `Write the introduction chapter`.",
                "Resume by creating `01-introduction.html` now.",
                "It mirrors the observed filename pattern from another `chapters/` directory "
                "you already inspected.",
                "01-introduction.html` instead of more rereads.",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        4049
        
        4050
        
        4051
        @pytest.mark.asyncio
        async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
            temp_dir: Path,
        ) -> None:
            """Check the steering message queued after a working-note-only batch.

            The implementation plan declares two chapter files, but only the first
            is written to disk. The todos synced into the definition of done mark
            the first chapter completed and the second pending. After a
            ``notepad_write_working`` batch, the last queued steering message must
            direct the next turn to create ``02-installation.html``.
            """

            # Both hooks fail loudly: this scenario must never reach them.
            async def refuse_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def refuse_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            index_path.write_text("<html></html>\n")
            # chapter_two is declared in the plan below but never written.
            chapter_one.write_text("<h1>One</h1>\n")

            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
                f"- `{chapter_one}`\n"
                f"- `{chapter_two}`\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=refuse_confidence,
                verify_action=refuse_verification,
                auto_recover=False,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            finished_todo = {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            }
            outstanding_todo = {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(
                dod,
                [finished_todo, outstanding_todo],
                project_root=temp_dir,
            )
            dod.touched_files.extend([str(index_path), str(chapter_one)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Creating the second chapter file: Installation"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([note_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            expected_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
                "Make your next response the concrete mutation tool call itself",
                "refresh `TodoWrite`",
                "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        4165
        
        4166
        
        4167
        @pytest.mark.asyncio
      
        4168
        async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
      
        4169
            temp_dir: Path,
      
        4170
        ) -> None:
      
        4171
            async def assess_confidence(
      
        4172
                tool_name: str,
      
        4173
                tool_args: dict,
      
        4174
                context: str,
      
        4175
            ) -> ConfidenceAssessment:
      
        4176
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        4177
        
        4178
            async def verify_action(
      
        4179
                tool_name: str,
      
        4180
                tool_args: dict,
      
        4181
                result: str,
      
        4182
                expected: str = "",
      
        4183
            ) -> ActionVerification:
      
        4184
                raise AssertionError("Verification should not run in this scenario")
      
        4185
        
        4186
            implementation_plan = temp_dir / "implementation.md"
      
        4187
            implementation_plan.write_text(
      
        4188
                "\n".join(
      
        4189
                    [
      
        4190
                        "# Implementation Plan",
      
        4191
                        "",
      
        4192
                        "## File Changes",
      
        4193
                        f"- `{temp_dir / 'guides' / 'nginx' / 'index.html'}`",
      
        4194
                        f"- `{temp_dir / 'guides' / 'nginx' / 'chapters'}`",
      
        4195
                        "",
      
        4196
                    ]
      
        4197
                )
      
        4198
            )
      
        4199
        
        4200
            context = build_context(
      
        4201
                temp_dir=temp_dir,
      
        4202
                messages=[],
      
        4203
                safeguards=FakeSafeguards(),
      
        4204
                assess_confidence=assess_confidence,
      
        4205
                verify_action=verify_action,
      
        4206
                auto_recover=False,
      
        4207
            )
      
        4208
            queued_messages: list[str] = []
      
        4209
            context.queue_steering_message_callback = queued_messages.append
      
        4210
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        4211
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        4212
            dod.implementation_plan = str(implementation_plan)
      
        4213
            dod.pending_items.extend(
      
        4214
                [
      
        4215
                    "First, examine the existing fortran guide structure and content to understand the format",
      
        4216
                    "Create the nginx directory structure",
      
        4217
                    "Develop the main index.html file for the nginx guide",
      
        4218
                ]
      
        4219
            )
      
        4220
        
        4221
            tool_call = ToolCall(
      
        4222
                id="working-note",
      
        4223
                name="notepad_write_working",
      
        4224
                arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
      
        4225
            )
      
        4226
            executor = FakeExecutor(
      
        4227
                [
      
        4228
                    tool_outcome(
      
        4229
                        tool_call=tool_call,
      
        4230
                        output="Working note recorded",
      
        4231
                        is_error=False,
      
        4232
                    )
      
        4233
                ]
      
        4234
            )
      
        4235
        
        4236
            summary = TurnSummary(final_response="")
      
        4237
            await runner.execute_batch(
      
        4238
                tool_calls=[tool_call],
      
        4239
                tool_source="assistant",
      
        4240
                pending_tool_calls_seen=set(),
      
        4241
                emit=_noop_emit,
      
        4242
                summary=summary,
      
        4243
                dod=dod,
      
        4244
                executor=executor,  # type: ignore[arg-type]
      
        4245
                on_confirmation=None,
      
        4246
                on_user_question=None,
      
        4247
                emit_confirmation=None,
      
        4248
                consecutive_errors=0,
      
        4249
            )
      
        4250
        
        4251
            assert queued_messages
      
        4252
            message = queued_messages[-1]
      
        4253
            assert (
      
        4254
                "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
      
        4255
                in message
      
        4256
            )
      
        4257
            assert "one concrete evidence-gathering tool call" in message
      
        4258
            assert "Resume by creating `index.html` now." not in message
      
        4259
        
        4260
        
        4261
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
            temp_dir: Path,
        ) -> None:
            """A working note must steer toward the missing declared chapter file.

            The index links to three chapters but only the first exists on disk, and
            the first pending item is still the discovery step. After the
            bookkeeping note is recorded, the queued steering message is expected to
            point at ``02-installation.html`` rather than at the discovery item.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk guide: an index linking three chapters, only one of which exists.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            chapter_dir.mkdir(parents=True)
            index_html = nginx_dir / "index.html"
            intro_chapter = chapter_dir / "01-introduction.html"
            toc_links = [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-configuration.html">Configuration</a>',
            ]
            index_html.write_text("\n".join(toc_links))
            intro_chapter.write_text("<h1>Introduction</h1>\n")

            # Implementation plan declaring the index and the chapters directory.
            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_html}`",
                f"- `{chapter_dir}/`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering: list[str] = []
            context.queue_steering_message_callback = steering.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            stale_items = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create chapter files following the established pattern",
            ]
            dod.pending_items.extend(stale_items)
            dod.touched_files.extend([str(index_html), str(intro_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Created index and first chapter; next is chapter 2"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([note_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            latest = steering[-1]
            assert "Bookkeeping note is recorded. A declared output artifact is still missing." in latest
            assert "Resume by creating `02-installation.html` now." in latest
            assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in latest
      
        4366
        
        4367
        
        4368
        @pytest.mark.asyncio
        async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
            temp_dir: Path,
        ) -> None:
            """A glob that only lists directories must not queue a steering message.

            The reference fortran guide exists only as empty directories, the plan
            declares nginx outputs that do not exist yet, and the single tool call is
            a ``glob`` whose output names just the two directories.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run in this scenario")

            guides_dir = temp_dir / "Loader" / "guides"
            fortran_root = guides_dir / "fortran"
            fortran_chapters = fortran_root / "chapters"
            fortran_chapters.mkdir(parents=True)

            # Declared nginx outputs; none of them are created by this test.
            nginx_root = guides_dir / "nginx"
            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{nginx_root / 'chapters'}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering: list[str] = []
            context.queue_steering_message_callback = steering.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.pending_items.extend(
                [
                    "First, examine the existing fortran guide structure and content",
                    "Create the nginx directory structure",
                    "Develop the main index.html file for nginx guide",
                ]
            )

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"pattern": "**", "path": str(fortran_root)},
            )
            # The fake glob output lists only the two directories.
            listing = "\n".join([str(fortran_root), str(fortran_chapters)])
            executor = FakeExecutor(
                [tool_outcome(tool_call=glob_call, output=listing, is_error=False)]
            )

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering == []
      
        4457
        
        4458
        
        4459
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
            temp_dir: Path,
        ) -> None:
            """A blocked identical-string edit on an already-correct index queues nothing.

            The index's table of contents already links to the two chapter files
            that exist on disk. The edit call passes the same text as both
            ``old_string`` and ``new_string``, the fake executor reports it as
            blocked, and no steering message is expected.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run in this scenario")

            prompt = (
                "Have a look at ~/Loader/guides/fortran/index.html, then "
                "~/Loader/guides/fortran/chapters. The table of contents links in "
                "index.html are inaccurate and the href’s are wrong. Let’s update the "
                "links and their link texts to be correct."
            )

            # Chapter files whose headings match the link texts in the index below.
            chapters = temp_dir / "chapters"
            chapters.mkdir()
            chapter_headings = {
                "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
                "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
            }
            for filename, heading in chapter_headings.items():
                (chapters / filename).write_text(heading)

            current_block = (
                "<h2>Table of Contents</h2>\n"
                '        <ul class="chapter-list">\n'
                '            <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '            <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                "        </ul>\n"
            )
            index_path = temp_dir / "index.html"
            index_path.write_text(current_block)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = prompt  # type: ignore[attr-defined]
            steering: list[str] = []
            context.queue_steering_message_callback = steering.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            # old_string and new_string are deliberately identical.
            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_path),
                    "old_string": current_block,
                    "new_string": current_block,
                },
            )
            blocked_output = (
                "[Blocked - old_string and new_string are identical - no change "
                "would occur] Suggestion: Provide different old and new strings"
            )
            executor = FakeExecutor(
                [
                    tool_outcome(
                        tool_call=edit_call,
                        output=blocked_output,
                        is_error=True,
                        state=ToolExecutionState.BLOCKED,
                    )
                ]
            )

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done(prompt)
            await runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering == []
      
        4552
        
        4553
        
        4554
        def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
            temp_dir: Path,
        ) -> None:
            """The blocked no-op edit nudge must keep pointing at the repair target.

            The conversation holds an assistant "Repair focus" message naming the
            file to edit. ``_queue_blocked_html_edit_nudge`` is called directly with
            an identical-string edit on that file, and the first queued message is
            checked for the target path and the expected guidance phrases.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
            # Sibling chapter that the broken reference points to.
            missing_chapter = repair_target.parent / "05-advanced-topics.html"
            repair_focus = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
                ),
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[repair_focus],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(repair_target),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
            runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output)

            assert queued
            nudge = queued[0]
            assert str(repair_target) in nudge
            assert "no on-disk change" in nudge
            assert "replace the surrounding block" in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        4612
        
        4613
        
        4614
        async def _noop_emit(event: AgentEvent) -> None:
            """Accept an event and discard it."""
            return
      
        4616
        
        4617
        
        4618
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``write`` must leave the definition of done in the planned state.

            After the batch runs, the definition of done is expected to record a
            planned verification (attempt 1), and the last workflow timeline entry
            is expected to carry the matching reason code, policy outcome and
            observation.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(readme_path), "content": "updated\n"},
            )
            write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
            executor = FakeExecutor([write_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Update README and verify it still works.")
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                # Record every emitted event.
                events.append(event)

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done.
            assert dod.last_verification_result == "planned"
            assert dod.verification_commands
            assert "Collect verification evidence" in dod.pending_items
            assert dod.active_verification_attempt_id == "verification-attempt-1"
            assert dod.active_verification_attempt_number == 1

            # Last timeline entry and its first verification observation.
            last_entry = summary.workflow_timeline[-1]
            assert last_entry.reason_code == "verification_planned"
            assert last_entry.policy_outcome == "planned"
            first_observation = last_entry.verification_observations[0]
            assert first_observation.status == "planned"
            assert first_observation.attempt_id == "verification-attempt-1"
            assert first_observation.attempt_number == 1
      
        4689
        
        4690
        
        4691
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
            temp_dir: Path,
        ) -> None:
            """A ``mkdir -p`` bash call alone must not plan verification.

            The plan declares a chapters directory and an index file. The only tool
            call creates the chapters directory, after which no verification result,
            pending evidence item, or ``verification_planned`` timeline entry is
            expected.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            mkdir_call = ToolCall(
                id="mkdir-1",
                name="bash",
                arguments={"command": f"mkdir -p {chapters}"},
            )
            mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
            executor = FakeExecutor([mkdir_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
            dod.implementation_plan = str(plan_path)
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                # Record every emitted event.
                events.append(event)

            await runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert dod.last_verification_result is None
            assert "Collect verification evidence" not in dod.pending_items
            assert all(
                entry.reason_code != "verification_planned" for entry in summary.workflow_timeline
            )
      
        4769
        
        4770
        
        4771
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """Check that a successful write invalidates a previously passed verification.

            The definition of done starts with a passed attempt 1 and one piece of
            evidence. After a batch holding a single successful ``write`` call, the
            test expects the result to read "stale", the evidence list to be empty, the
            evidence-collection item to be pending again, attempt 2 to be active, and
            the newest timeline entry to describe attempt 1 as superseded by attempt 2.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is ever invoked."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                """Fail the test if action verification is ever invoked."""
                raise AssertionError("Verification should not run for this scenario")

            verify_command = "uv run pytest -q"
            evidence_item = "Collect verification evidence"
            first_attempt_id = "verification-attempt-1"
            second_attempt_id = "verification-attempt-2"

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(readme_path), "content": "updated\n"},
            )
            write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
            fake_executor = FakeExecutor([write_outcome])
            turn_summary = TurnSummary(final_response="")

            # Seed a definition of done that has already passed verification once.
            done = create_definition_of_done("Update README and verify it still works.")
            done.verification_commands = [verify_command]
            done.last_verification_result = "passed"
            done.verification_attempt_counter = 1
            done.active_verification_attempt_id = first_attempt_id
            done.active_verification_attempt_number = 1
            passed_evidence = VerificationEvidence(
                command=verify_command,
                passed=True,
                stdout="401 passed",
                kind="test",
            )
            done.evidence = [passed_evidence]
            done.completed_items.append(evidence_item)

            captured_events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                """Record every emitted event."""
                captured_events.append(event)

            await batch_runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=turn_summary,
                dod=done,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State of the definition of done after the mutation.
            assert done.last_verification_result == "stale"
            assert done.evidence == []
            assert evidence_item in done.pending_items
            assert evidence_item not in done.completed_items
            assert done.active_verification_attempt_id == second_attempt_id
            assert done.active_verification_attempt_number == 2

            # The newest timeline entry must describe the superseded attempt.
            latest_entry = turn_summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_stale"
            assert latest_entry.policy_outcome == "stale"

            stale_observation = latest_entry.verification_observations[0]
            assert stale_observation.status == "stale"
            assert stale_observation.attempt_id == first_attempt_id
            assert stale_observation.attempt_number == 1
            assert stale_observation.supersedes_attempt_id == second_attempt_id
            assert stale_observation.command == verify_command
      
        4865
        
        4866
        
        4867
        def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
            """Check that the blocked active-repair nudge names the repair-focus paths.

            The context carries one assistant message with a "Repair focus" brief. The
            queued steering message is expected to mention the repair target, the
            missing chapter path, and the instruction not to reopen unrelated
            reference materials.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is ever invoked."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                """Fail the test if action verification is ever invoked."""
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "index.html"
            missing_chapter = temp_dir / "guide" / "chapters" / "01-getting-started.html"

            # Every brief line ends with a newline, including the last one.
            brief_lines = [
                "Repair focus:",
                f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.",
                f"- Immediate next step: edit `{repair_target}`.",
                f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.",
            ]
            repair_brief = "".join(f"{line}\n" for line in brief_lines)
            repair_message = Message(role=Role.ASSISTANT, content=repair_brief)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[repair_message],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            batch_runner._queue_blocked_active_repair_nudge(
                "[Blocked - active repair scope: verification already identified the repair target.]"
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert str(missing_chapter) in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        4913
        
        4914
        
        4915
        def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
            temp_dir: Path,
        ) -> None:
            """Check that the blocked repair-mutation nudge lists the allowed paths.

            The context carries one assistant message with a "Repair focus" brief about
            a broken ``../styles.css`` reference. The queued steering message is
            expected to mention both the chapter file and the stylesheet, plus the
            phrase "before widening the change set".
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is ever invoked."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                """Fail the test if action verification is ever invoked."""
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            repair_target = guide_dir / "chapters" / "05-advanced-configurations.html"
            stylesheet = guide_dir / "styles.css"

            # Every brief line ends with a newline, including the last one.
            brief_lines = [
                "Repair focus:",
                f"- Fix the broken local reference `../styles.css` in `{repair_target}`.",
                f"- Immediate next step: edit `{repair_target}`.",
                f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.",
            ]
            repair_brief = "".join(f"{line}\n" for line in brief_lines)
            repair_message = Message(role=Role.ASSISTANT, content=repair_brief)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[repair_message],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            batch_runner._queue_blocked_active_repair_mutation_nudge(
                "[Blocked - active repair mutation scope: verification already identified the repair target.]"
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert str(stylesheet) in nudge
            assert "before widening the change set" in nudge
      
        4964
        
        4965
        
        4966
        def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check that the late-reference-drift nudge names the one missing artifact.

            The implementation plan lists four guide files, of which three are created
            on disk. The queued steering message is expected to mention the absent
            ``03-first-website.html`` and the phrase "older reference materials".
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is ever invoked."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                """Fail the test if action verification is ever invoked."""
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            done = create_definition_of_done("Create a multi-file guide from a reference")

            # The plan declares four artifacts, written as relative paths.
            planned_artifacts = (
                "guide/index.html",
                "guide/chapters/01-getting-started.html",
                "guide/chapters/02-installation.html",
                "guide/chapters/03-first-website.html",
            )
            plan_text = "# File Changes\n" + "".join(
                f"- `{artifact}`\n" for artifact in planned_artifacts
            )
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(plan_text)
            done.implementation_plan = str(plan_file)

            # Only the first three planned artifacts are created on disk.
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True, exist_ok=True)
            existing_files = (
                (guide_dir / "index.html", "index"),
                (chapters_dir / "01-getting-started.html", "one"),
                (chapters_dir / "02-installation.html", "two"),
            )
            for file_path, body in existing_files:
                file_path.write_text(body)

            batch_runner = ToolBatchRunner(runtime_context, dod_store)

            batch_runner._queue_blocked_late_reference_drift_nudge(
                "[Blocked - late reference drift: several planned artifacts already exist.]",
                dod=done,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "03-first-website.html" in nudge
            assert "older reference materials" in nudge
      
        5018
        
        5019
        
        5020
        def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """Check that the completed-artifact-scope nudge points at the pending todo.

            Every path named in the implementation plan is created on disk and one
            pending verification todo is synced into the definition of done. The queued
            steering message is expected to state that all planned artifacts exist,
            quote the pending todo, and say not to reopen earlier reference materials.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is ever invoked."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                """Fail the test if action verification is ever invoked."""
                raise AssertionError("Verification should not run in this scenario")

            # Create every planned artifact up front.
            guide_root = temp_dir / "guide"
            chapters_dir = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = guide_root / "index.html"
            first_chapter = chapters_dir / "01-getting-started.html"
            second_chapter = chapters_dir / "02-installation.html"
            index_file.write_text("index")
            first_chapter.write_text("one")
            second_chapter.write_text("two")

            # The plan lists both directories and all three files as absolute paths.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(
                f"- `{planned}`"
                for planned in (guide_root, chapters_dir, index_file, first_chapter, second_chapter)
            )
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done = create_definition_of_done("Create a multi-file guide from a reference")
            done.implementation_plan = str(plan_file)
            done.verification_commands = [f"ls -la {guide_root}"]

            todo_text = "Verify all guide files are linked and complete"
            pending_todo = {
                "content": todo_text,
                "active_form": f"Working on: {todo_text}",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(done, [pending_todo], project_root=temp_dir)

            batch_runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
                dod=done,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert todo_text in nudge
            assert "Do not reopen earlier reference materials." in nudge
      
        5100
        
        5101
        
        5102
        def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
            temp_dir: Path,
        ) -> None:
            """Check that the HTML declared-target nudge repeats the closest declared target.

            A blocked ``write`` call for a chapter file is passed in with a block
            message that lists a closest declared local target. The queued steering
            message is expected to mention the written file path, the backtick-quoted
            closest target, and the phrase "same file now".
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is ever invoked."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                """Fail the test if action verification is ever invoked."""
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            written_path = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
            blocked_call = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": written_path},
            )
            # Kept as adjacent literals so the block text stays exactly as it was.
            block_text = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
                "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
                "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
            )

            batch_runner._queue_blocked_html_declared_target_nudge(blocked_call, block_text)

            assert steering_messages
            nudge = steering_messages[0]
            assert written_path in nudge
            assert "`chapters/02-installation.html`" in nudge
            assert "same file now" in nudge
      
        5150
        
        5151
        
        5152
        @pytest.mark.asyncio
      
        5153
        async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
      
        5154
            temp_dir: Path,
      
        5155
        ) -> None:
      
        5156
            async def assess_confidence(
      
        5157
                tool_name: str,
      
        5158
                tool_args: dict,
      
        5159
                context: str,
      
        5160
            ) -> ConfidenceAssessment:
      
        5161
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        5162
        
        5163
            async def verify_action(
      
        5164
                tool_name: str,
      
        5165
                tool_args: dict,
      
        5166
                result: str,
      
        5167
                expected: str = "",
      
        5168
            ) -> ActionVerification:
      
        5169
                raise AssertionError("Verification should not run in this scenario")
      
        5170
        
        5171
            guide_root = temp_dir / "guides" / "nginx"
      
        5172
            chapters = guide_root / "chapters"
      
        5173
            chapters.mkdir(parents=True)
      
        5174
            index_path = guide_root / "index.html"
      
        5175
            chapter_one = chapters / "01-introduction.html"
      
        5176
            chapter_two = chapters / "02-installation.html"
      
        5177
            index_path.write_text("<html></html>\n")
      
        5178
            chapter_one.write_text("<h1>Intro</h1>\n")
      
        5179
        
        5180
            implementation_plan = temp_dir / "implementation.md"
      
        5181
            implementation_plan.write_text(
      
        5182
                "\n".join(
      
        5183
                    [
      
        5184
                        "# Implementation Plan",
      
        5185
                        "",
      
        5186
                        "## File Changes",
      
        5187
                        f"- `{index_path}`",
      
        5188
                        f"- `{chapter_one}`",
      
        5189
                        f"- `{chapter_two}`",
      
        5190
                        "",
      
        5191
                    ]
      
        5192
                )
      
        5193
            )
      
        5194
        
        5195
            context = build_context(
      
        5196
                temp_dir=temp_dir,
      
        5197
                messages=[],
      
        5198
                safeguards=FakeSafeguards(),
      
        5199
                assess_confidence=assess_confidence,
      
        5200
                verify_action=verify_action,
      
        5201
                auto_recover=False,
      
        5202
            )
      
        5203
            queued: list[str] = []
      
        5204
            context.queue_steering_message_callback = queued.append
      
        5205
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        5206
            tool_call = ToolCall(
      
        5207
                id="write-2",
      
        5208
                name="write",
      
        5209
                arguments={"file_path": "", "content": "<html></html>\n"},
      
        5210
            )
      
        5211
            blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
      
        5212
            executor = FakeExecutor(
      
        5213
                [
      
        5214
                    ToolExecutionOutcome(
      
        5215
                        tool_call=tool_call,
      
        5216
                        state=ToolExecutionState.BLOCKED,
      
        5217
                        message=Message.tool_result_message(
      
        5218
                            tool_call_id=tool_call.id,
      
        5219
                            display_content=blocked_message,
      
        5220
                            result_content=blocked_message,
      
        5221
                            is_error=True,
      
        5222
                        ),
      
        5223
                        event_content=blocked_message,
      
        5224
                        is_error=True,
      
        5225
                        result_output=blocked_message,
      
        5226
                    )
      
        5227
                ]
      
        5228
            )
      
        5229
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        5230
            dod.implementation_plan = str(implementation_plan)
      
        5231
            dod.touched_files.extend([str(index_path), str(chapter_one)])
      
        5232
            dod.pending_items.append("Creating Chapter 2: Installation and Setup")
      
        5233
        
        5234
            await runner.execute_batch(
      
        5235
                tool_calls=[tool_call],
      
        5236
                tool_source="assistant",
      
        5237
                pending_tool_calls_seen=set(),
      
        5238
                emit=_noop_emit,
      
        5239
                summary=TurnSummary(final_response=""),
      
        5240
                dod=dod,
      
        5241
                executor=executor,  # type: ignore[arg-type]
      
        5242
                on_confirmation=None,
      
        5243
                on_user_question=None,
      
        5244
                emit_confirmation=None,
      
        5245
                consecutive_errors=0,
      
        5246
            )
      
        5247
        
        5248
            assert queued
      
        5249
            assert "did not provide a valid `file_path`" in queued[0]
      
        5250
            assert "Resume by creating `02-installation.html` now." in queued[0]
      
        5251
            assert (
      
        5252
                f"Prefer one `write` call for `{chapter_two}` instead of more rereads."
      
        5253
                in queued[0]
      
        5254
            )
      
        5255
            assert context.recovery_context is not None
      
        5256
            assert context.recovery_context.attempts[-1].error == blocked_message