loader Public

Watch 0 Fork 0 Star 0
Python · 284971 bytes Raw Blame History
  
        1
        """Tests for tool-batch execution on RuntimeContext."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        from types import SimpleNamespace
      
        7
        
        8
        import pytest
      
        9
        
        10
        from loader.llm.base import Message, Role, ToolCall
      
        11
        from loader.runtime.context import RuntimeContext
      
        12
        from loader.runtime.dod import (
      
        13
            DefinitionOfDoneStore,
      
        14
            VerificationEvidence,
      
        15
            create_definition_of_done,
      
        16
        )
      
        17
        from loader.runtime.events import AgentEvent, TurnSummary
      
        18
        from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
      
        19
        from loader.runtime.path_display import display_runtime_path
      
        20
        from loader.runtime.permissions import (
      
        21
            PermissionMode,
      
        22
            build_permission_policy,
      
        23
            load_permission_rules,
      
        24
        )
      
        25
        from loader.runtime.reasoning_types import (
      
        26
            ActionVerification,
      
        27
            ConfidenceAssessment,
      
        28
            ConfidenceLevel,
      
        29
        )
      
        30
        from loader.runtime.recovery import RecoveryContext
      
        31
        from loader.runtime.tool_batches import (
      
        32
            ToolBatchRunner,
      
        33
        )
      
        34
        from loader.runtime.tool_batches import (
      
        35
            _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
      
        36
        )
      
        37
        from loader.runtime.workflow import sync_todos_to_definition_of_done
      
        38
        from loader.tools.base import ToolResult as RegistryToolResult
      
        39
        from loader.tools.base import create_default_registry
      
        40
        from tests.helpers.runtime_harness import ScriptedBackend
      
        41
        
        42
        
        43
        class FakeSession:
            """Minimal in-memory session double holding messages and timeline entries."""

            def __init__(self, messages: list[Message]) -> None:
                # Take a shallow copy so later appends never touch the caller's list.
                self.workflow_timeline = []
                self.messages = [*messages]

            def append(self, message: Message) -> None:
                """Store ``message`` at the end of the recorded conversation."""
                self.messages.append(message)

            def append_workflow_timeline_entry(self, entry) -> None:
                """Store ``entry`` at the end of the recorded workflow timeline."""
                self.workflow_timeline.append(entry)
      
        53
        
        54
        
        55
        class FakeCodeFilter:
            """Code-filter double with nothing to reset."""

            def reset(self) -> None:
                """Do nothing and return ``None``."""
      
        58
        
        59
        
        60
        class FakeSafeguards:
            """Safeguards double: content passes through unchanged and nothing steers.

            Only the result of ``detect_loop`` is configurable, via the
            ``detect_loop_result`` keyword passed to the constructor.
            """

            def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
                self._detect_loop_result = detect_loop_result
                self.code_filter = FakeCodeFilter()
                # Opaque placeholders; this double never calls into them.
                self.action_tracker = object()
                self.validator = object()

            def filter_stream_chunk(self, content: str) -> str:
                """Return the streamed chunk untouched."""
                return content

            def filter_complete_content(self, content: str) -> str:
                """Return the complete content untouched."""
                return content

            def should_steer(self) -> bool:
                """Always report that no steering is needed."""
                return False

            def get_steering_message(self) -> str | None:
                """Never supply a steering message."""
                return None

            def record_response(self, content: str) -> None:
                """Ignore the response; nothing is recorded."""
                return None

            def detect_text_loop(self, content: str) -> tuple[bool, str]:
                """Always report that no text loop was found."""
                return (False, "")

            def detect_loop(self) -> tuple[bool, str]:
                """Return the loop-detection result supplied at construction."""
                return self._detect_loop_result
      
        87
        
        88
        
        89
        class FakeExecutor:
            """Executor double that replays queued outcomes in order and logs each call."""

            def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
                self.calls: list[ToolCall] = []
                # Private copy so draining the queue leaves the caller's list intact.
                self._outcomes = [*outcomes]

            async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
                """Record ``tool_call`` and hand back the oldest queued outcome.

                Extra keyword arguments are accepted and ignored.

                Raises:
                    AssertionError: if no outcome is left in the queue.
                """
                self.calls.append(tool_call)
                if self._outcomes:
                    oldest = self._outcomes[0]
                    del self._outcomes[0]
                    return oldest
                raise AssertionError("No fake tool outcome queued")
      
        99
        
        100
        
        101
        def build_context(
            *,
            temp_dir: Path,
            messages: list[Message],
            safeguards: FakeSafeguards,
            assess_confidence,
            verify_action,
            recovery_context: RecoveryContext | None = None,
            confidence_scoring: bool = False,
            verification: bool = False,
            auto_recover: bool = True,
            min_confidence_for_action: int = 3,
        ) -> RuntimeContext:
            """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` for tool-batch tests.

            The registry and permission policy are built from ``temp_dir``; the
            backend, session, config, capability profile and reasoning service are
            lightweight stand-ins (``ScriptedBackend``, ``FakeSession`` and
            ``SimpleNamespace`` objects).

            Args:
                temp_dir: Directory used as project root and workspace root.
                messages: Initial messages handed to the ``FakeSession``.
                safeguards: Safeguards double installed on the context.
                assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
                verify_action: Callable exposed as ``reasoning.verify_action``.
                recovery_context: Recovery state to start with, if any.
                confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
                verification: Value for ``config.reasoning.verification``.
                auto_recover: Value for ``config.auto_recover``.
                min_confidence_for_action: Value for
                    ``config.reasoning.min_confidence_for_action``.

            Returns:
                The constructed ``RuntimeContext``.
            """
            tool_registry = create_default_registry(temp_dir)
            tool_registry.configure_workspace_root(temp_dir)

            permission_status = load_permission_rules(temp_dir)
            permission_policy = build_permission_policy(
                active_mode=PermissionMode.WORKSPACE_WRITE,
                workspace_root=temp_dir,
                tool_requirements=tool_registry.get_tool_requirements(),
                rules=permission_status.rules,
            )

            backend = ScriptedBackend()
            session = FakeSession(messages)
            reasoning_settings = SimpleNamespace(
                rollback=False,
                show_rollback_plan=False,
                completion_check=True,
                max_continuation_prompts=5,
                self_critique=False,
                confidence_scoring=confidence_scoring,
                min_confidence_for_action=min_confidence_for_action,
                verification=verification,
            )
            settings = SimpleNamespace(
                force_react=False,
                max_recovery_attempts=2,
                auto_recover=auto_recover,
                reasoning=reasoning_settings,
            )
            capabilities = SimpleNamespace(supports_native_tools=True)
            reasoning_service = SimpleNamespace(
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )

            return RuntimeContext(
                project_root=temp_dir,
                backend=backend,
                registry=tool_registry,
                session=session,  # type: ignore[arg-type]
                config=settings,
                capability_profile=capabilities,  # type: ignore[arg-type]
                project_context=None,
                permission_policy=permission_policy,
                permission_config_status=permission_status,
                workflow_mode="execute",
                safeguards=safeguards,
                reasoning=reasoning_service,
                recovery_context=recovery_context,
            )
      
        156
        
        157
        
        158
        def tool_outcome(
            *,
            tool_call: ToolCall,
            output: str,
            is_error: bool,
            state: ToolExecutionState = ToolExecutionState.EXECUTED,
            metadata: dict[str, object] | None = None,
        ) -> ToolExecutionOutcome:
            """Build a ``ToolExecutionOutcome`` in which every text field is ``output``.

            Args:
                tool_call: The call this outcome answers; its ``id`` tags the message.
                output: Text used for the display, result, event and registry output.
                is_error: Error flag copied onto the message, outcome and registry result.
                state: Execution state stored on the outcome.
                metadata: Registry-result metadata; an empty dict is used when falsy.

            Returns:
                The assembled ``ToolExecutionOutcome``.
            """
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=output,
                result_content=output,
                is_error=is_error,
            )
            registry_result = RegistryToolResult(
                output=output,
                is_error=is_error,
                metadata=metadata if metadata else {},
            )
            return ToolExecutionOutcome(
                tool_call=tool_call,
                state=state,
                message=result_message,
                event_content=output,
                is_error=is_error,
                result_output=output,
                registry_result=registry_result,
            )
      
        184
        
        185
        
        186
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
            """A LOW confidence assessment must skip the tool call and leave a warning."""
            captured: dict[str, str] = {}
            emitted: list[AgentEvent] = []

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Keep the context string handed to the scorer so it can be checked below.
                captured["context"] = context
                return ConfidenceAssessment(
                    action=f"{tool_name} with {tool_args}",
                    tool_name=tool_name,
                    tool_args=tool_args,
                    level=ConfidenceLevel.LOW,
                    reasoning="Need to inspect the target first.",
                    risks=["Unknown target file"],
                )

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for skipped actions")

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            conversation = [
                Message(role=Role.USER, content="Please inspect the project."),
                Message(role=Role.ASSISTANT, content="I will read the file next."),
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=conversation,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                confidence_scoring=True,
                min_confidence_for_action=3,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})

            # The queued outcome exists only so an unexpected execution would not crash
            # the fake; the assertions below require that it is never consumed.
            executor = FakeExecutor(
                [tool_outcome(tool_call=tool_call, output="unused", is_error=False)]
            )
            result = await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert result.actions_taken == []
            assert executor.calls == []
            assert "Please inspect the project." in captured["context"]

            last_message = context.session.messages[-1]
            assert last_message.role == Role.USER
            assert "[LOW CONFIDENCE WARNING]" in last_message.content
            assert any(event.type == "confidence" for event in emitted)
      
        245
        
        246
        
        247
        @pytest.mark.asyncio
        async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
            """A failing tool call must create recovery state and emit a recovery event."""
            emitted: list[AgentEvent] = []

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for failed actions")

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=True,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
            failing_outcome = tool_outcome(
                tool_call=tool_call,
                output="command failed",
                is_error=True,
            )
            executor = FakeExecutor([failing_outcome])
            summary = TurnSummary(final_response="")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=create_definition_of_done("Run tests"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # No recovery context was passed to build_context, so one must have been created.
            assert context.recovery_context is not None
            assert summary.tool_result_messages
            assert context.session.messages[-1] == summary.tool_result_messages[-1]
            assert any(event.type == "recovery" for event in emitted)
      
        290
        
        291
        
        292
        @pytest.mark.asyncio
        async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
            """The emitted ``tool_result`` event must carry the registry result's metadata."""

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "python -m http.server 8000", "background": True},
            )
            metadata = {
                "job_id": "bash-1",
                "status": "running",
                "background": True,
            }
            executor = FakeExecutor(
                [
                    tool_outcome(
                        tool_call=tool_call,
                        output="Started bash job bash-1",
                        is_error=False,
                        metadata=metadata,
                    )
                ]
            )
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Launch a preview server"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Use a default with next(): a StopIteration escaping a coroutine is turned
            # into a RuntimeError, which would hide the real reason for the failure.
            tool_result = next(
                (event for event in events if event.type == "tool_result"),
                None,
            )
            assert tool_result is not None, "expected a tool_result event to be emitted"
            assert tool_result.tool_metadata == metadata
      
        350
        
        351
        
        352
        @pytest.mark.asyncio
        async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
            """With verification on, the context's ``verify_action`` sees the tool output."""
            verification_calls: list[str] = []
            emitted: list[AgentEvent] = []

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Log what was verified, then report a failed verification.
                verification_calls.append(result)
                return ActionVerification(
                    tool_name=tool_name,
                    tool_args=tool_args,
                    expected_outcome="Success",
                    actual_result=result,
                    verified=False,
                    discrepancies=["File contents did not match"],
                    needs_correction=True,
                    correction_suggestion="Read the file before editing again.",
                )

            async def emit(event: AgentEvent) -> None:
                emitted.append(event)

            existing_recovery = RecoveryContext(
                original_tool="edit",
                original_args={"file_path": "README.md"},
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                verification=True,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            read_outcome = tool_outcome(
                tool_call=tool_call,
                output="file contents",
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert verification_calls == ["file contents"]

            # The pre-existing recovery context must be kept and record the read.
            assert context.recovery_context is existing_recovery
            expected_steps = [("read", {"file_path": "README.md"})]
            assert existing_recovery.successful_steps == expected_steps

            last_message = context.session.messages[-1]
            assert last_message.role == Role.TOOL
            assert last_message.content == "file contents"
            assert any(event.type == "verification" for event in emitted)
      
        415
        
        416
        
        417
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
            temp_dir: Path,
        ) -> None:
            """A successful ``ls`` via bash must keep the existing recovery context."""

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Recovery state with one failed read already recorded.
            failed_read_args = {"file_path": "chapters/04-data-types.html"}
            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args={"file_path": "chapters/04-data-types.html"},
            )
            existing_recovery.add_attempt("read", failed_read_args, "File not found")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="bash-1", name="bash", arguments={"command": "ls chapters"})
            listing_outcome = tool_outcome(
                tool_call=tool_call,
                output="01-introduction.html",
                is_error=False,
            )
            executor = FakeExecutor([listing_outcome])
            summary = TurnSummary(final_response="")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is existing_recovery
            expected_steps = [("bash", {"command": "ls chapters"})]
            assert existing_recovery.successful_steps == expected_steps
      
        483
        
        484
        
        485
        @pytest.mark.asyncio
        async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``patch`` call must reset the context's recovery state to None."""

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Recovery state with one failed read already recorded.
            failed_read_args = {"file_path": "chapters/04-data-types.html"}
            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args={"file_path": "chapters/04-data-types.html"},
            )
            existing_recovery.add_attempt("read", failed_read_args, "File not found")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            single_hunk = {
                "old_start": 1,
                "old_lines": 1,
                "new_start": 1,
                "new_lines": 1,
                "lines": ["-a", "+b"],
            }
            tool_call = ToolCall(
                id="patch-1",
                name="patch",
                arguments={"file_path": "index.html", "hunks": [single_hunk]},
            )
            patch_outcome = tool_outcome(
                tool_call=tool_call,
                output="Patched index.html",
                is_error=False,
            )
            executor = FakeExecutor([patch_outcome])
            summary = TurnSummary(final_response="")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is None
      
        551
        
        552
        
        553
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_duplicate_observation_nudge(
            temp_dir: Path,
        ) -> None:
            """Check the steering nudge queued after a DUPLICATE read outcome.

            The executor reports a re-read of ``index.html`` as a duplicate while the
            implementation plan still lists ``chapters/04-variables.html``, which is
            never created on disk. Exactly one persistent steering message is
            expected, pointing at that missing file, and no ephemeral message.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            chapters_dir = temp_dir / "chapters"
            index_path = temp_dir / "index.html"
            intro_path = chapters_dir / "01-introduction.html"
            missing_chapter_path = chapters_dir / "04-variables.html"
            chapter_one_heading = "<h1>Chapter 1: Introduction to Fortran</h1>\n"

            # Conversation history: a glob listing, a finished chapter read, and an
            # index read that the assistant has already requested once.
            glob_observation = Message(
                role=Role.TOOL,
                content=(
                    "Observation [glob]: Result: "
                    f"{temp_dir}/chapters/01-introduction.html\n"
                    f"{temp_dir}/chapters/02-setup.html\n"
                    f"{temp_dir}/chapters/03-basics.html"
                ),
                tool_results=[],
            )
            chapter_read_request = Message(
                role=Role.ASSISTANT,
                content="I already inspected the first chapter title.",
                tool_calls=[
                    ToolCall(
                        id="read-ch1",
                        name="read",
                        arguments={"file_path": str(intro_path)},
                    )
                ],
            )
            chapter_read_result = Message.tool_result_message(
                tool_call_id="read-ch1",
                display_content=chapter_one_heading,
                result_content=chapter_one_heading,
            )
            index_read_request = Message(
                role=Role.ASSISTANT,
                content="I should update the index now.",
                tool_calls=[
                    ToolCall(
                        id="read-index",
                        name="read",
                        arguments={"file_path": str(index_path)},
                    )
                ],
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[
                    glob_observation,
                    chapter_read_request,
                    chapter_read_result,
                    index_read_request,
                ],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )

            # On-disk fixture: the index plus three of the four planned chapters.
            chapters_dir.mkdir()
            index_path.write_text("<ul></ul>\n")
            for chapter_name, heading in (
                ("01-introduction.html", "Intro"),
                ("02-setup.html", "Setup"),
                ("03-basics.html", "Basics"),
            ):
                (chapters_dir / chapter_name).write_text(f"<h1>{heading}</h1>\n")

            implementation_plan = temp_dir / "implementation.md"
            planned_paths = (
                index_path,
                intro_path,
                chapters_dir / "02-setup.html",
                chapters_dir / "03-basics.html",
                missing_chapter_path,
            )
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{planned}`" for planned in planned_paths)
            implementation_plan.write_text("\n".join(plan_lines))

            context.session.current_task = (
                f"Update {index_path} with the right chapter links."
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            tool_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            duplicate_message = (
                "[Skipped - duplicate action: Already read "
                f"{index_path} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Fix the chapter links")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items.append("Create the remaining chapter files")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            nudge = persistent_messages[0]
            for expected_fragment in (
                "Reuse the earlier observation instead of repeating it.",
                "A declared output artifact is still missing.",
                "Resume by creating `04-variables.html` now.",
            ):
                assert expected_fragment in nudge
            expected_write_hint = (
                "Prefer one `write` call for "
                f"`{display_runtime_path(missing_chapter_path)}` instead of more rereads."
            )
            assert expected_write_hint in nudge
            assert ephemeral_messages == []
      
        702
        
        703
        
        704
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html_output_active(
            temp_dir: Path,
        ) -> None:
            """Check that a linked-but-missing chapter stays the active target.

            ``guide/index.html`` links to ``chapters/02-installation.html``, which is
            not written to disk, while the plan only names the index and the chapters
            directory. After a DUPLICATE read of the index, the single persistent
            steering message must mention the pending item and the missing chapter,
            and must not claim that all planned artifacts exist.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # On-disk fixture: an index linking two chapters, only the first written.
            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            index_markup = (
                '<a href="chapters/01-introduction.html">Intro</a>\n'
                '<a href="chapters/02-installation.html">Install</a>\n'
            )
            index.write_text(index_markup)
            chapter_one.write_text("<h1>Intro</h1>\n")

            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index}`",
                f"- `{chapters}/` (directory for chapter files)",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            earlier_index_read = Message(
                role=Role.ASSISTANT,
                content="I should keep building the guide.",
                tool_calls=[
                    ToolCall(
                        id="read-index",
                        name="read",
                        arguments={"file_path": str(index)},
                    )
                ],
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[earlier_index_read],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            context.session.current_task = f"Build the guide rooted at {index}."
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            tool_call = ToolCall(
                id="read-dup-rooted",
                name="read",
                arguments={"file_path": str(index)},
            )
            duplicate_message = (
                "[Skipped - duplicate action: Already read "
                f"{index} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create a multi-file HTML guide with chapters.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files = [str(index), str(chapter_one)]
            dod.completed_items = ["Create chapter files with appropriate content"]
            dod.pending_items.append("Create the remaining chapter files")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            nudge = persistent_messages[0]
            assert "Create the remaining chapter files" in nudge
            assert "Resume by creating `02-installation.html` now." in nudge
            assert "All explicitly planned artifacts already exist on disk." not in nudge
            assert ephemeral_messages == []
      
        827
        
        828
        
        829
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_edit_mismatch_steers_to_mutation(
            temp_dir: Path,
        ) -> None:
            """Check the nudge after a duplicate read that follows a failed edit.

            The context starts with a recovery record for an ``edit`` whose
            ``old_string`` was not found. When the next read of the same file comes
            back as DUPLICATE, the single persistent steering message must reference
            the last edit, the ``old_string`` mismatch, and ask for one concrete
            mutation (suggesting a full ``write``).
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            target = temp_dir / "guide" / "chapters" / "02-installation.html"
            target.parent.mkdir(parents=True)
            target.write_text(
                "<h1>Chapter 2: Installation Guide</h1>\n"
                "<p>This chapter is still too thin.</p>\n"
            )

            # The edit that failed: its old_string does not occur in the file above.
            failed_edit_args = {
                "file_path": str(target),
                "old_string": "<h1>Installation</h1>",
                "new_string": "<h1>Installation</h1><p>Expanded.</p>",
            }
            recovery_context = RecoveryContext(
                original_tool="edit",
                original_args=dict(failed_edit_args),
                max_retries=2,
            )
            recovery_context.add_attempt(
                "edit",
                dict(failed_edit_args),  # separate dict object, as in the recorded attempt
                "old_string not found in file. Make sure it matches exactly.",
            )

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                recovery_context=recovery_context,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            tool_call = ToolCall(
                id="read-dup-after-edit-miss",
                name="read",
                arguments={"file_path": str(target)},
            )
            duplicate_message = (
                "[Skipped - duplicate action: Already read "
                f"{target} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([duplicate_outcome])
            dod = create_definition_of_done("Expand thin generated guide chapters.")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            nudge = persistent_messages[0]
            for expected_fragment in (
                "last edit",
                "`old_string` did not exactly match",
                "send one concrete mutation now",
                "`write` with the complete replacement content",
            ):
                assert expected_fragment in nudge
      
        931
        
        932
        
        933
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
            temp_dir: Path,
        ) -> None:
            """Check that a stale TodoWrite cannot un-complete a written file's todo.

            The batch holds a successful ``write`` of ``03-first-website.html``
            followed by a ``TodoWrite`` that still lists that todo as pending. After
            the batch, the chapter-3 todo must be in ``completed_items`` and absent
            from ``pending_items``, while the chapter-4 todo stays pending.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            def pending_todos() -> list[dict[str, str]]:
                # Fresh list on every call so each consumer gets its own objects.
                return [
                    {
                        "content": "Create 03-first-website.html",
                        "active_form": "Creating 03-first-website.html",
                        "status": "pending",
                    },
                    {
                        "content": "Create 04-configuration-basics.html",
                        "active_form": "Creating 04-configuration-basics.html",
                        "status": "pending",
                    },
                ]

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            sync_todos_to_definition_of_done(dod, pending_todos())

            chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
            chapter_path.parent.mkdir(parents=True)
            write_call = ToolCall(
                id="write-ch3",
                name="write",
                arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
            )
            # The model re-sends the todo list with chapter 3 still marked pending.
            stale_todo_call = ToolCall(
                id="todo-stale",
                name="TodoWrite",
                arguments={"todos": pending_todos()},
            )
            write_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_path}",
                is_error=False,
            )
            stale_todo_outcome = tool_outcome(
                tool_call=stale_todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": pending_todos()},
            )
            executor = FakeExecutor([write_outcome, stale_todo_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call, stale_todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert "Create 03-first-website.html" in dod.completed_items
            assert "Create 03-first-website.html" not in dod.pending_items
            assert "Create 04-configuration-basics.html" in dod.pending_items
      
        1050
        
        1051
        
        1052
        @pytest.mark.asyncio
        async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
            temp_dir: Path,
        ) -> None:
            """Run a successful ``glob`` over ``chapters/`` during an index-link task.

            After the batch, no persistent or ephemeral steering message may have
            been queued, and the single tool-result message must not contain a
            "Verified chapter inventory:" section.

            NOTE(review): the test name says "proactively_queues", yet the
            assertions require that nothing is queued -- confirm the name still
            reflects the intended behavior.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            # Workspace: two real chapter files plus an index with an empty list.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            intro_page = chapters_dir / "01-introduction.html"
            setup_page = chapters_dir / "02-setup.html"
            intro_page.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
            setup_page.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
            index_page = temp_dir / "index.html"
            index_page.write_text("<ul></ul>\n")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = (
                f"Update {index_page} so the chapter links match the sibling files."
            )

            # Capture every steering message the runner tries to queue.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters_dir), "pattern": "*.html"},
            )
            # The canned glob output lists both chapter files, one per line.
            glob_listing = f"{intro_page}\n{setup_page}"
            executor = FakeExecutor(
                [tool_outcome(tool_call=glob_call, output=glob_listing, is_error=False)]
            )

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Fix the chapter links")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log == []
            assert ephemeral_log == []
            result_messages = summary.tool_result_messages
            assert len(result_messages) == 1
            assert "Verified chapter inventory:" not in result_messages[0].content
      
        1136
        
        1137
        
        1138
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
            temp_dir: Path,
        ) -> None:
            """Run a successful ``edit`` that swaps stale chapter links for real ones.

            After the batch, no tool-result message may contain
            "Semantic verification preview:" and neither steering queue may have
            received a message.

            NOTE(review): the test name mentions "marks ... completion", but the
            assertions only check that nothing extra is emitted -- confirm the
            name still reflects the intended behavior.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            task = (
                "Update index.html so every chapter link and title matches the real HTML files in chapters/."
            )

            # Two chapter files whose names and headings match the "new" link list.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            (chapters_dir / "01-introduction.html").write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            (chapters_dir / "02-setup.html").write_text(
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
            )

            index_path = temp_dir / "index.html"
            stale_list = "".join(
                [
                    '<ul class="chapter-list">\n',
                    '    <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n',
                    '    <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n',
                    "</ul>\n",
                ]
            )
            fresh_list = "".join(
                [
                    '<ul class="chapter-list">\n',
                    '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
                    '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
                    "</ul>\n",
                ]
            )
            # The index already holds the post-edit content before the batch runs
            # (presumably because the fake executor does not touch disk -- confirm).
            index_path.write_text(fresh_list)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = task

            # Capture every steering message the runner tries to queue.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_path),
                    "old_string": stale_list,
                    "new_string": fresh_list,
                },
            )
            edit_outcome = tool_outcome(
                tool_call=edit_call,
                output=f"Successfully edited {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([edit_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done(task)
            await runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not any(
                "Semantic verification preview:" in message.content
                for message in summary.tool_result_messages
            )
            assert steering_log == []
            assert ephemeral_log == []
      
        1238
        
        1239
        
        1240
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
            temp_dir: Path,
        ) -> None:
            """Read an index.html while the task is to study a guide, not fix it.

            The current task asks to look at an existing Fortran guide before
            writing a new nginx guide. After a successful ``read`` of the index,
            no steering message may be queued and no tool-result message may
            contain "Semantic verification preview:".
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            # Two chapter files and an index whose links already match them.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            (chapters_dir / "01-introduction.html").write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            (chapters_dir / "02-setup.html").write_text(
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
            )
            index_path = temp_dir / "index.html"
            index_markup = "".join(
                [
                    "<h2>Table of Contents</h2>\n",
                    '<ul class="chapter-list">\n',
                    '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
                    '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
                    "</ul>\n",
                ]
            )
            index_path.write_text(index_markup)

            prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = prompt  # type: ignore[attr-defined]

            # Capture every steering message the runner tries to queue.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            read_call = ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            # The canned read result is the index file's content as stored on disk.
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=index_path.read_text(),
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done(prompt)
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log == []
            assert ephemeral_log == []
            assert not any(
                "Semantic verification preview:" in message.content
                for message in summary.tool_result_messages
            )
      
        1332
        
        1333
        
        1334
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
            temp_dir: Path,
        ) -> None:
            """Read a reference chapter while three todos are pending.

            After the batch, the "Examine ..." todo must be listed as completed,
            and a persistent steering message must point at the next pending item
            ("Create the nginx directory structure") and tell the agent to create
            ``chapters/``. No persistent message may mention the reference file
            name, and the ephemeral queue must stay empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference material the agent is expected to read.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            # Planned outputs: these paths are only named in the plan, not created.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            implementation_plan = temp_dir / "implementation.md"
            # The plan lists the chapters directory before index.html.
            implementation_plan.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{chapters_dir}/`\n"
                f"- `{nginx_root / 'index.html'}`\n"
            )

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture every steering message the runner tries to queue.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)

            examine_todo = (
                "Examine the existing Fortran guide structure to understand the cadence and format"
            )
            todo_titles = (
                examine_todo,
                "Create the nginx directory structure",
                "Create the nginx index.html file",
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": title,
                        "active_form": f"Working on: {title}",
                        "status": "pending",
                    }
                    for title in todo_titles
                ],
            )

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            executor = FakeExecutor(
                [tool_outcome(tool_call=read_call, output=reference_html, is_error=False)]
            )

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert examine_todo in dod.completed_items
            assert any(
                "Continue with the next pending item: `Create the nginx directory structure`"
                in message
                for message in steering_log
            )
            assert any(
                "Resume by creating `chapters/` now." in message
                for message in steering_log
            )
            assert not any("01-introduction.html" in message for message in steering_log)
            assert ephemeral_log == []
      
        1452
        
        1453
        
        1454
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
            temp_dir: Path,
        ) -> None:
            """Read a reference chapter when the plan lists index.html before chapters/.

            Even with ``index.html`` listed first in the implementation plan, a
            persistent steering message must point at "Create the nginx directory
            structure" and tell the agent to create ``chapters/``. No persistent
            message may say "Next step: create `index.html`.", and the ephemeral
            queue must stay empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference material the agent is expected to read.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            # Planned outputs: these paths are only named in the plan, not created.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            implementation_plan = temp_dir / "implementation.md"
            # Here the plan deliberately lists index.html ahead of the directory.
            implementation_plan.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{nginx_root / 'index.html'}`\n"
                f"- `{chapters_dir}/`\n"
            )

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture every steering message the runner tries to queue.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)

            todo_titles = (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "Create the nginx directory structure",
                "Create the nginx index.html file",
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": title,
                        "active_form": f"Working on: {title}",
                        "status": "pending",
                    }
                    for title in todo_titles
                ],
                project_root=temp_dir,
            )

            read_call = ToolCall(
                id="read-reference-index-first",
                name="read",
                arguments={"file_path": str(reference)},
            )
            executor = FakeExecutor(
                [tool_outcome(tool_call=read_call, output=reference_html, is_error=False)]
            )

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            assert any(
                "Continue with the next pending item: `Create the nginx directory structure`"
                in message
                for message in steering_log
            )
            assert any(
                "Resume by creating `chapters/` now." in message
                for message in steering_log
            )
            assert not any(
                "Next step: create `index.html`." in message
                for message in steering_log
            )
            assert ephemeral_log == []
      
        1573
        
        1574
        
        1575
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
            temp_dir: Path,
        ) -> None:
            """Duplicate read of a reference file should steer to the next pending todo.

            The fake executor reports the ``read`` call as ``DUPLICATE``. The test then
            expects exactly one persistent steering message that tells the model to
            reuse the earlier observation and names the first pending todo, with no
            "Update `...`" hint and no ephemeral steering.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # Reference file on disk, plus a transcript entry showing it was already read.
            reference_html = "<h1>Fortran Beginner's Guide</h1>\n"
            reference = temp_dir / "fortran" / "index.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            earlier_read = Message(
                role=Role.TOOL,
                content="Observation [read]: Result: " + reference_html,
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[earlier_read],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )
            runtime_context.session.current_task = prompt

            # Capture both steering channels so they can be inspected afterwards.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done(prompt)

            # One completed todo followed by two pending ones; each active form is
            # "Working on: <content>".
            todo_plan = (
                (
                    "Examine the existing Fortran guide structure to understand the cadence and format",
                    "completed",
                ),
                ("Create the nginx directory structure", "pending"),
                ("Create the nginx index.html file", "pending"),
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": content,
                        "active_form": f"Working on: {content}",
                        "status": status,
                    }
                    for content, status in todo_plan
                ],
            )

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(reference)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {reference} recently "
                "without any intervening changes; reuse the earlier read result "
                "instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(steering_log) == 1
            steering = steering_log[0]
            assert "Reuse the earlier observation instead of repeating it." in steering
            assert (
                "Continue with the next pending item: `Create the nginx directory structure`"
                in steering
            )
            assert "Update `" not in steering
            assert ephemeral_log == []
      
        1697
        
        1698
        
        1699
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """A successful reference read should point at the missing planned file.

            The implementation plan lists ``index.html`` and two chapter files, but only
            the first chapter exists on disk. After an ``EXECUTED`` read of a Fortran
            reference chapter, the test expects persistent steering that confirms the
            "Examine ..." todo, says to resume by creating ``index.html``, and does not
            fall back to the generic next pending todo. No ephemeral steering is expected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            guides = temp_dir / "Loader" / "guides"

            # Target guide: the chapters directory and first chapter exist; index.html
            # is deliberately never written.
            guide_root = guides / "nginx"
            chapter_dir = guide_root / "chapters"
            chapter_dir.mkdir(parents=True)
            chapter_one = chapter_dir / "01-introduction.html"
            chapter_one.write_text("<html></html>\n")
            index_path = guide_root / "index.html"

            # Reference guide chapter that the tool call reads.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = guides / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True, exist_ok=True)
            reference.write_text(reference_html)

            # Plan file: directories get a trailing slash, files do not; the final
            # empty entry yields a trailing newline after the join.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{directory}/`" for directory in (guide_root, chapter_dir)]
            plan_lines += [
                f"- `{artifact}`"
                for artifact in (
                    index_path,
                    chapter_one,
                    chapter_dir / "02-installation.html",
                )
            ]
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.append(str(chapter_one))

            # Every todo starts out pending; each active form is "Working on: <content>".
            todo_contents = (
                "Examine the existing Fortran guide structure to understand the format and cadence",
                "Create each chapter file with appropriate content",
                "Ensure all files follow the same structure and style as the Fortran guide",
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": content,
                        "active_form": f"Working on: {content}",
                        "status": "pending",
                    }
                    for content in todo_contents
                ],
            )

            read_call = ToolCall(
                id="read-reference-chapter",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_output = "Observation [read]: Result: " + reference_html
            executed_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.EXECUTED,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=read_output,
                    result_content=read_output,
                ),
                event_content=read_output,
                is_error=False,
                result_output=read_output,
            )
            executor = FakeExecutor([executed_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            def mentioned(fragment: str) -> bool:
                # True when any queued persistent steering message contains the fragment.
                return any(fragment in message for message in steering_log)

            assert steering_log
            assert mentioned(
                "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
            )
            assert mentioned("Resume by creating `index.html` now.")
            assert not mentioned(
                "Continue with the next pending item: `Create each chapter file with appropriate content`"
            )
            assert ephemeral_log == []
      
        1833
        
        1834
        
        1835
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
            temp_dir: Path,
        ) -> None:
            """Duplicate read with the plan complete should skip the unplanned create item.

            Every path listed in the implementation plan already exists on disk. The
            pending items include "Create 07-performance-tuning.html", which the plan
            does not list. After a ``DUPLICATE`` read, the test expects one persistent
            steering message that mentions the verification item and not the
            07-performance-tuning item, and no ephemeral steering.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for confidence scoring.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # Create every artifact that the plan lists.
            guide_root = temp_dir / "guides" / "nginx"
            chapter_dir = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapter_dir.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapter_dir / "01-getting-started.html"
            chapter_two = chapter_dir / "02-installation.html"
            for artifact, html in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            # Plan file: directories get a trailing slash, files do not; the final
            # empty entry yields a trailing newline after the join.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{directory}/`" for directory in (guide_root, chapter_dir)]
            plan_lines += [
                f"- `{artifact}`" for artifact in (index_path, chapter_one, chapter_two)
            ]
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            # The first pending item names a file that the plan never lists.
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Verify all guide files are linked and complete",
                "Complete the requested work",
            ]

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {chapter_one} recently "
                "without any intervening changes; reuse the earlier read result "
                "instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(steering_log) == 1
            steering = steering_log[0]
            assert "Verify all guide files are linked and complete" in steering
            assert "Create 07-performance-tuning.html" not in steering
            assert ephemeral_log == []
      
        1949
        
        1950
        
        1951
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Duplicate read with the plan complete should hand off to verification.

            Every path listed in the implementation plan exists on disk and the
            definition of done carries a verification command. The pending items hold
            only an unplanned create item and the generic completion item. After a
            ``DUPLICATE`` read, the test expects one persistent steering message saying
            that the planned artifacts exist and that a final response should be given
            so verification can run. It must not mention the 07-performance-tuning
            item, and no ephemeral steering is expected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for confidence scoring.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # Create every artifact that the plan lists.
            guide_root = temp_dir / "guides" / "nginx"
            chapter_dir = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapter_dir.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapter_dir / "01-getting-started.html"
            chapter_two = chapter_dir / "02-installation.html"
            for artifact, html in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            # Plan file: directories get a trailing slash, files do not; the final
            # empty entry yields a trailing newline after the join.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{directory}/`" for directory in (guide_root, chapter_dir)]
            plan_lines += [
                f"- `{artifact}`" for artifact in (index_path, chapter_one, chapter_two)
            ]
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append

            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            # The first pending item names a file that the plan never lists.
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Complete the requested work",
            ]

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {chapter_one} recently "
                "without any intervening changes; reuse the earlier read result "
                "instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=read_call.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(steering_log) == 1
            steering = steering_log[0]
            assert "All explicitly planned artifacts already exist on disk." in steering
            assert (
                "Finish with a final response now so Loader can run verification automatically."
                in steering
            )
            assert "Create 07-performance-tuning.html" not in steering
            assert ephemeral_log == []
      
        2069
        
        2070
        
        2071
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
            temp_dir: Path,
        ) -> None:
            """Duplicate read with every planned artifact on disk yields one finish nudge.

            All directories and files named in the implementation plan are created
            up front, while the DoD still carries creation-style pending items.  A
            ``read`` reported as ``DUPLICATE`` must queue exactly one persistent
            steering message that asks for a final response and does not repeat the
            stale creation items; the ephemeral queue must stay empty.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails if confidence scoring is reached.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails if action verification is reached.
                raise AssertionError("Verification should not run for this scenario")

            # Materialise the finished guide so every planned path already exists.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapter_dir.mkdir()
            index_page = nginx_dir / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            second_chapter = chapter_dir / "02-installation.html"
            for artifact, body in (
                (index_page, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(body)

            # Plan document: header, directory entries (trailing slash), then files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{folder}/`" for folder in (nginx_dir, chapter_dir)]
            plan_lines += [f"- `{page}`" for page in (index_page, first_chapter, second_chapter)]
            plan_lines.append("")  # empty last entry makes the joined text end in a newline
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            # Capture both steering channels so their contents can be asserted on.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            context.queue_steering_message_callback = queued_persistent.append
            context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {nginx_dir}"]
            # Pending items that still talk about creating files which already exist.
            dod.pending_items = [
                "Create 01-getting-started.html",
                "Creating 02-installation.html",
                "Complete the requested work",
            ]

            tool_call = ToolCall(
                id="read-dup-built-stale",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {first_chapter} "
                "recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_persistent) == 1
            nudge = queued_persistent[0]
            assert "All explicitly planned artifacts already exist on disk." in nudge
            assert (
                "Finish with a final response now so Loader can run verification automatically."
                in nudge
            )
            # The stale creation todos must not be echoed back.
            assert "Create 01-getting-started.html" not in nudge
            assert "Creating 02-installation.html" not in nudge
            assert queued_ephemeral == []
      
        2191
        
        2192
        
        2193
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_read_after_plan_complete_pushes_review_handoff(
            temp_dir: Path,
        ) -> None:
            """Successful read with a pending review todo yields an ephemeral review handoff.

            Every planned artifact exists on disk.  The synced todos hold one
            creation item and one review-style item, both pending.  After a
            successful ``read`` the persistent queue must stay empty and exactly one
            ephemeral message must name the review item, omit the creation item, and
            discourage further broad rereads.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails if confidence scoring is reached.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails if action verification is reached.
                raise AssertionError("Verification should not run for this scenario")

            # Materialise the finished guide so every planned path already exists.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapter_dir.mkdir()
            index_page = nginx_dir / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            second_chapter = chapter_dir / "02-installation.html"
            for artifact, body in (
                (index_page, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(body)

            # Plan document: header, directory entries (trailing slash), then files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{folder}/`" for folder in (nginx_dir, chapter_dir)]
            plan_lines += [f"- `{page}`" for page in (index_page, first_chapter, second_chapter)]
            plan_lines.append("")  # empty last entry makes the joined text end in a newline
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            # Capture both steering channels so their contents can be asserted on.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            context.queue_steering_message_callback = queued_persistent.append
            context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {nginx_dir}"]
            # One creation todo for a file that already exists, plus one review todo.
            todos = [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "pending",
                },
                {
                    "content": "Ensure all files are properly linked and formatted consistently",
                    "active_form": "Reviewing guide consistency and linkage",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            tool_call = ToolCall(
                id="read-built-review",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            read_outcome = tool_outcome(
                tool_call=tool_call,
                output=first_chapter.read_text(),
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_persistent == []
            assert len(queued_ephemeral) == 1
            handoff = queued_ephemeral[0]
            assert "All explicitly planned artifacts already exist." in handoff
            assert "Ensure all files are properly linked and formatted consistently" in handoff
            assert "Create 01-getting-started.html" not in handoff
            assert "do not keep broad-rereading the output set" in handoff
            assert "If no specific mismatch remains, finish with a final response so Loader can verify." in handoff
      
        2304
        
        2305
        
        2306
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_read_after_plan_complete_switches_to_verify(
            temp_dir: Path,
        ) -> None:
            """Successful read with all artifacts built and no todos moves to verify.

            Every planned artifact exists on disk and no todos are synced into the
            DoD.  After a successful ``read`` exactly one persistent steering message
            must ask for a final response and tell the model to stop broad rereads,
            the ephemeral queue must stay empty, and ``context.workflow_mode`` must
            equal ``"verify"``.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails if confidence scoring is reached.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails if action verification is reached.
                raise AssertionError("Verification should not run for this scenario")

            # Materialise the finished guide so every planned path already exists.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapter_dir.mkdir()
            index_page = nginx_dir / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            second_chapter = chapter_dir / "02-installation.html"
            for artifact, body in (
                (index_page, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(body)

            # Plan document: header, directory entries (trailing slash), then files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{folder}/`" for folder in (nginx_dir, chapter_dir)]
            plan_lines += [f"- `{page}`" for page in (index_page, first_chapter, second_chapter)]
            plan_lines.append("")  # empty last entry makes the joined text end in a newline
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            # Capture both steering channels so their contents can be asserted on.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            context.queue_steering_message_callback = queued_persistent.append
            context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {nginx_dir}"]

            tool_call = ToolCall(
                id="read-built-verify",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            read_outcome = tool_outcome(
                tool_call=tool_call,
                output=first_chapter.read_text(),
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_persistent) == 1
            nudge = queued_persistent[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "Finish with a final response now so Loader can run verification automatically." in nudge
            assert "stop broad rereads" in nudge
            assert queued_ephemeral == []
            assert context.workflow_mode == "verify"
      
        2400
        
        2401
        
        2402
        @pytest.mark.asyncio
        async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
            temp_dir: Path,
        ) -> None:
            """Reading reference material hands off to the pending creation todo.

            The synced todos hold an "examine" item followed by a "create" item,
            both pending.  After a successful ``read`` of the reference chapter, some
            persistent steering message must point at the creation item and some
            persistent message must tell the model to stop gathering reference
            material; the ephemeral queue must stay empty.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails if confidence scoring is reached.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails if action verification is reached.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference chapter that the read targets.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            # Capture both steering channels so their contents can be asserted on.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            context.queue_steering_message_callback = queued_persistent.append
            context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            # An observation-style todo first, then the creation todo.
            todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            tool_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_outcome = tool_outcome(
                tool_call=tool_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            next_item_hint = "Continue with the next pending item: `Create the nginx index.html file`"
            stop_gathering_hint = "stop gathering more reference material and perform the change now"
            assert any(next_item_hint in queued for queued in queued_persistent)
            assert any(stop_gathering_hint in queued for queued in queued_persistent)
            assert queued_ephemeral == []
      
        2494
        
        2495
        
        2496
        @pytest.mark.asyncio
        async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
            temp_dir: Path,
        ) -> None:
            """Handoff after a discovery read goes to the persistent queue only.

            The synced todos hold a "first, examine" item followed by a directory
            creation item, both pending.  After a successful ``read`` of the
            reference chapter, the persistent queue must be non-empty and contain a
            message pointing at the directory-creation item; the ephemeral queue
            must stay empty.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails if confidence scoring is reached.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails if action verification is reached.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference chapter that the read targets.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            # Capture both steering channels so their contents can be asserted on.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            context.queue_steering_message_callback = queued_persistent.append
            context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            # A discovery-style todo first, then the directory-creation todo.
            todos = [
                {
                    "content": "First, examine the existing fortran guide structure and content",
                    "active_form": "Working on: First, examine the existing fortran guide structure and content",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Working on: Create the nginx directory structure",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            tool_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_outcome = tool_outcome(
                tool_call=tool_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            next_item_hint = "Continue with the next pending item: `Create the nginx directory structure`"
            assert queued_persistent
            assert any(next_item_hint in queued for queued in queued_persistent)
            assert queued_ephemeral == []
      
        2585
        
        2586
        
        2587
        @pytest.mark.asyncio
        async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
            temp_dir: Path,
        ) -> None:
            """A successful setup ``mkdir -p`` should queue a persistent nudge naming ``index.html``.

            The implementation plan lists the chapters directory and ``index.html``.
            After the scripted ``bash`` call succeeds, the last persistent steering
            message must announce that directory setup is complete and point at
            ``index.html``; nothing may be queued on the ephemeral channel.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails loudly if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails loudly if action verification is invoked.
                raise AssertionError("Verification should not run for this scenario")

            # Planned layout; nothing is created on disk besides the plan file itself.
            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            index_file = guide_root / "index.html"
            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so each can be inspected separately.
            durable_nudges: list[str] = []
            transient_nudges: list[str] = []
            runtime_context.queue_steering_message_callback = durable_nudges.append
            runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            pending_todos = [
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Creating the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Develop the main index.html file with proper structure",
                    "active_form": "Developing the main index.html file with proper structure",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            mkdir_call = ToolCall(
                id="mkdir-nginx",
                name="bash",
                arguments={"command": f"mkdir -p {chapters_dir}"},
            )
            scripted_outcome = tool_outcome(
                tool_call=mkdir_call,
                output="",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert durable_nudges
            latest_nudge = durable_nudges[-1]
            required_fragments = (
                "Directory setup is complete.",
                "Next step: create `index.html`.",
                "Write a compact but real initial version of that file now",
            )
            for fragment in required_fragments:
                assert fragment in latest_nudge
            assert transient_nudges == []
      
        2689
        
        2690
        
        2691
        @pytest.mark.asyncio
        async def test_tool_batch_runner_first_chapter_handoff_stays_persistent_until_substantive_output_exists(
            temp_dir: Path,
        ) -> None:
            """Writing a stub ``index.html`` should hand off persistently to the first chapter.

            The chapters directory already exists and the plan lists ``index.html``
            plus ``01-introduction.html``. After a scripted successful ``write`` of a
            14-byte index, the last persistent steering message must confirm progress
            and direct a single ``write`` call at the resolved chapter path, without
            the "compact but real initial version" wording and without any ephemeral
            message.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails loudly if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails loudly if action verification is invoked.
                raise AssertionError("Verification should not run for this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_root / "index.html"
            first_chapter = chapters_dir / "01-introduction.html"

            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{first_chapter}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so each can be inspected separately.
            durable_nudges: list[str] = []
            transient_nudges: list[str] = []
            runtime_context.queue_steering_message_callback = durable_nudges.append
            runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            pending_todos = [
                {
                    "content": "Create the main index.html file with proper structure",
                    "active_form": "Creating the main index.html file with proper structure",
                    "status": "pending",
                },
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Creating each chapter file with appropriate content",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            write_call = ToolCall(
                id="write-index",
                name="write",
                arguments={
                    "file_path": str(index_file),
                    "content": "<html></html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_file}",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert durable_nudges
            assert transient_nudges == []
            latest_nudge = durable_nudges[-1]
            # The handoff names the chapter by its resolved (non-strict) absolute path.
            resolved_chapter = first_chapter.resolve(strict=False)
            required_fragments = (
                "Confirmed progress:",
                "Next step: create `01-introduction.html`.",
                f"Prefer one `write(file_path=..., content=...)` call for `{resolved_chapter}` now.",
            )
            for fragment in required_fragments:
                assert fragment in latest_nudge
            assert "Write a compact but real initial version of that file now" not in latest_nudge
            assert "Do not reread reference material or spend the next turn on bookkeeping." in latest_nudge
      
        2805
        
        2806
        
        2807
        @pytest.mark.asyncio
        async def test_tool_batch_runner_directory_handoff_uses_home_relative_path(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """The post-``mkdir`` handoff should display the next file relative to ``~``.

            ``HOME`` is pointed at the resolved temp directory, so the planned
            ``index.html`` under ``Loader/guides/nginx`` is expected to appear in the
            persistent steering message as ``~/Loader/guides/nginx/index.html``.
            """
            home_dir = temp_dir.resolve(strict=False)
            monkeypatch.setenv("HOME", str(home_dir))

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails loudly if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails loudly if action verification is invoked.
                raise AssertionError("Verification should not run for this scenario")

            # Planned layout; nothing is created on disk besides the plan file itself.
            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            index_file = guide_root / "index.html"

            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Only the persistent channel is captured in this scenario.
            durable_nudges: list[str] = []
            runtime_context.queue_steering_message_callback = durable_nudges.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            pending_todos = [
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Creating the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Develop the main index.html file with proper structure",
                    "active_form": "Developing the main index.html file with proper structure",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            mkdir_call = ToolCall(
                id="mkdir-nginx-home",
                name="bash",
                arguments={"command": f"mkdir -p {chapters_dir}"},
            )
            scripted_outcome = tool_outcome(
                tool_call=mkdir_call,
                output="",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert durable_nudges
            latest_nudge = durable_nudges[-1]
            required_fragments = (
                "Next step: create `index.html`.",
                "`~/Loader/guides/nginx/index.html`",
                "Write a compact but real initial version of that file now",
            )
            for fragment in required_fragments:
                assert fragment in latest_nudge
      
        2911
        
        2912
        
        2913
        @pytest.mark.asyncio
        async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Rereading a just-written ``index.html`` should redirect to the missing chapter.

            ``index.html`` exists on disk and is already recorded as touched and
            completed on the definition of done, while ``01-introduction.html`` is
            planned but absent. After a scripted ``read`` of the index, the last
            persistent steering message must tell the agent to resume by creating the
            chapter; nothing may be queued on the ephemeral channel.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails loudly if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails loudly if action verification is invoked.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_root / "index.html"
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_lines) + "\n")

            first_chapter = chapters_dir / "01-introduction.html"
            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{first_chapter}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so each can be inspected separately.
            durable_nudges: list[str] = []
            transient_nudges: list[str] = []
            runtime_context.queue_steering_message_callback = durable_nudges.append
            runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # Seed the definition of done as if the index write had already happened.
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.touched_files.append(str(index_file))
            dod.completed_items.append("Develop the main index.html file for the nginx guide")
            dod.pending_items.append("Create chapter files for the nginx guide")

            read_call = ToolCall(
                id="read-index-self-audit",
                name="read",
                arguments={"file_path": str(index_file)},
            )
            scripted_outcome = tool_outcome(
                tool_call=read_call,
                output="1\t<html>\n",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert durable_nudges
            latest_nudge = durable_nudges[-1]
            required_fragments = (
                "You already have the current contents of `index.html` from the successful write.",
                "Resume by creating `01-introduction.html` now.",
                "Do not spend another turn rereading the file you just wrote or on TodoWrite alone.",
            )
            for fragment in required_fragments:
                assert fragment in latest_nudge
            assert transient_nudges == []
      
        3019
        
        3020
        
        3021
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preserves_first_file_handoff_after_recovery_prompt(
            temp_dir: Path,
        ) -> None:
            """The first-chapter handoff should survive an earlier empty-response recovery prompt.

            The context is seeded with a user message carrying the
            ``[EMPTY ASSISTANT RESPONSE]`` recovery text. After a scripted successful
            ``write`` of ``index.html``, the last persistent steering message must
            still name ``01-introduction.html`` as the next step, without the
            "compact but real initial version" wording and without any ephemeral
            message.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: the test fails loudly if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: the test fails loudly if action verification is invoked.
                raise AssertionError("Verification should not run for this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters_dir = guide_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_root / "index.html"
            first_chapter = chapters_dir / "01-introduction.html"

            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{first_chapter}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            # Conversation history already contains the empty-response recovery prompt.
            recovery_prompt = Message(
                role=Role.USER,
                content=(
                    "[EMPTY ASSISTANT RESPONSE]\n"
                    "Respond with that concrete mutation tool call now. Do not return an empty response."
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[recovery_prompt],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so each can be inspected separately.
            durable_nudges: list[str] = []
            transient_nudges: list[str] = []
            runtime_context.queue_steering_message_callback = durable_nudges.append
            runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            pending_todos = [
                {
                    "content": "Create the main index.html file with proper structure",
                    "active_form": "Creating the main index.html file with proper structure",
                    "status": "pending",
                },
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Creating each chapter file with appropriate content",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            write_call = ToolCall(
                id="write-index-recovered",
                name="write",
                arguments={
                    "file_path": str(index_file),
                    "content": "<html></html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_file}",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert durable_nudges
            assert transient_nudges == []
            latest_nudge = durable_nudges[-1]
            assert "Next step: create `01-introduction.html`." in latest_nudge
            assert "Write a compact but real initial version of that file now" not in latest_nudge
      
        3137
        
        3138
        
        3139
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite follow-up should name a concrete file, not the aggregate todo.

            The guide's ``index.html`` links to two chapter files that do not exist
            yet, and the next pending todo is the aggregate "Create chapter files"
            item.  After a successful ``TodoWrite`` batch the queued steering message
            must point at ``01-introduction.html`` and must not fall back to the
            generic "Continue with the next pending item" wording.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: fails the test if the runner asks for confidence scoring.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: fails the test if the runner asks for verification.
                raise AssertionError("Verification should not run in this scenario")

            def _todo(content: str, active_form: str, status: str) -> dict[str, str]:
                # Build one todo entry with the three keys used throughout this test.
                return {"content": content, "active_form": active_form, "status": status}

            def _todo_entries() -> list[dict[str, str]]:
                # Return a fresh list each call so the DoD sync and the tool call
                # never share the same objects.
                return [
                    _todo(
                        "Develop the main index.html file with proper structure",
                        "Developing the main index.html file with proper structure",
                        "completed",
                    ),
                    _todo(
                        "Create chapter files with content and structure",
                        "Creating chapter files with content and structure",
                        "pending",
                    ),
                ]

            # On-disk layout: an index that links to chapters which are not written yet.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_path.write_text("\n".join(index_lines) + "\n")

            # The plan only names directories and the index, not individual chapters.
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.append(str(index_path))
            sync_todos_to_definition_of_done(dod, _todo_entries())

            # The same list object is used for the call arguments and the outcome metadata.
            todos = _todo_entries()
            tool_call = ToolCall(
                id="todo-aggregate",
                name="TodoWrite",
                arguments={"todos": todos},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todos},
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            latest = queued_messages[-1]
            assert "Todo tracking is updated." in latest
            assert "Next step: create `01-introduction.html`." in latest
            generic_nudge = (
                "Continue with the next pending item: `Create chapter files with content and structure`."
            )
            assert generic_nudge not in latest
      
        3269
        
        3270
        
        3271
        @pytest.mark.asyncio
        async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
            temp_dir: Path,
        ) -> None:
            """A duplicate-read nudge should push the missing planned file before review work.

            The plan lists ``06-ssl-configuration.html``, which is absent on disk,
            while the first pending todo is a review/consistency item.  The nudge
            queued for a repeated ``read`` must mention the missing chapter, tell the
            model not to switch into review mode, and must not steer towards the
            review todo.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: fails the test if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: fails the test if verification is requested.
                raise AssertionError("Verification should not run for this scenario")

            # On-disk layout: index plus chapter one exist; the planned final chapter does not.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_one.write_text("<h1>One</h1>\n")
            index_path.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")

            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapters / '06-ssl-configuration.html'}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            # The review item is listed first, ahead of the item that creates the missing file.
            review_todo = {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            }
            final_chapter_todo = {
                "content": "Create the final chapter (06-ssl-configuration.html)",
                "active_form": "Working on: Create the final chapter (06-ssl-configuration.html)",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(dod, [review_todo, final_chapter_todo])

            # Sanity-check the prioritisation helper directly before exercising the nudge.
            should_prioritize = tool_batches_should_prioritize_missing_artifact(
                dod=dod,
                next_pending=dod.pending_items[0],
                missing_artifact=(chapters / "06-ssl-configuration.html", False),
                project_root=temp_dir,
            )
            assert should_prioritize

            tool_call = ToolCall(
                id="dup-read",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            runner._queue_duplicate_observation_nudge(tool_call, dod=dod)  # type: ignore[attr-defined]

            assert persistent_messages
            latest = persistent_messages[-1]
            assert "06-ssl-configuration.html" in latest
            assert "Do not switch into review or consistency-check mode" in latest
            review_nudge = (
                "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
            )
            assert review_nudge not in latest
      
        3365
        
        3366
        
        3367
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """Once every planned file exists, the runner should steer towards verification.

            The index and both chapters listed in the plan are already on disk when
            the final ``write`` outcome is processed.  The persistent steering
            messages must state that all planned artifacts exist, mention the
            remaining review todo, and ask for a final response so Loader can verify.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: fails the test if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: fails the test if verification is requested.
                raise AssertionError("Verification should not run for this scenario")

            # On-disk layout: every file named in the plan already exists.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            index_path.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")
            chapter_one.write_text("<h1>One</h1>\n")
            chapter_two.write_text("<h1>Two</h1>\n")

            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            creation_todo = {
                "content": "Create the guide files",
                "active_form": "Working on: Create the guide files",
                "status": "completed",
            }
            review_todo = {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(dod, [creation_todo, review_todo])

            tool_call = ToolCall(
                id="write-final",
                name="write",
                arguments={
                    "file_path": str(chapter_two),
                    "content": "<h1>Two</h1>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_two}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Each fragment must appear in at least one persistent steering message,
            # checked in the same order as the original three assertions.
            expected_fragments = (
                "All explicitly planned artifacts now exist on disk.",
                "Ensure all files are properly linked and formatted consistently",
                "Finish with a final response once no specific mismatch remains so Loader can verify.",
            )
            for fragment in expected_fragments:
                assert any(fragment in queued for queued in persistent_messages)
      
        3488
        
        3489
        
        3490
        @pytest.mark.asyncio
        async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """After a write, the handoff message should name the next missing planned file.

            Only ``index.html`` exists; the plan also lists two chapter files that
            have not been written.  Processing a successful ``write`` of the index
            must queue a persistent (not ephemeral) message that names
            ``01-getting-started.html`` as the next step, omits the "compact but
            real initial version" and ``TodoWrite`` refresh wording, and tells the
            model not to reread reference material.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard hook: fails the test if confidence scoring is requested.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard hook: fails the test if verification is requested.
                raise AssertionError("Verification should not run in this scenario")

            def _pending(content: str) -> dict[str, str]:
                # Every todo in this scenario is pending and shares the "Working on: " prefix.
                return {
                    "content": content,
                    "active_form": f"Working on: {content}",
                    "status": "pending",
                }

            # On-disk layout: the index exists, the chapter files are only planned.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"

            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            sync_todos_to_definition_of_done(
                dod,
                [
                    _pending("Create the main index.html file with proper structure"),
                    _pending("Create each chapter file in sequence, following the established pattern"),
                    _pending("Ensure all files are properly linked and formatted consistently"),
                ],
            )

            tool_call = ToolCall(
                id="write-index",
                name="write",
                arguments={"file_path": str(index_path), "content": "<html></html>\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages
            assert ephemeral_messages == []
            latest = persistent_messages[-1]
            assert "Next step: create `01-getting-started.html`." in latest
            assert "Write a compact but real initial version of that file now" not in latest
            assert "refresh `TodoWrite`" not in latest
            assert "Do not reread reference material or spend the next turn on bookkeeping." in latest
      
        3599
        
        3600
        
        3601
        @pytest.mark.asyncio
      
        3602
        async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
      
        3603
            temp_dir: Path,
      
        3604
        ) -> None:
      
        3605
            async def assess_confidence(
      
        3606
                tool_name: str,
      
        3607
                tool_args: dict,
      
        3608
                context: str,
      
        3609
            ) -> ConfidenceAssessment:
      
        3610
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        3611
        
        3612
            async def verify_action(
      
        3613
                tool_name: str,
      
        3614
                tool_args: dict,
      
        3615
                result: str,
      
        3616
                expected: str = "",
      
        3617
            ) -> ActionVerification:
      
        3618
                raise AssertionError("Verification should not run in this scenario")
      
        3619
        
        3620
            guide_root = temp_dir / "guides" / "nginx"
      
        3621
            chapters = guide_root / "chapters"
      
        3622
            guide_root.mkdir(parents=True)
      
        3623
            chapters.mkdir()
      
        3624
            index_path = guide_root / "index.html"
      
        3625
            index_path.write_text("<html></html>\n")
      
        3626
        
        3627
            chapter_paths = [
      
        3628
                chapters / "01-getting-started.html",
      
        3629
                chapters / "02-installation.html",
      
        3630
                chapters / "03-first-website.html",
      
        3631
                chapters / "04-configuration-basics.html",
      
        3632
                chapters / "05-advanced-configurations.html",
      
        3633
                chapters / "06-performance-tuning.html",
      
        3634
                chapters / "07-security-best-practices.html",
      
        3635
            ]
      
        3636
            for chapter in chapter_paths[:4]:
      
        3637
                chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
      
        3638
            chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")
      
        3639
        
        3640
            implementation_plan = temp_dir / "implementation.md"
      
        3641
            implementation_plan.write_text(
      
        3642
                "\n".join(
      
        3643
                    [
      
        3644
                        "# Implementation Plan",
      
        3645
                        "",
      
        3646
                        "## File Changes",
      
        3647
                        f"- `{guide_root}/`",
      
        3648
                        f"- `{chapters}/`",
      
        3649
                        f"- `{index_path}`",
      
        3650
                        *[f"- `{path}`" for path in chapter_paths],
      
        3651
                        "",
      
        3652
                    ]
      
        3653
                )
      
        3654
            )
      
        3655
        
        3656
            context = build_context(
      
        3657
                temp_dir=temp_dir,
      
        3658
                messages=[],
      
        3659
                safeguards=FakeSafeguards(),
      
        3660
                assess_confidence=assess_confidence,
      
        3661
                verify_action=verify_action,
      
        3662
                auto_recover=False,
      
        3663
            )
      
        3664
            persistent_messages: list[str] = []
      
        3665
            ephemeral_messages: list[str] = []
      
        3666
            context.queue_steering_message_callback = persistent_messages.append
      
        3667
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        3668
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        3669
            dod = create_definition_of_done("Create a thorough nginx guide.")
      
        3670
            dod.implementation_plan = str(implementation_plan)
      
        3671
            sync_todos_to_definition_of_done(
      
        3672
                dod,
      
        3673
                [
      
        3674
                    {
      
        3675
                        "content": "Create the nginx guide artifacts",
      
        3676
                        "active_form": "Creating nginx guide artifacts",
      
        3677
                        "status": "pending",
      
        3678
                    },
      
        3679
                    {
      
        3680
                        "content": "Verify all guide files are linked and complete",
      
        3681
                        "active_form": "Verifying guide linkage and completeness",
      
        3682
                        "status": "pending",
      
        3683
                    },
      
        3684
                ],
      
        3685
            )
      
        3686
            tool_call = ToolCall(
      
        3687
                id="write-chapter-05",
      
        3688
                name="write",
      
        3689
                arguments={
      
        3690
                    "file_path": str(chapter_paths[4]),
      
        3691
                    "content": "<h1>Advanced configurations</h1>\n",
      
        3692
                },
      
        3693
            )
      
        3694
            executor = FakeExecutor(
      
        3695
                [
      
        3696
                    tool_outcome(
      
        3697
                        tool_call=tool_call,
      
        3698
                        output=f"Successfully wrote {chapter_paths[4]}",
      
        3699
                        is_error=False,
      
        3700
                    )
      
        3701
                ]
      
        3702
            )
      
        3703
        
        3704
            summary = TurnSummary(final_response="")
      
        3705
            await runner.execute_batch(
      
        3706
                tool_calls=[tool_call],
      
        3707
                tool_source="assistant",
      
        3708
                pending_tool_calls_seen=set(),
      
        3709
                emit=_noop_emit,
      
        3710
                summary=summary,
      
        3711
                dod=dod,
      
        3712
                executor=executor,  # type: ignore[arg-type]
      
        3713
                on_confirmation=None,
      
        3714
                on_user_question=None,
      
        3715
                emit_confirmation=None,
      
        3716
                consecutive_errors=0,
      
        3717
            )
      
        3718
        
        3719
            assert any(
      
        3720
                "Next step: create `06-performance-tuning.html`." in message
      
        3721
                for message in ephemeral_messages
      
        3722
            )
      
        3723
            assert not any(
      
        3724
                "All explicitly planned artifacts now exist on disk." in message
      
        3725
                for message in ephemeral_messages
      
        3726
            )
      
        3727
        
        3728
        
        3729
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
            temp_dir: Path,
        ) -> None:
            """A write after substantial progress yields the compact missing-artifact nudge.

            The index and the first four planned chapters already exist on disk and are
            recorded as touched files; only the fifth planned chapter is missing. After
            a successful ``write`` to chapter four, the last ephemeral steering message
            must name the missing chapter, tell the model not to reread references or
            do bookkeeping, and must not ask for a ``TodoWrite`` refresh.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # Planned layout: an index plus five chapters under guides/nginx.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_paths = [
                chapters / file_name
                for file_name in (
                    "01-introduction.html",
                    "02-installation.html",
                    "03-configuration.html",
                    "04-basic-usage.html",
                    "05-advanced-features.html",
                )
            ]

            # Everything except the fifth chapter is already on disk.
            already_written = (index_path, *chapter_paths[:4])
            for existing in already_written:
                existing.write_text("<html></html>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
            ]
            plan_lines.extend(f"- `{path}`" for path in chapter_paths)
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = (
                ephemeral_messages.append
            )
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.extend(str(path) for path in already_written)
            dod.completed_items.extend(
                [
                    "Create the nginx directory structure",
                    "Create the main index.html file with proper structure",
                ]
            )
            pending_todos = [
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Creating each chapter file with appropriate content",
                    "status": "pending",
                }
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            # The batch under test rewrites chapter four, which already exists.
            rewritten_chapter = chapter_paths[3]
            tool_call = ToolCall(
                id="write-chapter-04",
                name="write",
                arguments={
                    "file_path": str(rewritten_chapter),
                    "content": "<html>updated</html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {rewritten_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert ephemeral_messages
            nudge = ephemeral_messages[-1]
            assert "Next step: create `05-advanced-features.html`." in nudge
            assert "Do not reread reference material or spend the next turn on bookkeeping." in nudge
            assert "refresh `TodoWrite`" not in nudge
      
        3849
        
        3850
        
        3851
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite-only batch with a planned file still missing queues a resume step.

            The index and chapter one exist on disk while chapter two, also listed in
            the implementation plan, does not. After a ``TodoWrite`` call that leaves
            the todo list unchanged, the last persistent steering message must point at
            creating ``02-installation.html`` with a concrete ``write`` call, and no
            ephemeral message may be queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            def todo_snapshot() -> list[dict[str, str]]:
                # Fresh dicts on every call: the plan sync, the tool arguments and the
                # outcome metadata each get their own copy, as in independent literals.
                return [
                    {
                        "content": "Create 01-getting-started.html",
                        "active_form": "Creating 01-getting-started.html",
                        "status": "completed",
                    },
                    {
                        "content": "Create 02-installation.html",
                        "active_form": "Creating 02-installation.html",
                        "status": "pending",
                    },
                ]

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            # Only chapter one is written; chapter two stays missing on purpose.
            chapter_one.write_text("<h1>One</h1>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = (
                ephemeral_messages.append
            )
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            sync_todos_to_definition_of_done(dod, todo_snapshot())
            dod.touched_files.extend([str(index_path), str(chapter_one)])

            tool_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": todo_snapshot()},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todo_snapshot()},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages
            resume_step = persistent_messages[-1]
            assert "Todo tracking is updated. Next step: create `02-installation.html`." in resume_step
            assert "Prefer one `write(file_path=..., content=...)` call" in resume_step
            assert "Make your next response the concrete mutation tool call itself." in resume_step
            assert ephemeral_messages == []
      
        3992
        
        3993
        
        3994
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite-only batch with every planned file present queues a verify handoff.

            The index and both planned chapters exist on disk. After a ``TodoWrite``
            call whose todos still include an "examine the Fortran guide" item and a
            verification item, the last queued steering message must state that all
            planned artifacts exist, mention the verification todo, ask for a final
            response, contain "reopen reference materials", and omit the Fortran item.
            The context's ``workflow_mode`` must be ``"execute"`` afterwards.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            def todo_snapshot() -> list[dict[str, str]]:
                # Fresh dicts on every call: the plan sync, the tool arguments and the
                # outcome metadata each get their own copy, as in independent literals.
                return [
                    {
                        "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                        "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                        "status": "pending",
                    },
                    {
                        "content": "Verify all guide files are linked and complete",
                        "active_form": "Working on: Verify all guide files are linked and complete",
                        "status": "pending",
                    },
                ]

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            # Every artifact named in the plan is present before the batch runs.
            index_path.write_text("<html></html>\n")
            chapter_one.write_text("<h1>One</h1>\n")
            chapter_two.write_text("<h1>Two</h1>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Only the persistent steering channel is captured in this scenario.
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(
                dod,
                todo_snapshot(),
                project_root=temp_dir,
            )

            tool_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": todo_snapshot()},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todo_snapshot()},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            handoff = queued_messages[-1]
            assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in handoff
            assert "Verify all guide files are linked and complete" in handoff
            assert (
                "Finish with a final response once no specific mismatch remains so Loader can verify."
                in handoff
            )
            assert "reopen reference materials" in handoff
            assert "Fortran guide structure" not in handoff
            assert context.workflow_mode == "execute"
      
        4140
        
        4141
        
        4142
        @pytest.mark.asyncio
      
        4143
        async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
      
        4144
            temp_dir: Path,
      
        4145
        ) -> None:
      
        4146
            async def assess_confidence(
      
        4147
                tool_name: str,
      
        4148
                tool_args: dict,
      
        4149
                context: str,
      
        4150
            ) -> ConfidenceAssessment:
      
        4151
                raise AssertionError("Confidence scoring should not run for this scenario")
      
        4152
        
        4153
            async def verify_action(
      
        4154
                tool_name: str,
      
        4155
                tool_args: dict,
      
        4156
                result: str,
      
        4157
                expected: str = "",
      
        4158
            ) -> ActionVerification:
      
        4159
                raise AssertionError("Verification should not run for this scenario")
      
        4160
        
        4161
            guide_root = temp_dir / "guides" / "nginx"
      
        4162
            chapters = guide_root / "chapters"
      
        4163
            guide_root.mkdir(parents=True)
      
        4164
            chapters.mkdir()
      
        4165
            index_path = guide_root / "index.html"
      
        4166
            chapter_one = chapters / "01-introduction.html"
      
        4167
            chapter_two = chapters / "02-installation.html"
      
        4168
            index_path.write_text(
      
        4169
                "\n".join(
      
        4170
                    [
      
        4171
                        '<a href="chapters/01-introduction.html">Intro</a>',
      
        4172
                        '<a href="chapters/02-installation.html">Install</a>',
      
        4173
                        '<a href="../index.html">Back</a>',
      
        4174
                        "",
      
        4175
                    ]
      
        4176
                )
      
        4177
            )
      
        4178
            chapter_one.write_text("<html></html>\n")
      
        4179
            chapter_two.write_text("<html></html>\n")
      
        4180
        
        4181
            implementation_plan = temp_dir / "implementation.md"
      
        4182
            implementation_plan.write_text(
      
        4183
                "\n".join(
      
        4184
                    [
      
        4185
                        "# Implementation Plan",
      
        4186
                        "",
      
        4187
                        "## File Changes",
      
        4188
                        f"- `{guide_root}/`",
      
        4189
                        f"- `{chapters}/`",
      
        4190
                        f"- `{index_path}`",
      
        4191
                        f"- `{chapter_one}`",
      
        4192
                        f"- `{chapter_two}`",
      
        4193
                        "",
      
        4194
                    ]
      
        4195
                )
      
        4196
            )
      
        4197
        
        4198
            context = build_context(
      
        4199
                temp_dir=temp_dir,
      
        4200
                messages=[],
      
        4201
                safeguards=FakeSafeguards(),
      
        4202
                assess_confidence=assess_confidence,
      
        4203
                verify_action=verify_action,
      
        4204
                auto_recover=False,
      
        4205
            )
      
        4206
            queued_messages: list[str] = []
      
        4207
            context.queue_steering_message_callback = queued_messages.append
      
        4208
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        4209
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        4210
            dod.implementation_plan = str(implementation_plan)
      
        4211
            dod.verification_commands = [f"ls -la {guide_root}"]
      
        4212
            sync_todos_to_definition_of_done(
      
        4213
                dod,
      
        4214
                [
      
        4215
                    {
      
        4216
                        "content": "Create chapter files following the established pattern",
      
        4217
                        "active_form": "Creating chapter files",
      
        4218
                        "status": "in_progress",
      
        4219
                    }
      
        4220
                ],
      
        4221
                project_root=temp_dir,
      
        4222
            )
      
        4223
        
        4224
            tool_call = ToolCall(
      
        4225
                id="todo-post-build",
      
        4226
                name="TodoWrite",
      
        4227
                arguments={
      
        4228
                    "todos": [
      
        4229
                        {
      
        4230
                            "content": "Create chapter files following the established pattern",
      
        4231
                            "active_form": "Creating chapter files",
      
        4232
                            "status": "in_progress",
      
        4233
                        }
      
        4234
                    ]
      
        4235
                },
      
        4236
            )
      
        4237
            executor = FakeExecutor(
      
        4238
                [
      
        4239
                    tool_outcome(
      
        4240
                        tool_call=tool_call,
      
        4241
                        output="Todos updated",
      
        4242
                        is_error=False,
      
        4243
                        metadata={
      
        4244
                            "new_todos": [
      
        4245
                                {
      
        4246
                                    "content": "Create chapter files following the established pattern",
      
        4247
                                    "active_form": "Creating chapter files",
      
        4248
                                    "status": "in_progress",
      
        4249
                                }
      
        4250
                            ]
      
        4251
                        },
      
        4252
                    )
      
        4253
                ]
      
        4254
            )
      
        4255
        
        4256
            summary = TurnSummary(final_response="")
      
        4257
            await runner.execute_batch(
      
        4258
                tool_calls=[tool_call],
      
        4259
                tool_source="assistant",
      
        4260
                pending_tool_calls_seen=set(),
      
        4261
                emit=_noop_emit,
      
        4262
                summary=summary,
      
        4263
                dod=dod,
      
        4264
                executor=executor,  # type: ignore[arg-type]
      
        4265
                on_confirmation=None,
      
        4266
                on_user_question=None,
      
        4267
                emit_confirmation=None,
      
        4268
                consecutive_errors=0,
      
        4269
            )
      
        4270
        
        4271
            assert queued_messages
      
        4272
            message = queued_messages[-1]
      
        4273
            assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in message
      
        4274
            assert "Finish with a final response now so Loader can run verification automatically." in message
      
        4275
            assert "Repair or verify the current files instead of expanding the artifact set." not in message
      
        4276
            assert context.workflow_mode == "verify"
      
        4277
        
        4278
        
        4279
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_during_quality_repair_requires_mutation(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite batch must not close out an active quality repair.

            The conversation carries a "Repair focus" message for one chapter and the
            context starts in ``verify`` mode. After a TodoWrite that marks the only
            todo completed, the test expects a steering nudge that points back at the
            chapter edit, ``execute`` mode, no halt, no emitted response, and
            unchanged pending/completed items on the definition of done.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            def todo_entry(status: str) -> dict[str, str]:
                # Build a fresh dict on every call so no two payloads share state.
                return {
                    "content": "Expand generated chapters to satisfy quality verification",
                    "active_form": "Expanding generated chapters",
                    "status": status,
                }

            # On-disk guide: an index plus a single chapter.
            chapters = temp_dir / "guides" / "nginx" / "chapters"
            guide_root = chapters.parent
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            for html_file in (index_path, chapter_one):
                html_file.write_text("<html></html>\n")

            # Implementation plan listing every artifact created above.
            plan_path = temp_dir / "implementation.md"
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(
                f"- `{entry}`"
                for entry in (f"{guide_root}/", f"{chapters}/", index_path, chapter_one)
            )
            plan_lines.append("")
            plan_path.write_text("\n".join(plan_lines))

            repair_focus = (
                "Repair focus:\n"
                f"- Improve `{chapter_one}`: thin content (409 text chars, expected at least 1758).\n"
                f"- Improve `{chapter_one}`: insufficient structured content (6 blocks, expected at least 18).\n"
                f"- Immediate next step: edit `{chapter_one}`.\n"
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.USER, content=repair_focus)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.set_workflow_mode("verify")

            queued_messages: list[str] = []
            emitted_responses: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(
                dod,
                [todo_entry("in_progress")],
                project_root=temp_dir,
            )
            # Snapshot the tracked items so the post-batch comparison is explicit.
            pending_snapshot = list(dod.pending_items)
            completed_snapshot = list(dod.completed_items)

            todo_call = ToolCall(
                id="todo-quality",
                name="TodoWrite",
                arguments={"todos": [todo_entry("completed")]},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [todo_entry("completed")]},
            )
            executor = FakeExecutor([scripted_outcome])

            async def emit(event: AgentEvent) -> None:
                # Only "response" events matter here; ignore everything else.
                if event.type != "response":
                    return
                emitted_responses.append(str(event.content))

            summary = TurnSummary(final_response="")
            result = await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            nudge = queued_messages[-1]
            required_fragments = (
                "verification still has an active HTML content-quality repair",
                "TodoWrite cannot satisfy that verifier",
                f"Immediate next step: edit `{chapter_one.resolve(strict=False)}`",
                "thin content",
            )
            for fragment in required_fragments:
                assert fragment in nudge
            assert "Finish with a final response now" not in nudge

            assert context.workflow_mode == "execute"
            assert result.halted is False
            assert summary.final_response == ""
            assert not emitted_responses
            assert dod.pending_items == pending_snapshot
            assert dod.completed_items == completed_snapshot
      
        4426
        
        4427
        
        4428
        def test_todowrite_quality_repair_nudge_uses_exact_anchor_after_stale_context(
            temp_dir: Path,
        ) -> None:
            """Check the resume nudge text after a failed edit on the repair target.

            The history holds a "Repair focus" message for one chapter followed by a
            tool observation reporting ``old_string not found in file``. Calling
            ``_queue_todowrite_resume_nudge`` directly must queue a message that names
            the resolved chapter path, the ``edit`` call shape, and the file's current
            closing tail as the anchor.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence should not run for direct nudge test")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for direct nudge test")

            chapters = temp_dir / "guides" / "nginx" / "chapters"
            chapters.mkdir(parents=True)
            chapter = chapters / "05-load-balancing.html"
            chapter.write_text("<html><body><h1>Load Balancing</h1></body></html>\n")

            repair_focus = (
                "Repair focus:\n"
                f"- Improve `{chapter}`: thin content "
                "(846 text chars, expected at least 1758).\n"
                f"- Immediate next step: edit `{chapter}`.\n"
            )
            failed_edit_observation = (
                "Observation [edit]: Error: Failed to complete the operation "
                f"after 2 attempts for {chapter}. old_string not found in file."
            )
            history = [
                Message(role=Role.USER, content=repair_focus),
                Message(role=Role.TOOL, content=failed_edit_observation),
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")

            runner._queue_todowrite_resume_nudge(dod=dod)

            assert queued_messages
            nudge = queued_messages[-1]
            required_fragments = (
                f"Immediate next step: edit `{chapter.resolve(strict=False)}`",
                "`edit(file_path=..., old_string=..., new_string=...)`",
                "Use this exact current closing-tail anchor as `old_string`",
                "```html\n</body></html>\n```",
                "do not call `read`, `patch`, `write`, or TodoWrite again first",
            )
            for fragment in required_fragments:
                assert fragment in nudge
      
        4490
        
        4491
        
        4492
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff(
            temp_dir: Path,
        ) -> None:
            """A read queued behind a TodoWrite must be skipped once verify handoff starts.

            Every planned artifact already exists on disk. The batch holds a
            TodoWrite followed by a ``read`` of the index; the test expects only the
            TodoWrite to reach the executor, a single tool-result message, ``verify``
            mode, and a steering message asking for a final response.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # On-disk guide: an index linking to two chapter files.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            index_lines = [
                '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
                '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
                "",
            ]
            index_path.write_text("\n".join(index_lines))
            for chapter in (chapter_one, chapter_two):
                chapter.write_text("<html></html>\n")

            # Implementation plan listing every artifact created above.
            plan_path = temp_dir / "implementation.md"
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(
                f"- `{entry}`"
                for entry in (
                    f"{guide_root}/",
                    f"{chapters}/",
                    index_path,
                    chapter_one,
                    chapter_two,
                )
            )
            plan_lines.append("")
            plan_path.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {guide_root}"]

            todo_call = ToolCall(
                id="todo-post-build-preempt",
                name="TodoWrite",
                arguments={"todos": []},
            )
            audit_read = ToolCall(
                id="read-after-todo",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            # An outcome is scripted for the read as well, although the assertions
            # below require that the executor never receives it.
            scripted_outcomes = [
                tool_outcome(
                    tool_call=todo_call,
                    output="Todos updated",
                    is_error=False,
                    metadata={"new_todos": []},
                ),
                tool_outcome(
                    tool_call=audit_read,
                    output=index_path.read_text(),
                    is_error=False,
                ),
            ]
            executor = FakeExecutor(scripted_outcomes)

            summary = TurnSummary(final_response="")
            result = await runner.execute_batch(
                tool_calls=[todo_call, audit_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert result.continue_after_batch is True
            assert result.halted is False
            executed_ids = [call.id for call in executor.calls]
            assert executed_ids == ["todo-post-build-preempt"]
            assert len(summary.tool_result_messages) == 1
            assert context.workflow_mode == "verify"
            assert queued_messages
            assert "Finish with a final response now so Loader can run verification automatically." in queued_messages[-1]
      
        4610
        
        4611
        
        4612
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_complete_directory_plan_does_not_reinfer_first_child(
            temp_dir: Path,
        ) -> None:
            """A directory-level plan entry must not steer back to a previously read child.

            The plan lists the index file and the ``chapters/`` directory, and the
            history contains a read of a same-named chapter in a separate reference
            guide. After an empty TodoWrite the batch is expected to halt with the
            verification hand-off response, and the last tool-result message must
            not mention ``01-introduction.html``, "chapter files", or
            "fortran guide structure".
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Reference guide file that the assistant read earlier in the history.
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text("<h1>Introduction</h1>\n")

            # Generated guide: an index plus three chapters, all already on disk.
            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_lines = [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-basic-configuration.html">Configuration</a>',
                "",
            ]
            index_path.write_text("\n".join(index_lines))
            chapter_names = (
                "01-introduction.html",
                "02-installation.html",
                "03-basic-configuration.html",
            )
            for chapter_name in chapter_names:
                (chapters / chapter_name).write_text("<html></html>\n")

            # The plan names the chapters directory rather than each chapter file.
            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root / 'index.html'}`",
                f"- `{chapters}/`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            reference_read = ToolCall(
                id="read-reference-child",
                name="read",
                arguments={"file_path": str(reference)},
            )
            messages = [
                Message(
                    role=Role.ASSISTANT,
                    content="I examined the reference guide structure.",
                    tool_calls=[reference_read],
                )
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=messages,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {guide_root}"]

            todo_call = ToolCall(
                id="todo-complete-directory-plan",
                name="TodoWrite",
                arguments={"todos": []},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": []},
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            result = await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            expected_response = (
                "Todo tracking is complete; running Loader verification on the generated "
                "files now."
            )
            assert result.halted is True
            assert result.final_response == expected_response
            assert summary.final_response == result.final_response
            assert context.workflow_mode == "verify"

            assert summary.tool_result_messages
            last_content = summary.tool_result_messages[-1].content
            assert "final response should be provided next for Loader verification" in last_content
            assert "01-introduction.html" not in last_content
            lowered_content = last_content.lower()
            assert "chapter files" not in lowered_content
            assert "fortran guide structure" not in lowered_content
      
        4745
        
        4746
        
        4747
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preempts_post_build_observation_batch_for_verify_handoff(
            temp_dir: Path,
        ) -> None:
            """Cut a post-build observation batch short and hand off to verification.

            The guide root, chapters directory, and index named in the implementation
            plan already exist on disk. The assistant submits a ``bash`` listing
            followed by a ``read`` of the index. The assertions require that only the
            ``bash`` call reaches the executor, that the batch asks to continue, that
            the workflow mode becomes ``"verify"``, and that the last queued steering
            message asks for a final response.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_action_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the generated guide: an index plus three chapter files.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_lines = [
                '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
                '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
                "",
            ]
            index_path.write_text("\n".join(index_lines))
            for chapter_name in (
                "01-introduction.html",
                "02-installation.html",
                "03-configuration.html",
            ):
                (chapters / chapter_name).write_text("<html></html>\n")

            # The plan names only the two directories and the index file.
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_action_verification,
                auto_recover=False,
            )
            steering_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            listing_command = f"ls -la {guide_root}"
            definition = create_definition_of_done("Create an equally thorough nginx guide.")
            definition.implementation_plan = str(implementation_plan)
            definition.verification_commands = [listing_command]

            audit_bash = ToolCall(
                id="bash-post-build-audit",
                name="bash",
                arguments={"command": listing_command},
            )
            audit_read = ToolCall(
                id="read-index-after-audit",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            scripted_outcomes = [
                tool_outcome(
                    tool_call=audit_bash,
                    output="total 8\n",
                    is_error=False,
                ),
                tool_outcome(
                    tool_call=audit_read,
                    output=index_path.read_text(),
                    is_error=False,
                ),
            ]
            executor = FakeExecutor(scripted_outcomes)

            turn_summary = TurnSummary(final_response="")
            batch_result = await batch_runner.execute_batch(
                tool_calls=[audit_bash, audit_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert batch_result.continue_after_batch is True
            # Only the first observation call may have been dispatched.
            executed_ids = [call.id for call in executor.calls]
            assert executed_ids == ["bash-post-build-audit"]
            assert context.workflow_mode == "verify"
            assert steering_log
            latest_steering = steering_log[-1]
            assert (
                "Finish with a final response now so Loader can run verification automatically."
                in latest_steering
            )
      
        4862
        
        4863
        
        4864
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preempts_post_build_observation_batch_during_consistency_review(
            temp_dir: Path,
        ) -> None:
            """Stop a two-read observation batch after the first read during review.

            The planned guide artifacts exist on disk and the definition of done has
            been synced with a single pending consistency-review todo. Two ``read``
            calls are submitted. The assertions require that only the first one is
            executed, that the batch asks to continue, and that the last queued
            steering message (ephemeral if any were queued, otherwise regular) says
            all planned artifacts exist and mentions the generated files.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_action_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the generated guide: an index plus three chapter files.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            chapter_three = chapters / "03-basic-configuration.html"
            for html_file in (index_path, chapter_one, chapter_two, chapter_three):
                html_file.write_text("<html></html>\n")

            # The plan names only the two directories and the index file.
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_action_verification,
                auto_recover=False,
            )
            # Capture both steering channels; either may carry the nudge.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create an equally thorough nginx guide.")
            definition.implementation_plan = str(implementation_plan)
            definition.verification_commands = [f"ls -la {guide_root}"]
            review_todo = {
                "content": "Review the generated guide for consistency and completeness",
                "active_form": "Reviewing the generated guide for consistency and completeness",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(
                definition,
                [review_todo],
                project_root=temp_dir,
            )

            audit_read = ToolCall(
                id="read-index-during-review",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            second_read = ToolCall(
                id="read-chapter-after-review",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            scripted_outcomes = [
                tool_outcome(
                    tool_call=audit_read,
                    output=index_path.read_text(),
                    is_error=False,
                ),
                tool_outcome(
                    tool_call=second_read,
                    output=chapter_one.read_text(),
                    is_error=False,
                ),
            ]
            executor = FakeExecutor(scripted_outcomes)

            turn_summary = TurnSummary(final_response="")
            batch_result = await batch_runner.execute_batch(
                tool_calls=[audit_read, second_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert batch_result.continue_after_batch is True
            # Only the first read may have been dispatched.
            executed_ids = [call.id for call in executor.calls]
            assert executed_ids == ["read-index-during-review"]
            # Prefer the ephemeral channel when it received anything.
            queued = ephemeral_log if ephemeral_log else steering_log
            assert queued
            latest_steering = queued[-1]
            assert "All explicitly planned artifacts already exist." in latest_steering
            assert "generated files" in latest_steering
      
        4985
        
        4986
        
        4987
        @pytest.mark.asyncio
        async def test_tool_batch_runner_skips_post_build_user_question_during_consistency_review(
            temp_dir: Path,
        ) -> None:
            """Skip an ``AskUserQuestion`` call once every planned output exists.

            Every path listed in the implementation plan is present on disk and the
            definition of done carries one pending review-style item. The assertions
            require that the question never reaches the executor, that the batch asks
            to continue, that the queued steering message redirects toward
            verification, that the workflow mode becomes ``"verify"``, and that the
            last tool result message records the call as skipped.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_action_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the generated guide: an index plus two chapter files.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            index_lines = [
                '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
                '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
                "",
            ]
            index_path.write_text("\n".join(index_lines))
            for chapter_file in (chapter_one, chapter_two):
                chapter_file.write_text("<html></html>\n")

            # This plan lists every file explicitly, including both chapters.
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_action_verification,
                auto_recover=False,
            )
            steering_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create an equally thorough nginx guide.")
            definition.implementation_plan = str(implementation_plan)
            definition.verification_commands = [f"ls -la {guide_root}"]
            definition.pending_items = ["Ensure all files are properly linked and formatted"]

            question_arguments = {
                "question": "Which specific aspects of the reference guide should I copy?",
                "context": "I already created the output files and want to ensure they match.",
            }
            question_call = ToolCall(
                id="ask-post-build-review",
                name="AskUserQuestion",
                arguments=question_arguments,
            )
            # No scripted outcomes: the executor is not expected to be called.
            executor = FakeExecutor([])

            turn_summary = TurnSummary(final_response="")
            batch_result = await batch_runner.execute_batch(
                tool_calls=[question_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert batch_result.continue_after_batch is True
            assert executor.calls == []
            assert steering_log
            latest_steering = steering_log[-1]
            assert "The remaining work is review/verification of the generated files." in latest_steering
            assert (
                "Do not ask the user for more clarification about the reference pattern now."
                in latest_steering
            )
            assert (
                "Finish with a final response now so Loader can run verification automatically."
                in latest_steering
            )
            assert context.workflow_mode == "verify"
            assert turn_summary.tool_result_messages
            last_tool_message = turn_summary.tool_result_messages[-1].content
            assert "Skipped - stale post-build user question" in last_tool_message
      
        5092
        
        5093
        
        5094
        @pytest.mark.asyncio
        async def test_tool_batch_runner_rewrites_stale_todowrite_summary_from_reconciled_dod(
            temp_dir: Path,
        ) -> None:
            """Replace a stale ``TodoWrite`` summary once planned outputs exist.

            The guide directories and index named in the plan are on disk, yet the
            ``TodoWrite`` call still carries a pending "examine the fortran guide"
            todo. The assertions require the batch to halt with the fixed
            verification hand-off response, and the last tool result message to
            mention the updated todo list and the upcoming final response without
            echoing the stale todo or a "next pending:" hint.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_action_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the generated guide: five chapter files, then the index.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_names = (
                "01-introduction.html",
                "02-installation.html",
                "03-basic-configuration.html",
                "04-advanced-usage.html",
                "05-troubleshooting.html",
            )
            for chapter_name in chapter_names:
                chapter_file = chapters / chapter_name
                chapter_file.write_text("<html></html>\n")
            index_path.write_text("<html></html>\n")

            # The plan names only the two directories and the index file.
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_action_verification,
                auto_recover=False,
            )
            batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            definition = create_definition_of_done("Create an equally thorough nginx guide.")
            definition.implementation_plan = str(implementation_plan)
            definition.verification_commands = [f"ls -la {guide_root}"]

            stale_todo = {
                "content": "First, examine the existing fortran guide structure and content to understand the format",
                "active_form": "Working on: First, examine the existing fortran guide structure and content to understand the format",
                "status": "pending",
            }
            # Separate copies: the call arguments and the outcome metadata must not
            # share one dict object.
            tool_call = ToolCall(
                id="todo-stale-summary",
                name="TodoWrite",
                arguments={"todos": [dict(stale_todo)]},
            )
            scripted_outcomes = [
                tool_outcome(
                    tool_call=tool_call,
                    output="Todos updated",
                    is_error=False,
                    metadata={"new_todos": [dict(stale_todo)]},
                )
            ]
            executor = FakeExecutor(scripted_outcomes)

            turn_summary = TurnSummary(final_response="")
            batch_result = await batch_runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            expected_response = (
                "Todo tracking is complete; running Loader verification on the generated "
                "files now."
            )
            assert batch_result.halted is True
            assert batch_result.final_response == expected_response
            assert turn_summary.final_response == batch_result.final_response
            assert turn_summary.tool_result_messages
            message = turn_summary.tool_result_messages[-1].content
            assert "updated todo list" in message
            assert "final response should be provided next for Loader verification" in message
            # The stale todo text must not leak into the rewritten summary.
            assert "next pending:" not in message
            assert "fortran guide structure" not in message.lower()
      
        5215
        
        5216
        
        5217
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
            temp_dir: Path,
        ) -> None:
            """Check that TodoWrite steering ignores an unplanned todo once planned outputs exist.

            The implementation plan lists the guide root, the chapters directory,
            ``index.html`` and two chapter files, and all of them are created on
            disk before the batch runs. The submitted todo list also carries a
            pending ``08-troubleshooting.html`` entry that the plan never mentions.
            The queued steering message must ask for a final response, must not
            mention the extra chapter, and the context must end in ``verify`` mode.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook means the runner took a path this test forbids.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Same guard for the verification hook.
                raise AssertionError("Verification should not run for this scenario")

            # --- On-disk fixture: every artifact named in the plan already exists. ---
            site_root = temp_dir / "guides" / "nginx"
            chapters_dir = site_root / "chapters"
            site_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = site_root / "index.html"
            chapter_one = chapters_dir / "01-introduction.html"
            chapter_two = chapters_dir / "02-installation.html"

            index_links = (
                '<a href="chapters/01-introduction.html">Intro</a>',
                '<a href="chapters/02-installation.html">Install</a>',
                '<a href="../index.html">Back</a>',
            )
            # Each line is newline-terminated, including the last one.
            index_file.write_text("".join(f"{line}\n" for line in index_links))
            for chapter_file in (chapter_one, chapter_two):
                chapter_file.write_text("<html></html>\n")

            plan_file = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{site_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
            )
            plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

            # --- Runtime wiring. ---
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(runtime_context, dod_store)

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.verification_commands = [f"ls -la {site_root}"]

            # (content, active form, status); the final row is outside the plan.
            todo_rows = (
                (
                    "Create index.html for nginx guide",
                    "Creating index.html",
                    "in_progress",
                ),
                (
                    "Create chapter 01-introduction.html",
                    "Creating chapter 01-introduction.html",
                    "completed",
                ),
                (
                    "Create chapter 02-installation.html",
                    "Creating chapter 02-installation.html",
                    "completed",
                ),
                (
                    "Create chapter 08-troubleshooting.html",
                    "Creating chapter 08-troubleshooting.html",
                    "pending",
                ),
            )

            # The call arguments use the camelCase key, the outcome metadata the
            # snake_case one.
            tool_call = ToolCall(
                id="todo-post-build-expansion",
                name="TodoWrite",
                arguments={
                    "todos": [
                        {"content": content, "activeForm": active_form, "status": status}
                        for content, active_form, status in todo_rows
                    ]
                },
            )
            recorded_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {"content": content, "active_form": active_form, "status": status}
                        for content, active_form, status in todo_rows
                    ]
                },
            )
            executor = FakeExecutor([recorded_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_messages
            final_message = steering_messages[-1]
            for required in (
                "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
                "Finish with a final response now so Loader can run verification automatically.",
            ):
                assert required in final_message
            for forbidden in (
                "Repair or verify the current files instead of expanding the artifact set.",
                "08-troubleshooting.html",
            ):
                assert forbidden not in final_message
            assert runtime_context.workflow_mode == "verify"
      
        5372
        
        5373
        
        5374
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
            temp_dir: Path,
        ) -> None:
            """Check that TodoWrite steering names the next file when work is still pending.

            The guide root, the chapters directory and ``index.html`` exist, and
            the index links to ``chapters/01-introduction.html``, which has not
            been written. The todo list still has "Write the introduction chapter"
            pending. The queued steering message must name
            ``01-introduction.html`` as the next step and ask for a concrete
            ``write`` tool call.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook means the runner took a path this test forbids.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Same guard for the verification hook.
                raise AssertionError("Verification should not run in this scenario")

            def make_todos() -> list[dict[str, str]]:
                """Build a fresh todo list so each consumer gets its own objects."""
                rows = (
                    (
                        "Examine the existing Fortran guide structure",
                        "Examining the existing Fortran guide structure",
                        "completed",
                    ),
                    (
                        "Create the nginx directory structure",
                        "Creating the nginx directory structure",
                        "completed",
                    ),
                    (
                        "Write the introduction chapter",
                        "Writing the introduction chapter",
                        "pending",
                    ),
                )
                return [
                    {"content": content, "active_form": active_form, "status": status}
                    for content, active_form, status in rows
                ]

            # --- On-disk fixture: output roots and index exist, chapter does not. ---
            site_root = temp_dir / "guides" / "nginx"
            chapters_dir = site_root / "chapters"
            site_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = site_root / "index.html"
            index_markup = (
                "<!DOCTYPE html>",
                "<html>",
                "<body>",
                '<a href="chapters/01-introduction.html">Introduction</a>',
                "</body>",
                "</html>",
            )
            # Each line is newline-terminated, including the last one.
            index_file.write_text("".join(f"{line}\n" for line in index_markup))

            plan_file = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{site_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
            )
            plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

            # --- Runtime wiring. ---
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(runtime_context, dod_store)

            # The definition of done already tracks the same todos before the call.
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.touched_files.append(str(index_file))
            sync_todos_to_definition_of_done(
                dod,
                make_todos(),
                project_root=temp_dir,
            )

            tool_call = ToolCall(
                id="todo-next-mutation",
                name="TodoWrite",
                arguments={"todos": make_todos()},
            )
            recorded_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": make_todos()},
            )
            executor = FakeExecutor([recorded_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_messages
            final_message = steering_messages[-1]
            for required in (
                "Todo tracking is updated. Next step: create `01-introduction.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
                "Make your next response the concrete mutation tool call itself.",
            ):
                assert required in final_message
      
        5535
        
        5536
        
        5537
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
            temp_dir: Path,
        ) -> None:
            """Check that steering targets the planned ``index.html`` before any chapter file.

            Only the (empty) chapters directory exists. The plan lists that
            directory and ``index.html``, and the todos have both "create
            index.html" and "create the first chapter" pending. The queued
            steering message must name ``index.html`` with its resolved path and
            must not mention ``01-introduction.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook means the runner took a path this test forbids.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Same guard for the verification hook.
                raise AssertionError("Verification should not run in this scenario")

            def make_todos() -> list[dict[str, str]]:
                """Build a fresh todo list with two completed and two pending entries."""
                rows = (
                    (
                        "Examine the existing Fortran guide structure to understand the format and depth",
                        "Examining the existing Fortran guide structure",
                        "completed",
                    ),
                    (
                        "Create the new nginx guide directory structure",
                        "Creating the new nginx guide directory structure",
                        "completed",
                    ),
                    (
                        "Create a new index.html for the nginx guide",
                        "Creating a new index.html for the nginx guide",
                        "pending",
                    ),
                    (
                        "Create the first chapter for the nginx guide",
                        "Creating the first chapter for the nginx guide",
                        "pending",
                    ),
                )
                return [
                    {"content": content, "active_form": active_form, "status": status}
                    for content, active_form, status in rows
                ]

            # --- On-disk fixture: directories only, index.html is intentionally absent. ---
            site_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = site_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = site_root / "index.html"

            plan_file = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
            )
            # Each line is newline-terminated, including the last one.
            plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            sync_todos_to_definition_of_done(
                dod,
                make_todos(),
                project_root=temp_dir,
            )

            # --- Runtime wiring. ---
            steering_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_messages.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(runtime_context, dod_store)

            # One list object is shared by the call arguments and the outcome metadata.
            submitted_todos = make_todos()
            tool_call = ToolCall(
                id="todo-index-before-chapter",
                name="TodoWrite",
                arguments={"todos": submitted_todos},
            )
            recorded_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": submitted_todos},
            )
            executor = FakeExecutor([recorded_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_messages
            final_message = steering_messages[-1]
            assert "Todo tracking is updated. Next step: create `index.html`." in final_message
            resolved_index = index_file.resolve(strict=False)
            write_hint = f"Prefer one `write(file_path=..., content=...)` call for `{resolved_index}`"
            assert write_hint in final_message
            assert "01-introduction.html" not in final_message
      
        5673
        
        5674
        
        5675
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
            temp_dir: Path,
        ) -> None:
            """Check that steering names a missing file that the existing index links to.

            ``index.html`` exists and links to ``chapters/introduction.html`` and
            ``chapters/installation.html``, neither of which is on disk. The
            definition of done has "Write the introduction chapter" pending. The
            queued steering message must name ``introduction.html`` as the next
            step and ask for a concrete ``write`` tool call.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook means the runner took a path this test forbids.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Same guard for the verification hook.
                raise AssertionError("Verification should not run in this scenario")

            # --- On-disk fixture: index exists, the chapters it links to do not. ---
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_markup = (
                "<html>",
                '<a href="chapters/introduction.html">Introduction</a>',
                '<a href="chapters/installation.html">Installation</a>',
                "</html>",
            )
            # Each line is newline-terminated, including the last one.
            index_path.write_text("".join(f"{line}\n" for line in index_markup))

            implementation_plan = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
            )
            implementation_plan.write_text("".join(f"{line}\n" for line in plan_lines))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items = [
                "Write the introduction chapter",
                "Complete the requested work",
            ]
            dod.touched_files.append(str(index_path))

            # --- Runtime wiring. ---
            queued_messages: list[str] = []
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.queue_steering_message_callback = queued_messages.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, dod_store)

            # A single pending todo; the call arguments use the camelCase key,
            # the outcome metadata the snake_case one.
            todo_content = "Write the introduction chapter"
            todo_active_form = "Writing the introduction chapter"
            todo_status = "pending"
            tool_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={
                    "todos": [
                        {
                            "content": todo_content,
                            "activeForm": todo_active_form,
                            "status": todo_status,
                        }
                    ]
                },
            )
            recorded_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": todo_content,
                            "active_form": todo_active_form,
                            "status": todo_status,
                        }
                    ]
                },
            )
            executor = FakeExecutor([recorded_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            for required in (
                "Todo tracking is updated. Next step: create `introduction.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
                "Make your next response the concrete mutation tool call itself.",
            ):
                assert required in message
      
        5798
        
        5799
        
        5800
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """Check that the TodoWrite follow-up names the chapter file still to be written.

            The guide index and its first chapter already exist on disk, and the
            index links to a second chapter that has not been created. After a
            successful ``TodoWrite`` batch the last queued steering message is
            expected to point at ``02-installation.html`` and ask for a direct
            write call.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call into confidence scoring is a test failure here.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call into action verification is a test failure here.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: an index linking two chapters, only the first of which exists.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_path.write_text("\n".join(index_lines) + "\n")
            chapter_one.write_text("<html></html>\n")

            # The plan declares the guide directories and the index, but no chapter files.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            todo_text = "Creating Chapter 2: Installation and Setup"
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items = [todo_text, "Complete the requested work"]
            dod.touched_files.extend(str(path) for path in (index_path, chapter_one))

            steering_log: list[str] = []
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            context.queue_steering_message_callback = steering_log.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, dod_store)

            tool_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={
                    "todos": [
                        {"content": todo_text, "activeForm": todo_text, "status": "pending"},
                    ]
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {"content": todo_text, "active_form": todo_text, "status": "pending"},
                    ]
                },
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            message = steering_log[-1]
            expected_fragments = (
                "Todo tracking is updated. Next step: create `02-installation.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
                "Make your next response the concrete mutation tool call itself",
            )
            for fragment in expected_fragments:
                assert fragment in message
      
        5925
        
        5926
        
        5927
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
            temp_dir: Path,
        ) -> None:
            """Check that the TodoWrite follow-up reuses a file name seen in an earlier read.

            The conversation history holds an assistant ``read`` of
            ``fortran/chapters/01-introduction.html``. The nginx guide has only its
            index so far. After a successful ``TodoWrite`` batch the last queued
            steering message is expected to name ``01-introduction.html``.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call into confidence scoring is a test failure here.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call into action verification is a test failure here.
                raise AssertionError("Verification should not run in this scenario")

            # Reference guide whose chapter file was read earlier in the conversation.
            reference_chapters = temp_dir / "fortran" / "chapters"
            reference_chapters.mkdir(parents=True)
            reference_file = reference_chapters / "01-introduction.html"
            reference_file.write_text("<h1>Introduction</h1>\n")

            # Target guide: directories plus an index, no chapter files yet.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            todo_content = "Write the introduction chapter"
            todo_active_form = "Writing the introduction chapter"
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items = [todo_content, "Complete the requested work"]
            dod.touched_files.append(str(index_path))

            steering_log: list[str] = []
            prior_read = ToolCall(
                id="read-ref-1",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            history = [
                Message(
                    role=Role.ASSISTANT,
                    content="",
                    tool_calls=[prior_read],
                )
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            context.queue_steering_message_callback = steering_log.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, dod_store)

            tool_call = ToolCall(
                id="todo-observed-1",
                name="TodoWrite",
                arguments={
                    "todos": [
                        {
                            "content": todo_content,
                            "activeForm": todo_active_form,
                            "status": "pending",
                        }
                    ]
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": todo_content,
                            "active_form": todo_active_form,
                            "status": "pending",
                        }
                    ]
                },
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            message = steering_log[-1]
            expected_fragments = (
                "Todo tracking is updated. Next step: create `01-introduction.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
            )
            for fragment in expected_fragments:
                assert fragment in message
      
        6055
        
        6056
        
        6057
        @pytest.mark.asyncio
        async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
            temp_dir: Path,
        ) -> None:
            """Check that a working note is answered with a resume hint when a planned file is missing.

            The implementation plan lists two chapter files but only the first is
            written to disk. After a ``notepad_write_working`` batch the last queued
            steering message is expected to tell the model to create
            ``02-installation.html`` instead of doing more bookkeeping.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call into confidence scoring is a test failure here.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call into action verification is a test failure here.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: index and chapter one exist; chapter two is never written.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            index_path.write_text("<html></html>\n")
            chapter_one.write_text("<h1>One</h1>\n")

            # The plan declares both chapter files explicitly.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            steering_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, dod_store)

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            tracked_todos = [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "completed",
                },
                {
                    "content": "Create 02-installation.html",
                    "active_form": "Creating 02-installation.html",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, tracked_todos, project_root=temp_dir)
            dod.touched_files.extend(str(path) for path in (index_path, chapter_one))

            tool_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Creating the second chapter file: Installation"},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            message = steering_log[-1]
            expected_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
                "Make your next response the concrete mutation tool call itself",
                "refresh `TodoWrite`",
                "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
            )
            for fragment in expected_fragments:
                assert fragment in message
      
        6171
        
        6172
        
        6173
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
            temp_dir: Path,
        ) -> None:
            """Check that a working note does not skip a pending discovery step.

            The first pending item asks to examine an existing guide, and none of
            the planned nginx files exist. After a ``notepad_write_working`` batch
            the last queued steering message is expected to continue with that
            pending item and must not tell the model to create ``index.html``.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call into confidence scoring is a test failure here.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call into action verification is a test failure here.
                raise AssertionError("Verification should not run in this scenario")

            # Only the plan file is written; the guide paths it names are not created.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{temp_dir / 'guides' / 'nginx' / 'index.html'}`",
                f"- `{temp_dir / 'guides' / 'nginx' / 'chapters'}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            steering_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, dod_store)

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            extra_pending = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create the nginx directory structure",
                "Develop the main index.html file for the nginx guide",
            ]
            dod.pending_items.extend(extra_pending)

            tool_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            message = steering_log[-1]
            next_item_hint = "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
            assert next_item_hint in message
            assert "one concrete evidence-gathering tool call" in message
            # The mutation-style resume hint must be absent while discovery is still pending.
            assert "Resume by creating `index.html` now." not in message
      
        6265
        
        6266
        
        6267
        @pytest.mark.asyncio
      
        6268
        async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
            temp_dir: Path,
        ) -> None:
            """Check the steering queued after a working note names the missing chapter.

            Setup: the nginx guide index links three chapters, only the first chapter
            file exists on disk, and the implementation plan lists the index and the
            chapters directory under "File Changes". The definition of done still
            carries a stale "examine the existing fortran guide" pending item.

            Expectation: after a successful ``notepad_write_working`` call the last
            queued steering message points at ``02-installation.html`` and does not
            fall back to the stale pending item.
            """

            # Neither hook is expected to be invoked; fail loudly if one is.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # On-disk guide: an index with three links, but only chapter one written.
            nginx_root = temp_dir / "guides" / "nginx"
            chapter_root = nginx_root / "chapters"
            chapter_root.mkdir(parents=True)
            index_file = nginx_root / "index.html"
            intro_chapter = chapter_root / "01-introduction.html"
            toc_links = [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-configuration.html">Configuration</a>',
            ]
            index_file.write_text("\n".join(toc_links))
            intro_chapter.write_text("<h1>Introduction</h1>\n")

            # Implementation plan declaring the index and the chapters directory.
            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_file}`",
                f"- `{chapter_root}/`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            steering_notes: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            stale_and_next_items = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create chapter files following the established pattern",
            ]
            done_spec.pending_items.extend(stale_and_next_items)
            done_spec.touched_files.extend([str(index_file), str(intro_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Created index and first chapter; next is chapter 2"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            fake_executor = FakeExecutor([note_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_notes
            latest_note = steering_notes[-1]
            assert "Bookkeeping note is recorded. A declared output artifact is still missing." in latest_note
            assert "Resume by creating `02-installation.html` now." in latest_note
            assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in latest_note
      
        6372
        
        6373
        
        6374
        @pytest.mark.asyncio
        async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
            temp_dir: Path,
        ) -> None:
            """Check that a directory-only glob result queues no steering message.

            Setup: the fortran reference guide has only an empty ``chapters`` directory,
            the implementation plan targets nginx guide paths, and the first pending
            item is still to examine the fortran guide. The executed ``glob`` call
            returns just the two directory paths.

            Expectation: the steering queue stays empty after the batch runs.
            """

            # Neither hook is expected to be invoked; fail loudly if one is.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guides_root = temp_dir / "Loader" / "guides"
            fortran_root = guides_root / "fortran"
            fortran_chapters = fortran_root / "chapters"
            fortran_chapters.mkdir(parents=True)

            # The plan only names nginx outputs; none of them exist on disk.
            nginx_root = guides_root / "nginx"
            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{nginx_root / 'chapters'}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            steering_notes: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            outstanding_items = [
                "First, examine the existing fortran guide structure and content",
                "Create the nginx directory structure",
                "Develop the main index.html file for nginx guide",
            ]
            done_spec.pending_items.extend(outstanding_items)

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"pattern": "**", "path": str(fortran_root)},
            )
            # The glob output lists directories only, no file contents.
            glob_outcome = tool_outcome(
                tool_call=glob_call,
                output="\n".join([str(fortran_root), str(fortran_chapters)]),
                is_error=False,
            )
            fake_executor = FakeExecutor([glob_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_notes == []
      
        6463
        
        6464
        
        6465
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
            temp_dir: Path,
        ) -> None:
            """Check that a blocked no-op edit of a matching table of contents queues nothing.

            Setup: ``index.html`` already links both chapter files with link texts equal
            to the chapters' headings. The executed ``edit`` call uses identical
            ``old_string`` and ``new_string`` and is reported as BLOCKED.

            Expectation: the steering queue stays empty after the batch runs.
            """

            # Neither hook is expected to be invoked; fail loudly if one is.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            task_prompt = (
                "Have a look at ~/Loader/guides/fortran/index.html, then "
                "~/Loader/guides/fortran/chapters. The table of contents links in "
                "index.html are inaccurate and the href’s are wrong. Let’s update the "
                "links and their link texts to be correct."
            )

            # Chapter files whose headings match the link texts in the index below.
            chapter_root = temp_dir / "chapters"
            chapter_root.mkdir()
            intro_chapter = chapter_root / "01-introduction.html"
            intro_chapter.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
            setup_chapter = chapter_root / "02-setup.html"
            setup_chapter.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")

            toc_markup = (
                "<h2>Table of Contents</h2>\n"
                '        <ul class="chapter-list">\n'
                '            <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '            <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                "        </ul>\n"
            )
            index_file = temp_dir / "index.html"
            index_file.write_text(toc_markup)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = task_prompt  # type: ignore[attr-defined]
            steering_notes: list[str] = []
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The edit replaces the block with itself, so nothing would change.
            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_file),
                    "old_string": toc_markup,
                    "new_string": toc_markup,
                },
            )
            blocked_output = (
                "[Blocked - old_string and new_string are identical - no change "
                "would occur] Suggestion: Provide different old and new strings"
            )
            blocked_outcome = tool_outcome(
                tool_call=edit_call,
                output=blocked_output,
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
            fake_executor = FakeExecutor([blocked_outcome])

            turn_summary = TurnSummary(final_response="")
            done_spec = create_definition_of_done(task_prompt)
            await batch_runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_notes == []
      
        6558
        
        6559
        
        6560
        def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
            temp_dir: Path,
        ) -> None:
            """Check the nudge for a blocked no-op edit keeps pointing at the repair target.

            Setup: the only context message is an assistant "Repair focus" note naming
            ``04-basic-usage.html`` as the file to edit. ``_queue_blocked_html_edit_nudge``
            is called directly with an identical-strings edit of that file.

            Expectation: the first queued message contains the repair target path and
            the "no on-disk change", "replace the surrounding block" and "Do not reopen
            unrelated reference materials" phrases.
            """

            # Neither hook is expected to be invoked; fail loudly if one is.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
            # Sibling chapter that the broken reference points to.
            missing_chapter = repair_target.parent / "05-advanced-topics.html"
            repair_note = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[repair_note],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            nudges: list[str] = []
            runtime_context.queue_steering_message_callback = nudges.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            done_spec = create_definition_of_done("Repair a guide page.")

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(repair_target),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
            batch_runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=done_spec)

            assert nudges
            first_nudge = nudges[0]
            assert str(repair_target) in first_nudge
            assert "no on-disk change" in first_nudge
            assert "replace the surrounding block" in first_nudge
            assert "Do not reopen unrelated reference materials" in first_nudge
      
        6620
        
        6621
        
        6622
        def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """Check the nudge asks for a final response once every planned file exists.

            Setup: both files listed in the implementation plan exist on disk and are
            recorded as touched, a verification command is set, and the assistant's
            "Repair focus" note only asks to confirm the final state of the index.
            ``_queue_blocked_html_edit_nudge`` is called directly with an
            identical-strings edit of the index.

            Expectation: the first queued message says all planned artifacts exist and
            asks to finish with a final response; it does not suggest replacing the
            surrounding block.
            """

            # Neither hook is expected to be invoked; fail loudly if one is.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            chapter_dir = guide_dir / "chapters"
            chapter_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"
            intro_chapter = chapter_dir / "01-introduction.html"
            index_file.write_text("<html></html>\n")
            intro_chapter.write_text("<html></html>\n")

            # Every path listed in the plan was created above.
            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_file}`",
                f"- `{intro_chapter}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            confirm_note = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Confirm the final guide state in `{index_file}`.\n"
                    f"- Immediate next step: verify `{index_file}` if no concrete mismatch remains.\n"
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[confirm_note],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            nudges: list[str] = []
            runtime_context.queue_steering_message_callback = nudges.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file guide.")
            done_spec.implementation_plan = str(plan_file)
            done_spec.touched_files.extend([str(index_file), str(intro_chapter)])
            done_spec.verification_commands = [f"ls -la {guide_dir}"]

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_file),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
            batch_runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=done_spec)

            assert nudges
            first_nudge = nudges[0]
            assert "All explicitly planned artifacts already exist." in first_nudge
            assert "Finish with a final response now so Loader can run verification automatically." in first_nudge
            assert "replace the surrounding block" not in first_nudge
      
        6705
        
        6706
        
        6707
        def test_tool_batch_runner_blocked_noop_edit_keeps_quality_repair_active_after_full_build(
            temp_dir: Path,
        ) -> None:
            """Check the nudge keeps a content-quality repair open when all files exist.

            Setup: all three planned files exist on disk and are recorded as touched,
            but a user "Repair focus" message reports thin and insufficiently
            structured content in chapter two and asks to edit it.
            ``_queue_blocked_html_edit_nudge`` is called directly with an
            identical-strings edit of chapter two.

            Expectation: the first queued message says the content-quality repair is
            not complete and repeats the repair focus; it mentions neither TodoWrite
            nor finishing with a final response.
            """

            # Neither hook is expected to be invoked; fail loudly if one is.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            chapter_dir = guide_dir / "chapters"
            chapter_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"
            intro_chapter = chapter_dir / "01-introduction.html"
            install_chapter = chapter_dir / "02-installation.html"
            index_file.write_text("<html></html>\n")
            intro_chapter.write_text("<html></html>\n")
            install_chapter.write_text("<html></html>\n")

            # Every path listed in the plan was created above.
            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_file}`",
                f"- `{intro_chapter}`",
                f"- `{install_chapter}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            # The outstanding repair is about content quality, not a missing file.
            quality_note = Message(
                role=Role.USER,
                content=(
                    "Repair focus:\n"
                    f"- Improve `{install_chapter}`: thin content (504 text chars, expected at least 1758).\n"
                    f"- Improve `{install_chapter}`: insufficient structured content (6 blocks, expected at least 18).\n"
                    f"- Immediate next step: edit `{install_chapter}`.\n"
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[quality_note],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            nudges: list[str] = []
            runtime_context.queue_steering_message_callback = nudges.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file guide.")
            done_spec.implementation_plan = str(plan_file)
            done_spec.touched_files.extend(
                [str(index_file), str(intro_chapter), str(install_chapter)]
            )
            done_spec.verification_commands = [f"ls -la {guide_dir}"]

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(install_chapter),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
            batch_runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=done_spec)

            assert nudges
            first_nudge = nudges[0]
            assert "active content-quality repair is not complete" in first_nudge
            assert "Repair focus:" in first_nudge
            assert f"Immediate next step: edit `{install_chapter}`" in first_nudge
            assert "thin content" in first_nudge
            assert "TodoWrite cannot satisfy" not in first_nudge
            assert "Finish with a final response now" not in first_nudge
      
        6797
        
        6798
        
        6799
        async def _noop_emit(event: AgentEvent) -> None:
            """Accept an emitted event and discard it; always resolves to ``None``."""
      
        6801
        
        6802
        
        6803
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """Run one successful ``write`` batch and expect verification to be planned.

            After the batch the DoD must report a ``"planned"`` verification result
            with attempt 1 active, and the newest workflow-timeline entry must carry
            a matching planned observation.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if called: this scenario expects no confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if called: this scenario expects no action verification.
                raise AssertionError("Verification should not run for this scenario")

            recorded_events: list[AgentEvent] = []

            async def record_event(event: AgentEvent) -> None:
                recorded_events.append(event)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            readme_write = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(readme_path), "content": "updated\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=readme_write,
                output="wrote file",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])
            turn_summary = TurnSummary(final_response="")
            done_definition = create_definition_of_done("Update README and verify it still works.")

            await batch_runner.execute_batch(
                tool_calls=[readme_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=record_event,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done.
            assert done_definition.last_verification_result == "planned"
            assert done_definition.verification_commands
            assert "Collect verification evidence" in done_definition.pending_items
            assert done_definition.active_verification_attempt_id == "verification-attempt-1"
            assert done_definition.active_verification_attempt_number == 1

            # State recorded on the most recent workflow-timeline entry.
            latest_entry = turn_summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_planned"
            assert latest_entry.policy_outcome == "planned"
            first_observation = latest_entry.verification_observations[0]
            assert first_observation.status == "planned"
            assert first_observation.attempt_id == "verification-attempt-1"
            assert first_observation.attempt_number == 1
      
        6874
        
        6875
        
        6876
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
            temp_dir: Path,
        ) -> None:
            """Run a lone ``mkdir -p`` bash call and expect no verification planning.

            The implementation plan lists the chapters directory and an index file;
            after the batch the DoD must have no verification result, no evidence
            item pending, and no ``verification_planned`` timeline entry.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if called: this scenario expects no confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if called: this scenario expects no action verification.
                raise AssertionError("Verification should not run in this scenario")

            recorded_events: list[AgentEvent] = []

            async def record_event(event: AgentEvent) -> None:
                recorded_events.append(event)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            mkdir_call = ToolCall(
                id="mkdir-1",
                name="bash",
                arguments={"command": f"mkdir -p {chapters}"},
            )
            scripted_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
            fake_executor = FakeExecutor([scripted_outcome])
            turn_summary = TurnSummary(final_response="")
            done_definition = create_definition_of_done(
                "Create an equally thorough nginx guide with chapters."
            )
            done_definition.implementation_plan = str(implementation_plan)

            await batch_runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=record_event,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert done_definition.last_verification_result is None
            assert "Collect verification evidence" not in done_definition.pending_items
            # No timeline entry may announce planned verification.
            assert all(
                entry.reason_code != "verification_planned"
                for entry in turn_summary.workflow_timeline
            )
      
        6954
        
        6955
        
        6956
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
            temp_dir: Path,
        ) -> None:
            """Write only the index file while a chapter item is still pending.

            The chapters directory exists but holds no chapter. After the batch the
            DoD must have no verification result, the chapter item must still be
            pending, and no ``verification_planned`` timeline entry may exist.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if called: this scenario expects no confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if called: this scenario expects no action verification.
                raise AssertionError("Verification should not run in this scenario")

            recorded_events: list[AgentEvent] = []

            async def record_event(event: AgentEvent) -> None:
                recorded_events.append(event)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = nginx_root / "index.html"
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            index_write = ToolCall(
                id="write-index",
                name="write",
                arguments={"file_path": str(index_path), "content": "<html></html>\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=index_write,
                output="wrote file",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])
            turn_summary = TurnSummary(final_response="")
            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            done_definition.implementation_plan = str(implementation_plan)
            outstanding_work = [
                "Develop the main index.html file with proper structure",
                "Create first nginx chapter",
            ]
            done_definition.pending_items.extend(outstanding_work)

            await batch_runner.execute_batch(
                tool_calls=[index_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=record_event,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert done_definition.last_verification_result is None
            assert "Collect verification evidence" not in done_definition.pending_items
            assert "Create first nginx chapter" in done_definition.pending_items
            # No timeline entry may announce planned verification.
            assert all(
                entry.reason_code != "verification_planned"
                for entry in turn_summary.workflow_timeline
            )
      
        7044
        
        7045
        
        7046
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """Seed a passed verification, run a ``write`` batch, and expect it to go stale.

            After the batch the DoD must report ``"stale"``, hold no evidence, list
            the evidence item as pending again, and have attempt 2 active. The
            newest timeline entry must describe attempt 1 as stale.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if called: this scenario expects no confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if called: this scenario expects no action verification.
                raise AssertionError("Verification should not run for this scenario")

            recorded_events: list[AgentEvent] = []

            async def record_event(event: AgentEvent) -> None:
                recorded_events.append(event)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            readme_write = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(readme_path), "content": "updated\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=readme_write,
                output="wrote file",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])
            turn_summary = TurnSummary(final_response="")

            # Pre-populate the DoD as if verification attempt 1 had already passed.
            done_definition = create_definition_of_done("Update README and verify it still works.")
            done_definition.verification_commands = ["uv run pytest -q"]
            done_definition.last_verification_result = "passed"
            done_definition.verification_attempt_counter = 1
            done_definition.active_verification_attempt_id = "verification-attempt-1"
            done_definition.active_verification_attempt_number = 1
            passing_evidence = VerificationEvidence(
                command="uv run pytest -q",
                passed=True,
                stdout="401 passed",
                kind="test",
            )
            done_definition.evidence = [passing_evidence]
            done_definition.completed_items.append("Collect verification evidence")

            await batch_runner.execute_batch(
                tool_calls=[readme_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=record_event,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done.
            assert done_definition.last_verification_result == "stale"
            assert done_definition.evidence == []
            assert "Collect verification evidence" in done_definition.pending_items
            assert "Collect verification evidence" not in done_definition.completed_items
            assert done_definition.active_verification_attempt_id == "verification-attempt-2"
            assert done_definition.active_verification_attempt_number == 2

            # State recorded on the most recent workflow-timeline entry.
            latest_entry = turn_summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_stale"
            assert latest_entry.policy_outcome == "stale"
            first_observation = latest_entry.verification_observations[0]
            assert first_observation.status == "stale"
            assert first_observation.attempt_id == "verification-attempt-1"
            assert first_observation.attempt_number == 1
            assert first_observation.supersedes_attempt_id == "verification-attempt-2"
            assert first_observation.command == "uv run pytest -q"
      
        7140
        
        7141
        
        7142
        def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
            """Queue a blocked-active-repair nudge and check it names the repair scope.

            The context holds one assistant "Repair focus" message. The queued
            steering message must mention the repair target, the chapter file that
            message offers to create, and the reminder not to reopen unrelated
            reference materials.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if called: this scenario expects no confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if called: this scenario expects no action verification.
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "index.html"
            repair_focus_text = (
                "Repair focus:\n"
                f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
                f"- Immediate next step: edit `{repair_target}`.\n"
                f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '01-getting-started.html'}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus_text)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued: list[str] = []
            # Capture steering messages instead of delivering them.
            runtime_context.queue_steering_message_callback = queued.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            batch_runner._queue_blocked_active_repair_nudge(
                "[Blocked - active repair scope: verification already identified the repair target.]"
            )

            assert queued
            nudge = queued[0]
            missing_chapter = temp_dir / "guide" / "chapters" / "01-getting-started.html"
            assert str(repair_target) in nudge
            assert str(missing_chapter) in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        7188
        
        7189
        
        7190
        def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
            temp_dir: Path,
        ) -> None:
            """Queue a blocked-repair-mutation nudge and check it names the allowed paths.

            The context holds one assistant "Repair focus" message. The queued
            steering message must mention the repair target, the stylesheet that
            message offers to create, and the "before widening the change set"
            wording.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if called: this scenario expects no confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if called: this scenario expects no action verification.
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html"
            stylesheet = temp_dir / "guide" / "styles.css"
            repair_focus_text = (
                "Repair focus:\n"
                f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
                f"- Immediate next step: edit `{repair_target}`.\n"
                f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus_text)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued: list[str] = []
            # Capture steering messages instead of delivering them.
            runtime_context.queue_steering_message_callback = queued.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            batch_runner._queue_blocked_active_repair_mutation_nudge(
                "[Blocked - active repair mutation scope: verification already identified the repair target.]"
            )

            assert queued
            nudge = queued[0]
            assert str(repair_target) in nudge
            assert str(stylesheet) in nudge
            assert "before widening the change set" in nudge
      
        7239
        
        7240
        
        7241
        def test_tool_batch_runner_duplicate_repair_mutation_restates_verifier_deltas(
            temp_dir: Path,
        ) -> None:
            """Queue a duplicate-mutation nudge and check it repeats the verifier findings.

            The context holds one user "Repair focus" message listing two quality
            problems. The queued steering message must say the write was skipped,
            warn against resubmitting the same content, repeat both problems, and
            ask for one real edit.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if called: this scenario expects no confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if called: this scenario expects no action verification.
                raise AssertionError("Verification should not run in this scenario")

            index_path = temp_dir / "guide" / "index.html"
            chapter_path = temp_dir / "guide" / "chapters" / "02-installation.html"
            repair_focus_text = (
                "Repair focus:\n"
                f"- Improve `{index_path}`: insufficient structured content (9 blocks, expected at least 12).\n"
                f"- Improve `{chapter_path}`: thin content (526 text chars, expected at least 1758).\n"
                f"- Immediate next step: edit `{index_path}`.\n"
                "- Update the listed generated artifacts directly; do not recreate the artifact set.\n"
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.USER, content=repair_focus_text)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued: list[str] = []
            # Capture steering messages instead of delivering them.
            runtime_context.queue_steering_message_callback = queued.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            done_definition = create_definition_of_done("Create a multi-file guide.")

            duplicate_write = ToolCall(
                id="dup-write",
                name="write",
                arguments={"file_path": str(index_path), "content": "<h1>same</h1>"},
            )
            batch_runner._queue_duplicate_mutation_nudge(  # type: ignore[attr-defined]
                duplicate_write,
                dod=done_definition,
            )

            assert queued
            nudge = queued[0]
            # Checked in the same order as the individual assertions they replace.
            expected_fragments = (
                "skipped because it would not change",
                "Do not submit the same content again",
                "insufficient structured content",
                "thin content",
                "make one real edit",
            )
            for fragment in expected_fragments:
                assert fragment in nudge
      
        7299
        
        7300
        
        7301
        @pytest.mark.asyncio
      
        7302
        async def test_tool_batch_runner_quality_repair_success_hands_to_next_target(
      
        7303
            temp_dir: Path,
      
        7304
        ) -> None:
      
        7305
            async def assess_confidence(
      
        7306
                tool_name: str,
      
        7307
                tool_args: dict,
      
        7308
                context: str,
      
        7309
            ) -> ConfidenceAssessment:
      
        7310
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        7311
        
        7312
            async def verify_action(
      
        7313
                tool_name: str,
      
        7314
                tool_args: dict,
      
        7315
                result: str,
      
        7316
                expected: str = "",
      
        7317
            ) -> ActionVerification:
      
        7318
                raise AssertionError("Verification should not run in this scenario")
      
        7319
        
        7320
            chapters = temp_dir / "guide" / "chapters"
      
        7321
            first = chapters / "01-introduction.html"
      
        7322
            second = chapters / "02-installation.html"
      
        7323
            chapters.mkdir(parents=True)
      
        7324
            first.write_text("<h1>Intro</h1>\n")
      
        7325
            second.write_text("<h1>Install</h1>\n")
      
        7326
            context = build_context(
      
        7327
                temp_dir=temp_dir,
      
        7328
                messages=[
      
        7329
                    Message(
      
        7330
                        role=Role.ASSISTANT,
      
        7331
                        content=(
      
        7332
                            "Repair focus:\n"
      
        7333
                            f"- Improve `{first}`: thin content (400 text chars, expected at least 1758).\n"
      
        7334
                            f"- Improve `{second}`: insufficient structured content (6 blocks, expected at least 18).\n"
      
        7335
                            f"- Immediate next step: edit `{first}` with a substantial expansion or replacement.\n"
      
        7336
                            "- Repair every listed quality target in order before any final answer.\n"
      
        7337
                        ),
      
        7338
                    )
      
        7339
                ],
      
        7340
                safeguards=FakeSafeguards(),
      
        7341
                assess_confidence=assess_confidence,
      
        7342
                verify_action=verify_action,
      
        7343
            )
      
        7344
            queued: list[str] = []
      
        7345
            context.queue_steering_message_callback = queued.append
      
        7346
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        7347
            dod = create_definition_of_done("Repair generated HTML guide quality.")
      
        7348
            tool_call = ToolCall(
      
        7349
                id="write-intro",
      
        7350
                name="write",
      
        7351
                arguments={
      
        7352
                    "file_path": str(first),
      
        7353
                    "content": "<h1>Intro</h1><p>Substantial expansion.</p>\n",
      
        7354
                },
      
        7355
            )
      
        7356
        
        7357
            await runner.execute_batch(
      
        7358
                tool_calls=[tool_call],
      
        7359
                tool_source="assistant",
      
        7360
                pending_tool_calls_seen=set(),
      
        7361
                emit=_noop_emit,
      
        7362
                summary=TurnSummary(final_response=""),
      
        7363
                dod=dod,
      
        7364
                executor=FakeExecutor(
      
        7365
                    [
      
        7366
                        tool_outcome(
      
        7367
                            tool_call=tool_call,
      
        7368
                            output=f"Successfully wrote {first}",
      
        7369
                            is_error=False,
      
        7370
                        )
      
        7371
                    ]
      
        7372
                ),  # type: ignore[arg-type]
      
        7373
                on_confirmation=None,
      
        7374
                on_user_question=None,
      
        7375
                emit_confirmation=None,
      
        7376
                consecutive_errors=0,
      
        7377
            )
      
        7378
        
        7379
            assert queued
      
        7380
            handoff = next(message for message in queued if "next listed quality target" in message)
      
        7381
            assert str(second.resolve(strict=False)) in handoff
      
        7382
            assert "Do not rerun verification" in handoff
      
        7383
            assert "Repair focus:" in handoff
      
        7384
            assert "insufficient structured content" in handoff
      
        7385
            assert f"Immediate next step: edit `{second.resolve(strict=False)}`" in handoff
      
        7386
            assert all("All explicitly planned artifacts now exist" not in message for message in queued)
      
        7387
        
        7388
        
        7389
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_after_active_repair_support_file_write(
            temp_dir: Path,
        ) -> None:
            """Check the steering queued after a repair support file is reported written.

            The conversation holds a "Repair focus" message that names ``index.html``
            as the file to edit and ``style.css`` as the file to create if the broken
            reference should remain. A successful ``write`` outcome for that
            stylesheet is pushed through ``execute_batch``; the queued steering
            messages must contain the support-file handoff wording asserted below.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            repair_target = guide_dir / "index.html"
            stylesheet = guide_dir / "style.css"
            repair_target.parent.mkdir(parents=True)
            # Only the page with the dangling stylesheet link is created on disk here.
            repair_target.write_text('<link rel="stylesheet" href="style.css">\n')

            repair_brief = (
                "Repair focus:\n"
                f"- Fix the broken local reference `style.css` in `{repair_target}`.\n"
                f"- Immediate next step: edit `{repair_target}`.\n"
                f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `style.css`.\n"
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_brief)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_notes: list[str] = []
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            definition_of_done = create_definition_of_done("Repair a guide stylesheet link.")

            write_arguments = {
                "file_path": str(stylesheet),
                "content": "body { font-family: sans-serif; }\n",
            }
            stylesheet_write = ToolCall(
                id="write-style",
                name="write",
                arguments=write_arguments,
            )
            # The fake executor replays this single canned, successful outcome.
            canned_outcome = tool_outcome(
                tool_call=stylesheet_write,
                output=f"Successfully wrote {stylesheet}",
                is_error=False,
            )
            fake_executor = FakeExecutor([canned_outcome])

            await batch_runner.execute_batch(
                tool_calls=[stylesheet_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=definition_of_done,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_notes
            # Each fragment may appear in any queued message, not necessarily the same one.
            required_fragments = (
                "support file for the active verification repair now exists",
                "Do not retarget",
                "Loader can re-run verification",
            )
            for fragment in required_fragments:
                assert any(fragment in note for note in steering_notes)
      
        7468
        
        7469
        
        7470
        def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check the late-reference-drift nudge names the one planned file not on disk.

            The implementation plan lists four guide files; three are created before
            the nudge is queued. The first queued message must mention the remaining
            ``03-first-website.html`` and the "older reference materials" wording.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_notes: list[str] = []
            runtime_context.queue_steering_message_callback = steering_notes.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            definition_of_done = create_definition_of_done("Create a multi-file guide from a reference")

            plan_text = (
                "# File Changes\n"
                "- `guide/index.html`\n"
                "- `guide/chapters/01-getting-started.html`\n"
                "- `guide/chapters/02-installation.html`\n"
                "- `guide/chapters/03-first-website.html`\n"
            )
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(plan_text)
            definition_of_done.implementation_plan = str(plan_file)

            # Create every planned artifact except the third chapter.
            chapters_dir = temp_dir / "guide" / "chapters"
            chapters_dir.mkdir(parents=True, exist_ok=True)
            existing_artifacts = (
                (temp_dir / "guide" / "index.html", "index"),
                (chapters_dir / "01-getting-started.html", "one"),
                (chapters_dir / "02-installation.html", "two"),
            )
            for artifact_path, artifact_body in existing_artifacts:
                artifact_path.write_text(artifact_body)

            batch_runner = ToolBatchRunner(runtime_context, dod_store)
            batch_runner._queue_blocked_late_reference_drift_nudge(
                "[Blocked - late reference drift: several planned artifacts already exist.]",
                dod=definition_of_done,
            )

            assert steering_notes
            first_note = steering_notes[0]
            assert "03-first-website.html" in first_note
            assert "older reference materials" in first_note
      
        7522
        
        7523
        
        7524
        def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """Check the completed-artifact-scope nudge when a verification todo is pending.

            Every path listed in the implementation plan exists on disk and one
            pending todo ("Verify all guide files are linked and complete") is synced
            into the definition of done. After queuing the nudge, the context must be
            in ``verify`` workflow mode and the first queued message must carry the
            wording asserted below, including the pending todo's text.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()

            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact_path, artifact_body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact_path.write_text(artifact_body)

            # The plan lists the two directories as well as the three files.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}`",
                f"- `{chapters}`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_notes: list[str] = []
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition_of_done = create_definition_of_done("Create a multi-file guide from a reference")
            definition_of_done.implementation_plan = str(implementation_plan)
            definition_of_done.verification_commands = [f"ls -la {guide_root}"]
            pending_verification_todo = {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(
                definition_of_done,
                [pending_verification_todo],
                project_root=temp_dir,
            )

            batch_runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
                dod=definition_of_done,
            )

            assert steering_notes
            assert runtime_context.workflow_mode == "verify"
            first_note = steering_notes[0]
            assert "All explicitly planned artifacts already exist." in first_note
            assert "Verify all guide files are linked and complete" in first_note
            assert "Do not reopen earlier reference materials." in first_note
            assert "Finish with a final response so Loader can verify" in first_note
      
        7606
        
        7607
        
        7608
        def test_tool_batch_runner_blocked_post_build_audit_nudge_switches_to_verify(
            temp_dir: Path,
        ) -> None:
            """Check the nudge queued for a "post-build audit loop" blocked message.

            Every path listed in the implementation plan exists on disk and no todos
            are synced. After queuing the nudge, the context must be in ``verify``
            workflow mode and the first queued message must carry the wording
            asserted below.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()

            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact_path, artifact_body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact_path.write_text(artifact_body)

            # The plan lists the two directories as well as the three files.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}`",
                f"- `{chapters}`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_notes: list[str] = []
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition_of_done = create_definition_of_done("Create a multi-file guide from a reference")
            definition_of_done.implementation_plan = str(implementation_plan)
            definition_of_done.verification_commands = [f"ls -la {guide_root}"]

            batch_runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]",
                dod=definition_of_done,
            )

            assert steering_notes
            assert runtime_context.workflow_mode == "verify"
            first_note = steering_notes[0]
            assert "All explicitly planned artifacts already exist." in first_note
            assert "finish with a final response so Loader can verify" in first_note
      
        7677
        
        7678
        
        7679
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_halt_on_repeated_post_build_audit_blocks(
            temp_dir: Path,
        ) -> None:
            """Check that three blocked post-build audit calls in one batch do not halt it.

            Three ``bash`` tool calls are each answered by the fake executor with the
            same "post-build audit loop" blocked outcome. The batch result must report
            ``halted is False`` and zero consecutive errors, the context must be in
            ``verify`` workflow mode, and some queued steering message must carry the
            "finish with a final response" wording.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()

            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact_path, artifact_body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact_path.write_text(artifact_body)

            # The plan lists the two directories as well as the three files.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}`",
                f"- `{chapters}`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_notes: list[str] = []
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition_of_done = create_definition_of_done("Create a multi-file guide from a reference")
            definition_of_done.implementation_plan = str(implementation_plan)
            definition_of_done.verification_commands = [f"ls -la {guide_root}"]

            blocked_message = (
                "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]"
            )

            # Three identical audit commands, distinguished only by their ids.
            audit_calls = []
            for index in range(1, 4):
                audit_calls.append(
                    ToolCall(
                        id=f"audit-{index}",
                        name="bash",
                        arguments={"command": f"cd {temp_dir} && ls -la guide/chapters/"},
                    )
                )

            # One blocked outcome per call, in the same order as the calls.
            blocked_outcomes = []
            for audit_call in audit_calls:
                blocked_outcomes.append(
                    tool_outcome(
                        tool_call=audit_call,
                        output=blocked_message,
                        is_error=True,
                        state=ToolExecutionState.BLOCKED,
                    )
                )
            fake_executor = FakeExecutor(blocked_outcomes)

            # Emitted events are captured, but no assertion below inspects them.
            captured_events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                captured_events.append(event)

            batch_result = await batch_runner.execute_batch(
                tool_calls=audit_calls,
                tool_source="native",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=definition_of_done,
                executor=fake_executor,
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert batch_result.halted is False
            assert batch_result.consecutive_errors == 0
            assert runtime_context.workflow_mode == "verify"
            assert steering_notes
            assert any(
                "finish with a final response so Loader can verify" in note
                for note in steering_notes
            )
      
        7786
        
        7787
        
        7788
        def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
            temp_dir: Path,
        ) -> None:
            """Check the HTML declared-target nudge when the block lists a closest target.

            The blocked message ends with a "Closest declared local targets include"
            clause. The first queued steering message must mention the file being
            written, quote ``chapters/02-installation.html`` in backticks, and contain
            the "same file now" wording.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_notes: list[str] = []
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The path is only passed as a string argument; nothing is created on disk here.
            blocked_page = temp_dir / "guide" / "chapters" / "01-introduction.html"
            blocked_write = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": str(blocked_page)},
            )
            blocked_message = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
                "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
                "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
            )

            batch_runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_message)

            assert steering_notes
            first_note = steering_notes[0]
            assert str(blocked_page) in first_note
            assert "`chapters/02-installation.html`" in first_note
            assert "same file now" in first_note
      
        7836
        
        7837
        
        7838
        def test_tool_batch_runner_blocked_html_declared_target_nudge_without_close_match(
            temp_dir: Path,
        ) -> None:
            """Check the HTML declared-target nudge when the block lists only allowed hrefs.

            The blocked message has an "Allowed hrefs from this file include" clause
            and no "Closest declared local targets" clause. The first queued steering
            message must contain the "use only these exact href values" wording, quote
            ``installation.html`` and ``../index.html`` in backticks, and must not
            contain "closest declared target(s)".
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fail loudly if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fail loudly if the runner asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_notes: list[str] = []
            runtime_context.queue_steering_message_callback = steering_notes.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The path is only passed as a string argument; nothing is created on disk here.
            blocked_page = temp_dir / "guide" / "chapters" / "introduction.html"
            blocked_write = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": str(blocked_page)},
            )
            blocked_message = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare; remove or replace "
                "undeclared hrefs like: troubleshooting.html. "
                "Already-declared local targets include: chapters/introduction.html, chapters/installation.html, "
                "chapters/configuration.html. Allowed hrefs from this file include: ../index.html, "
                "installation.html, configuration.html."
            )

            batch_runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_message)

            assert steering_notes
            first_note = steering_notes[0]
            assert "use only these exact href values" in first_note
            assert "`installation.html`" in first_note
            assert "`../index.html`" in first_note
            assert "closest declared target(s)" not in first_note
      
        7889
        
        7890
        
        7891
        def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_root(
            temp_dir: Path,
        ) -> None:
            """File-creation nudge tells the model to update the guide root, then retry.

            The blocked event names the resolved guide root ``index.html`` and the
            undeclared page; the queued steering message is expected to repeat both.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
            )
            queued: list[str] = []
            # Capture steering messages instead of delivering them anywhere.
            runtime_context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a guide.")

            target = temp_dir / "guide" / "chapters" / "troubleshooting.html"
            # Resolved once; used both in the blocked event and in the assertion below.
            root_index = (temp_dir / "guide" / "index.html").resolve(strict=False)
            blocked_write = ToolCall(
                id="write-troubleshooting",
                name="write",
                arguments={"file_path": str(target)},
            )
            blocked_event = (
                "[Blocked - HTML file creation falls outside the current declared artifact set] "
                "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
                f"update the guide root `{root_index}` "
                "before creating undeclared sibling pages, for example: chapters/troubleshooting.html. "
                "Already-declared local targets include: chapters/advanced-topics.html, "
                "chapters/basic-usage.html, chapters/configuration.html"
            )

            runner._queue_blocked_html_declared_file_creation_nudge(
                blocked_write,
                blocked_event,
                dod=dod,
            )

            assert queued
            nudge = queued[0]
            assert "update" in nudge.lower()
            assert str(root_index) in nudge
            assert "`chapters/troubleshooting.html`" in nudge
            assert "retry the file creation" in nudge
      
        7944
        
        7945
        
        7946
        def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
            temp_dir: Path,
        ) -> None:
            """Once every planned artifact exists, the nudge steers toward finishing.

            The guide root and both planned chapters are written to disk and listed in
            the implementation plan, so a blocked attempt to create an extra chapter
            should produce a "finish now" message rather than "update the guide root".
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk fixture: guide root plus the two chapters it links to.
            guide = temp_dir / "guide"
            chapters = guide / "chapters"
            guide.mkdir()
            chapters.mkdir()
            index = guide / "index.html"
            intro_page = chapters / "01-introduction.html"
            install_page = chapters / "02-installation.html"
            index_lines = [
                '<a href="chapters/01-introduction.html">Intro</a>',
                '<a href="chapters/02-installation.html">Install</a>',
                '<a href="../index.html">Back</a>',
                "",
            ]
            index.write_text("\n".join(index_lines))
            intro_page.write_text("<html></html>\n")
            install_page.write_text("<html></html>\n")

            # The plan's "File Changes" section lists exactly the files created above.
            planned_files = (index, intro_page, install_page)
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{planned}`" for planned in planned_files),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
            )
            queued: list[str] = []
            # Capture steering messages instead of delivering them anywhere.
            runtime_context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide}"]
            dod.touched_files = [str(planned) for planned in planned_files]

            target = chapters / "08-advanced-configuration.html"
            resolved_index = index.resolve(strict=False)
            blocked_write = ToolCall(
                id="write-extra",
                name="write",
                arguments={"file_path": str(target)},
            )
            blocked_event = (
                "[Blocked - HTML file creation falls outside the current declared artifact set] "
                "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
                f"update the guide root `{resolved_index}` before creating undeclared sibling pages, "
                "for example: chapters/08-advanced-configuration.html."
            )

            runner._queue_blocked_html_declared_file_creation_nudge(
                blocked_write,
                blocked_event,
                dod=dod,
            )

            assert queued
            nudge = queued[0]
            assert "All explicitly planned artifacts already exist on disk." in nudge
            assert "Do not expand the output set with `chapters/08-advanced-configuration.html`." in nudge
            assert "Finish with a final response now so Loader can run verification automatically." in nudge
            assert "update the guide root" not in nudge
      
        8033
        
        8034
        
        8035
        def test_tool_batch_runner_blocked_html_declared_file_creation_prefers_closest_target(
            temp_dir: Path,
        ) -> None:
            """When the blocked event names a closest declared target, the nudge points at it.

            The event lists ``chapters/02-installation.html`` as the closest declared
            target for the undeclared ``chapters/02-basics.html``; the queued message
            should redirect to that target instead of asking to update the guide root.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
            )
            queued: list[str] = []
            # Capture steering messages instead of delivering them anywhere.
            runtime_context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a guide.")

            target = temp_dir / "guide" / "chapters" / "02-basics.html"
            blocked_write = ToolCall(
                id="write-basics",
                name="write",
                arguments={"file_path": str(target)},
            )
            blocked_event = (
                "[Blocked - HTML file creation falls outside the current declared artifact set] "
                "Suggestion: Keep new non-root HTML files within the root-declared artifact set. "
                "Do not create undeclared sibling page `chapters/02-basics.html`; use the closest declared local target instead. "
                "Already-declared local targets include: chapters/01-introduction.html, "
                "chapters/02-installation.html, chapters/03-basic-configuration.html. "
                "Closest declared local targets include: chapters/02-installation.html"
            )

            runner._queue_blocked_html_declared_file_creation_nudge(
                blocked_write,
                blocked_event,
                dod=dod,
            )

            assert queued
            nudge = queued[0]
            assert "Do not create `chapters/02-basics.html`." in nudge
            assert "closest declared target instead: `chapters/02-installation.html`" in nudge
            assert "Already-declared local targets include:" in nudge
            assert "update the guide root" not in nudge
      
        8088
        
        8089
        
        8090
        def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
            temp_dir: Path,
        ) -> None:
            """With all planned artifacts on disk, a broken-href block asks for repair only.

            The guide root and both planned chapters exist and are listed in the
            implementation plan. A blocked edit that links to a missing chapter should
            yield a nudge to stay on the edited file and fix or remove the broken href.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk fixture: guide root plus the two chapters it links to.
            guide = temp_dir / "guide"
            chapters = guide / "chapters"
            guide.mkdir()
            chapters.mkdir()
            index = guide / "index.html"
            intro_page = chapters / "01-introduction.html"
            install_page = chapters / "02-installation.html"
            index_lines = [
                '<a href="chapters/01-introduction.html">Intro</a>',
                '<a href="chapters/02-installation.html">Install</a>',
                '<a href="../index.html">Back</a>',
                "",
            ]
            index.write_text("\n".join(index_lines))
            intro_page.write_text("<html></html>\n")
            install_page.write_text("<html></html>\n")

            # The plan's "File Changes" section lists exactly the files created above.
            planned_files = (index, intro_page, install_page)
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{planned}`" for planned in planned_files),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
            )
            queued: list[str] = []
            # Capture steering messages instead of delivering them anywhere.
            runtime_context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide}"]
            dod.touched_files = [str(planned) for planned in planned_files]

            blocked_edit = ToolCall(
                id="edit-root",
                name="edit",
                arguments={"file_path": str(index)},
            )
            blocked_event = (
                "[Blocked - Edited HTML links point to files that do not exist] "
                "Suggestion: Use only existing local targets for href values and avoid introducing missing links. "
                "Broken href(s): chapters/08-advanced-configuration.html. "
                "Replace them with an existing local target or remove the broken link."
            )

            runner._queue_blocked_html_missing_target_nudge(
                blocked_edit,
                blocked_event,
                dod=dod,
            )

            assert queued
            nudge = queued[0]
            assert "All explicitly planned artifacts already exist on disk." in nudge
            assert f"Stay on `{index}`." in nudge
            assert "Do not introduce new local-link targets beyond the current output set." in nudge
            assert "Repair the existing generated files instead of expanding the guide." in nudge
            assert "Replace broken hrefs with existing local targets or remove the broken link." in nudge
      
        8177
        
        8178
        
        8179
        def test_tool_batch_runner_blocked_html_asset_nudge_retries_same_file(
            temp_dir: Path,
        ) -> None:
            """A first missing-asset block yields a nudge to retry the same file.

            The queued message is expected to name the blocked target path, say the file
            was not created or updated, and tell the model to remove or replace the
            missing ``../styles.css`` reference before claiming completion.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
            )
            queued: list[str] = []
            # Capture steering messages instead of delivering them anywhere.
            runtime_context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            target = temp_dir / "guide" / "chapters" / "03-configuration.html"
            blocked_write = ToolCall(
                id="write-config",
                name="write",
                arguments={"file_path": str(target)},
            )
            blocked_event = (
                "[Blocked - HTML local asset references do not exist] Suggestion: "
                "Use only existing local assets for non-HTML href values. "
                "Missing local asset href(s): ../styles.css. Remove the asset link, "
                "create the referenced asset first, inline the styling/content, or point "
                "the href at an existing local file."
            )

            runner._queue_blocked_html_asset_nudge(blocked_write, blocked_event)

            assert queued
            nudge = queued[0]
            assert str(target) in nudge
            assert "was not created or updated" in nudge
            assert "Remove or replace `../styles.css`." in nudge
            assert "Do not resend the same `<link>` tag" in nudge
            assert "do not claim completion" in nudge
      
        8230
        
        8231
        
        8232
        def test_tool_batch_runner_repeated_blocked_html_asset_nudge_forces_href_removal(
            temp_dir: Path,
        ) -> None:
            """A repeated missing-asset block escalates the nudge to removing the href line.

            The same blocked event is seeded into the context (via the initial messages
            and an extra session append) before the nudge is requested, and the queued
            message is expected to report "blocked 2 times".
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            blocked_event = (
                "[Blocked - HTML local asset references do not exist] Suggestion: "
                "Use only existing local assets for non-HTML href values. "
                "Missing local asset href(s): ../style.css. Remove the asset link, "
                "create the referenced asset first, inline the styling/content, or point "
                "the href at an existing local file."
            )
            seeded_messages = [Message(role=Role.TOOL, content=blocked_event)]
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=seeded_messages,
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
            )
            # A separate Message instance is appended to the session, as in the original setup.
            runtime_context.session.append(Message(role=Role.TOOL, content=blocked_event))
            queued: list[str] = []
            # Capture steering messages instead of delivering them anywhere.
            runtime_context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            target = temp_dir / "guide" / "chapters" / "05-troubleshooting.html"
            blocked_write = ToolCall(
                id="write-troubleshooting",
                name="write",
                arguments={"file_path": str(target)},
            )

            runner._queue_blocked_html_asset_nudge(blocked_write, blocked_event)

            assert queued
            nudge = queued[0]
            assert "blocked 2 times" in nudge
            assert "`../style.css`" in nudge
            assert "line removed" in nudge
            assert "Do not resend another" in nudge
      
        8284
        
        8285
        
        8286
        @pytest.mark.asyncio
      
        8287
        async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
      
        8288
            temp_dir: Path,
      
        8289
        ) -> None:
      
        8290
            async def assess_confidence(
      
        8291
                tool_name: str,
      
        8292
                tool_args: dict,
      
        8293
                context: str,
      
        8294
            ) -> ConfidenceAssessment:
      
        8295
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        8296
        
        8297
            async def verify_action(
      
        8298
                tool_name: str,
      
        8299
                tool_args: dict,
      
        8300
                result: str,
      
        8301
                expected: str = "",
      
        8302
            ) -> ActionVerification:
      
        8303
                raise AssertionError("Verification should not run in this scenario")
      
        8304
        
        8305
            guide_root = temp_dir / "guides" / "nginx"
      
        8306
            chapters = guide_root / "chapters"
      
        8307
            chapters.mkdir(parents=True)
      
        8308
            index_path = guide_root / "index.html"
      
        8309
            chapter_one = chapters / "01-introduction.html"
      
        8310
            chapter_two = chapters / "02-installation.html"
      
        8311
            index_path.write_text("<html></html>\n")
      
        8312
            chapter_one.write_text("<h1>Intro</h1>\n")
      
        8313
        
        8314
            implementation_plan = temp_dir / "implementation.md"
      
        8315
            implementation_plan.write_text(
      
        8316
                "\n".join(
      
        8317
                    [
      
        8318
                        "# Implementation Plan",
      
        8319
                        "",
      
        8320
                        "## File Changes",
      
        8321
                        f"- `{index_path}`",
      
        8322
                        f"- `{chapter_one}`",
      
        8323
                        f"- `{chapter_two}`",
      
        8324
                        "",
      
        8325
                    ]
      
        8326
                )
      
        8327
            )
      
        8328
        
        8329
            context = build_context(
      
        8330
                temp_dir=temp_dir,
      
        8331
                messages=[],
      
        8332
                safeguards=FakeSafeguards(),
      
        8333
                assess_confidence=assess_confidence,
      
        8334
                verify_action=verify_action,
      
        8335
                auto_recover=False,
      
        8336
            )
      
        8337
            queued: list[str] = []
      
        8338
            context.queue_steering_message_callback = queued.append
      
        8339
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        8340
            tool_call = ToolCall(
      
        8341
                id="write-2",
      
        8342
                name="write",
      
        8343
                arguments={"file_path": "", "content": "<html></html>\n"},
      
        8344
            )
      
        8345
            blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
      
        8346
            executor = FakeExecutor(
      
        8347
                [
      
        8348
                    ToolExecutionOutcome(
      
        8349
                        tool_call=tool_call,
      
        8350
                        state=ToolExecutionState.BLOCKED,
      
        8351
                        message=Message.tool_result_message(
      
        8352
                            tool_call_id=tool_call.id,
      
        8353
                            display_content=blocked_message,
      
        8354
                            result_content=blocked_message,
      
        8355
                            is_error=True,
      
        8356
                        ),
      
        8357
                        event_content=blocked_message,
      
        8358
                        is_error=True,
      
        8359
                        result_output=blocked_message,
      
        8360
                    )
      
        8361
                ]
      
        8362
            )
      
        8363
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        8364
            dod.implementation_plan = str(implementation_plan)
      
        8365
            dod.touched_files.extend([str(index_path), str(chapter_one)])
      
        8366
            dod.pending_items.append("Creating Chapter 2: Installation and Setup")
      
        8367
        
        8368
            await runner.execute_batch(
      
        8369
                tool_calls=[tool_call],
      
        8370
                tool_source="assistant",
      
        8371
                pending_tool_calls_seen=set(),
      
        8372
                emit=_noop_emit,
      
        8373
                summary=TurnSummary(final_response=""),
      
        8374
                dod=dod,
      
        8375
                executor=executor,  # type: ignore[arg-type]
      
        8376
                on_confirmation=None,
      
        8377
                on_user_question=None,
      
        8378
                emit_confirmation=None,
      
        8379
                consecutive_errors=0,
      
        8380
            )
      
        8381
        
        8382
            assert queued
      
        8383
            assert "did not provide a valid `file_path`" in queued[0]
      
        8384
            assert "Resume by creating `02-installation.html` now." in queued[0]
      
        8385
            assert (
      
        8386
                f"Prefer one `write` call for `{display_runtime_path(chapter_two)}` instead of more rereads."
      
        8387
                in queued[0]
      
        8388
            )
      
        8389
            assert context.recovery_context is not None
      
        8390
            assert context.recovery_context.attempts[-1].error == blocked_message