loader Public

Watch 0 Fork 0 Star 0
Python · 224940 bytes Raw Blame History
  
        1
        """Tests for tool-batch execution on RuntimeContext."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        from types import SimpleNamespace
      
        7
        
        8
        import pytest
      
        9
        
        10
        from loader.llm.base import Message, Role, ToolCall
      
        11
        from loader.runtime.context import RuntimeContext
      
        12
        from loader.runtime.dod import (
      
        13
            DefinitionOfDoneStore,
      
        14
            VerificationEvidence,
      
        15
            create_definition_of_done,
      
        16
        )
      
        17
        from loader.runtime.events import AgentEvent, TurnSummary
      
        18
        from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
      
        19
        from loader.runtime.path_display import display_runtime_path
      
        20
        from loader.runtime.permissions import (
      
        21
            PermissionMode,
      
        22
            build_permission_policy,
      
        23
            load_permission_rules,
      
        24
        )
      
        25
        from loader.runtime.reasoning_types import (
      
        26
            ActionVerification,
      
        27
            ConfidenceAssessment,
      
        28
            ConfidenceLevel,
      
        29
        )
      
        30
        from loader.runtime.recovery import RecoveryContext
      
        31
        from loader.runtime.tool_batches import (
      
        32
            ToolBatchRunner,
      
        33
        )
      
        34
        from loader.runtime.tool_batches import (
      
        35
            _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
      
        36
        )
      
        37
        from loader.runtime.workflow import sync_todos_to_definition_of_done
      
        38
        from loader.tools.base import ToolResult as RegistryToolResult
      
        39
        from loader.tools.base import create_default_registry
      
        40
        from tests.helpers.runtime_harness import ScriptedBackend
      
        41
        
        42
        
        43
        class FakeSession:
            """Minimal in-memory stand-in for the runtime session used by these tests.

            It offers only a mutable message history and a workflow timeline, each
            of which simply records whatever is appended to it.
            """

            def __init__(self, messages: list[Message]) -> None:
                # Copy the incoming list so later appends never mutate the caller's list.
                self.messages = list(messages)
                self.workflow_timeline: list[object] = []

            def append(self, message: Message) -> None:
                """Add ``message`` to the end of the message history."""
                self.messages.append(message)

            def append_workflow_timeline_entry(self, entry) -> None:
                """Record ``entry`` at the end of the workflow timeline."""
                self.workflow_timeline.append(entry)
      
        53
        
        54
        
        55
        class FakeCodeFilter:
            """No-op code filter exposing only the ``reset`` hook."""

            def reset(self) -> None:
                """Do nothing; the fake keeps no state to reset."""
                return None
      
        58
        
        59
        
        60
        class FakeSafeguards:
            """Pass-through safeguards double with a configurable loop-detection result.

            Content filters return their input unchanged, steering is always
            disabled, and text-loop detection always reports no loop. Only
            ``detect_loop`` is configurable, via ``detect_loop_result``.
            """

            def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
                # Opaque placeholders: the fake only needs these attributes to exist.
                self.action_tracker = object()
                self.validator = object()
                self.code_filter = FakeCodeFilter()
                self._detect_loop_result = detect_loop_result

            def filter_stream_chunk(self, content: str) -> str:
                """Return the streamed chunk unchanged."""
                return content

            def filter_complete_content(self, content: str) -> str:
                """Return the complete content unchanged."""
                return content

            def should_steer(self) -> bool:
                """Report that no steering is needed."""
                return False

            def get_steering_message(self) -> str | None:
                """Return ``None``; the fake never produces a steering message."""
                return None

            def record_response(self, content: str) -> None:
                """Ignore the response; nothing is recorded."""
                return None

            def detect_text_loop(self, content: str) -> tuple[bool, str]:
                """Always report that no text loop was detected."""
                return False, ""

            def detect_loop(self) -> tuple[bool, str]:
                """Return the result configured at construction time."""
                return self._detect_loop_result
      
        87
        
        88
        
        89
        class FakeExecutor:
            """Executor double that replays queued outcomes in FIFO order.

            Every tool call it receives is recorded in ``calls``, including a call
            that arrives after the queue has been exhausted.
            """

            def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
                # Copy so consuming the queue never mutates the caller's list.
                self._outcomes = list(outcomes)
                self.calls: list[ToolCall] = []

            async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
                """Record ``tool_call`` and return the next queued outcome.

                Extra keyword arguments are accepted and ignored.

                Raises:
                    AssertionError: if no outcome is left in the queue.
                """
                # Record first so an unexpected extra call is still visible in ``calls``.
                self.calls.append(tool_call)
                if not self._outcomes:
                    raise AssertionError("No fake tool outcome queued")
                return self._outcomes.pop(0)
      
        99
        
        100
        
        101
        def build_context(
            *,
            temp_dir: Path,
            messages: list[Message],
            safeguards: FakeSafeguards,
            assess_confidence,
            verify_action,
            recovery_context: RecoveryContext | None = None,
            confidence_scoring: bool = False,
            verification: bool = False,
            auto_recover: bool = True,
            min_confidence_for_action: int = 3,
        ) -> RuntimeContext:
            """Assemble a ``RuntimeContext`` wired with test doubles.

            Args:
                temp_dir: Directory used as project root, workspace root, and the
                    location permission rules are loaded from.
                messages: Initial history handed to the ``FakeSession``.
                safeguards: Safeguards double installed on the context.
                assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
                verify_action: Callable exposed as ``reasoning.verify_action``.
                recovery_context: Optional pre-existing recovery state.
                confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
                verification: Value for ``config.reasoning.verification``.
                auto_recover: Value for ``config.auto_recover``.
                min_confidence_for_action: Value for
                    ``config.reasoning.min_confidence_for_action``.

            Returns:
                The constructed context, in ``"execute"`` workflow mode with a
                ``WORKSPACE_WRITE`` permission policy.
            """
            registry = create_default_registry(temp_dir)
            registry.configure_workspace_root(temp_dir)

            rule_status = load_permission_rules(temp_dir)
            policy = build_permission_policy(
                active_mode=PermissionMode.WORKSPACE_WRITE,
                workspace_root=temp_dir,
                tool_requirements=registry.get_tool_requirements(),
                rules=rule_status.rules,
            )

            # SimpleNamespace stands in for the real config objects; only the
            # attributes listed here are provided.
            reasoning_config = SimpleNamespace(
                rollback=False,
                show_rollback_plan=False,
                completion_check=True,
                max_continuation_prompts=5,
                self_critique=False,
                confidence_scoring=confidence_scoring,
                min_confidence_for_action=min_confidence_for_action,
                verification=verification,
            )
            config = SimpleNamespace(
                force_react=False,
                max_recovery_attempts=2,
                auto_recover=auto_recover,
                reasoning=reasoning_config,
            )

            return RuntimeContext(
                project_root=temp_dir,
                backend=ScriptedBackend(),
                registry=registry,
                session=FakeSession(messages),  # type: ignore[arg-type]
                config=config,
                capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
                project_context=None,
                permission_policy=policy,
                permission_config_status=rule_status,
                workflow_mode="execute",
                safeguards=safeguards,
                reasoning=SimpleNamespace(
                    assess_confidence=assess_confidence,
                    verify_action=verify_action,
                ),
                recovery_context=recovery_context,
            )
      
        156
        
        157
        
        158
        def tool_outcome(
            *,
            tool_call: ToolCall,
            output: str,
            is_error: bool,
            state: ToolExecutionState = ToolExecutionState.EXECUTED,
            metadata: dict[str, object] | None = None,
        ) -> ToolExecutionOutcome:
            """Build a ``ToolExecutionOutcome`` whose text fields all carry ``output``.

            Args:
                tool_call: The call this outcome answers; its ``id`` links the
                    tool-result message back to the call.
                output: Text used for the message display/result content, the
                    event content, the result output, and the registry result.
                is_error: Error flag propagated to the message, the outcome, and
                    the registry result.
                state: Execution state to report (defaults to ``EXECUTED``).
                metadata: Optional registry-result metadata; a falsy value becomes
                    an empty dict.

            Returns:
                The assembled outcome.
            """
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=output,
                result_content=output,
                is_error=is_error,
            )
            registry_result = RegistryToolResult(
                output=output,
                is_error=is_error,
                metadata=metadata or {},
            )
            return ToolExecutionOutcome(
                tool_call=tool_call,
                state=state,
                message=result_message,
                event_content=output,
                is_error=is_error,
                result_output=output,
                registry_result=registry_result,
            )
      
        184
        
        185
        
        186
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
            """A LOW-confidence assessment must keep the tool call from executing.

            With confidence scoring enabled and ``min_confidence_for_action=3``,
            the test expects no action taken, the executor never invoked, the
            session history passed to the assessor, a ``[LOW CONFIDENCE WARNING]``
            user message appended, and a ``confidence`` event emitted.
            """
            captured: dict[str, str] = {}

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                # Keep the context string so the test can inspect what the runner passed in.
                captured["context"] = context
                return ConfidenceAssessment(
                    action=f"{tool_name} with {tool_args}",
                    tool_name=tool_name,
                    tool_args=tool_args,
                    level=ConfidenceLevel.LOW,
                    reasoning="Need to inspect the target first.",
                    risks=["Unknown target file"],
                )

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for skipped actions")

            context = build_context(
                temp_dir=temp_dir,
                messages=[
                    Message(role=Role.USER, content="Please inspect the project."),
                    Message(role=Role.ASSISTANT, content="I will read the file next."),
                ],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                confidence_scoring=True,
                min_confidence_for_action=3,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            # An outcome is queued, but the assertions below expect it never to be consumed.
            executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="unused", is_error=False)])
            result = await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert result.actions_taken == []
            assert executor.calls == []
            assert "Please inspect the project." in captured["context"]
            assert context.session.messages[-1].role == Role.USER
            assert "[LOW CONFIDENCE WARNING]" in context.session.messages[-1].content
            event_types = [event.type for event in events]
            assert "confidence" in event_types
      
        245
        
        246
        
        247
        @pytest.mark.asyncio
        async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
            """A failing tool call with ``auto_recover=True`` must start recovery tracking.

            The test expects a recovery context on the runtime context, the tool
            result message recorded in the summary and as the last session
            message, and a ``recovery`` event emitted.
            """

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for failed actions")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=True,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
            # The single queued outcome is an error, which is what should trigger recovery.
            executor = FakeExecutor(
                [tool_outcome(tool_call=tool_call, output="command failed", is_error=True)]
            )
            summary = TurnSummary(final_response="")
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=create_definition_of_done("Run tests"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is not None
            assert summary.tool_result_messages
            assert context.session.messages[-1] == summary.tool_result_messages[-1]
            assert any(event.type == "recovery" for event in events)
      
        290
        
        291
        
        292
        @pytest.mark.asyncio
        async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
            """The ``tool_result`` event must carry the registry result's metadata."""

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "python -m http.server 8000", "background": True},
            )
            # Metadata attached to the fake registry result; it should reach the event intact.
            metadata = {
                "job_id": "bash-1",
                "status": "running",
                "background": True,
            }
            executor = FakeExecutor(
                [
                    tool_outcome(
                        tool_call=tool_call,
                        output="Started bash job bash-1",
                        is_error=False,
                        metadata=metadata,
                    )
                ]
            )
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Launch a preview server"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # ``next`` raises StopIteration (failing the test) if no tool_result event was emitted.
            tool_result = next(event for event in events if event.type == "tool_result")
            assert tool_result.tool_metadata == metadata
      
        350
        
        351
        
        352
        @pytest.mark.asyncio
        async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
            """With verification enabled, the context's ``verify_action`` must be used.

            The test expects ``verify_action`` to receive the tool output, the
            pre-existing recovery context to be kept and to record the read as a
            successful step, the tool result to be the last session message, and a
            ``verification`` event to be emitted.
            """
            verification_calls: list[str] = []

            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                # Track the result text so the test can confirm what was verified.
                verification_calls.append(result)
                return ActionVerification(
                    tool_name=tool_name,
                    tool_args=tool_args,
                    expected_outcome="Success",
                    actual_result=result,
                    verified=False,
                    discrepancies=["File contents did not match"],
                    needs_correction=True,
                    correction_suggestion="Read the file before editing again.",
                )

            existing_recovery = RecoveryContext(
                original_tool="edit",
                original_args={"file_path": "README.md"},
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                verification=True,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            executor = FakeExecutor(
                [tool_outcome(tool_call=tool_call, output="file contents", is_error=False)]
            )
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert verification_calls == ["file contents"]
            assert context.recovery_context is existing_recovery
            assert existing_recovery.successful_steps == [
                ("read", {"file_path": "README.md"})
            ]
            assert context.session.messages[-1].role == Role.TOOL
            assert context.session.messages[-1].content == "file contents"
            assert any(event.type == "verification" for event in events)
      
        415
        
        416
        
        417
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
            temp_dir: Path,
        ) -> None:
            """A successful ``bash`` listing must not discard an active recovery context.

            The recovery context already holds one failed ``read`` attempt. After
            the successful ``ls chapters`` call, the test expects the same context
            object to remain in place with the bash call recorded as a successful
            step.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Recovery state left behind by an earlier failed read of a missing file.
            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args={"file_path": "chapters/04-data-types.html"},
            )
            existing_recovery.add_attempt(
                "read",
                {"file_path": "chapters/04-data-types.html"},
                "File not found",
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "ls chapters"},
            )
            executor = FakeExecutor(
                [tool_outcome(tool_call=tool_call, output="01-introduction.html", is_error=False)]
            )

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is existing_recovery
            assert existing_recovery.successful_steps == [
                ("bash", {"command": "ls chapters"})
            ]
      
        483
        
        484
        
        485
        @pytest.mark.asyncio
        async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``patch`` call must clear an active recovery context.

            The recovery context already holds one failed ``read`` attempt. After
            the patch succeeds, the test expects ``context.recovery_context`` to be
            ``None``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Recovery state left behind by an earlier failed read of a missing file.
            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args={"file_path": "chapters/04-data-types.html"},
            )
            existing_recovery.add_attempt(
                "read",
                {"file_path": "chapters/04-data-types.html"},
                "File not found",
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(
                id="patch-1",
                name="patch",
                arguments={
                    "file_path": "index.html",
                    "hunks": [
                        {
                            "old_start": 1,
                            "old_lines": 1,
                            "new_start": 1,
                            "new_lines": 1,
                            "lines": ["-a", "+b"],
                        }
                    ],
                },
            )
            executor = FakeExecutor(
                [tool_outcome(tool_call=tool_call, output="Patched index.html", is_error=False)]
            )

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is None
      
        551
        
        552
        
        553
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_duplicate_observation_nudge(
            temp_dir: Path,
        ) -> None:
            """Check the steering nudge queued after a duplicate ``read`` outcome.

            The executor reports a ``DUPLICATE`` outcome for re-reading ``index.html``
            while the implementation plan lists ``04-variables.html``, which is never
            created on disk. The test expects exactly one persistent steering message
            that tells the agent to reuse the earlier observation and to create the
            missing chapter, and expects no ephemeral steering messages.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            chapters_dir = temp_dir / "chapters"
            index_path = temp_dir / "index.html"
            chapter_bodies = {
                "01-introduction.html": "<h1>Intro</h1>\n",
                "02-setup.html": "<h1>Setup</h1>\n",
                "03-basics.html": "<h1>Basics</h1>\n",
            }

            # Prior conversation: a glob listing, a read of chapter one, and a read of
            # the index that the upcoming tool call repeats.
            glob_listing = "\n".join(
                f"{temp_dir}/chapters/{filename}" for filename in chapter_bodies
            )
            chapter_one_heading = "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            history = [
                Message(
                    role=Role.TOOL,
                    content="Observation [glob]: Result: " + glob_listing,
                    tool_results=[],
                ),
                Message(
                    role=Role.ASSISTANT,
                    content="I already inspected the first chapter title.",
                    tool_calls=[
                        ToolCall(
                            id="read-ch1",
                            name="read",
                            arguments={
                                "file_path": str(chapters_dir / "01-introduction.html")
                            },
                        )
                    ],
                ),
                Message.tool_result_message(
                    tool_call_id="read-ch1",
                    display_content=chapter_one_heading,
                    result_content=chapter_one_heading,
                ),
                Message(
                    role=Role.ASSISTANT,
                    content="I should update the index now.",
                    tool_calls=[
                        ToolCall(
                            id="read-index",
                            name="read",
                            arguments={"file_path": str(index_path)},
                        )
                    ],
                ),
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # On-disk state: the index and three chapters exist; chapter four does not.
            chapters_dir.mkdir()
            index_path.write_text("<ul></ul>\n")
            for filename, body in chapter_bodies.items():
                (chapters_dir / filename).write_text(body)

            # The plan declares every existing file plus the missing fourth chapter.
            planned_paths = [
                index_path,
                *(chapters_dir / filename for filename in chapter_bodies),
                chapters_dir / "04-variables.html",
            ]
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{planned}`" for planned in planned_paths)
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context.session.current_task = (
                f"Update {index_path} with the right chapter links."
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            repeated_read = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{index_path} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=repeated_read,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=repeated_read.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Fix the chapter links")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items.append("Create the remaining chapter files")
            await runner.execute_batch(
                tool_calls=[repeated_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            nudge = persistent_messages[0]
            assert "Reuse the earlier observation instead of repeating it." in nudge
            assert "A declared output artifact is still missing." in nudge
            assert "Resume by creating `04-variables.html` now." in nudge
            missing_display = display_runtime_path(chapters_dir / "04-variables.html")
            assert (
                f"Prefer one `write` call for `{missing_display}` instead of more rereads."
                in nudge
            )
            assert ephemeral_messages == []
      
        702
        
        703
        
        704
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html_output_active(
            temp_dir: Path,
        ) -> None:
            """Check the nudge when the index links to a chapter that is not on disk.

            ``guide/index.html`` links to ``chapters/02-installation.html``, but only
            ``01-introduction.html`` is written. After a ``DUPLICATE`` read outcome for
            the index, the test expects one persistent steering message that mentions
            the pending item and asks for ``02-installation.html`` to be created, does
            not claim all planned artifacts exist, and expects no ephemeral messages.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            # Guide layout: an index with two links, but only the first target exists.
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_path = guide_dir / "index.html"
            first_chapter = chapters_dir / "01-introduction.html"
            index_links = [
                '<a href="chapters/01-introduction.html">Intro</a>\n',
                '<a href="chapters/02-installation.html">Install</a>\n',
            ]
            index_path.write_text("".join(index_links))
            first_chapter.write_text("<h1>Intro</h1>\n")

            # The plan names the index and the chapters directory, not each chapter.
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapters_dir}/` (directory for chapter files)",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            earlier_read = ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            history = [
                Message(
                    role=Role.ASSISTANT,
                    content="I should keep building the guide.",
                    tool_calls=[earlier_read],
                ),
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = f"Build the guide rooted at {index_path}."
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            repeated_read = ToolCall(
                id="read-dup-rooted",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{index_path} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=repeated_read,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=repeated_read.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create a multi-file HTML guide with chapters.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files = [str(index_path), str(first_chapter)]
            dod.completed_items = ["Create chapter files with appropriate content"]
            dod.pending_items.append("Create the remaining chapter files")

            await runner.execute_batch(
                tool_calls=[repeated_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            nudge = persistent_messages[0]
            assert "Create the remaining chapter files" in nudge
            assert "Resume by creating `02-installation.html` now." in nudge
            assert "All explicitly planned artifacts already exist on disk." not in nudge
            assert ephemeral_messages == []
      
        827
        
        828
        
        829
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
            temp_dir: Path,
        ) -> None:
            """Check that a stale ``TodoWrite`` does not reopen a finished file todo.

            The batch contains a successful ``write`` for ``03-first-website.html``
            followed by a ``TodoWrite`` whose payload still lists that todo as
            pending. The test expects the chapter-three todo to end up in
            ``dod.completed_items`` and not in ``dod.pending_items``, while the
            chapter-four todo stays pending.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            def pending_todos() -> list[dict[str, str]]:
                # Fresh dicts on every call so the three consumers below never share
                # (and cannot mutate) the same payload objects.
                return [
                    {
                        "content": f"Create {filename}",
                        "active_form": f"Creating {filename}",
                        "status": "pending",
                    }
                    for filename in (
                        "03-first-website.html",
                        "04-configuration-basics.html",
                    )
                ]

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            sync_todos_to_definition_of_done(dod, pending_todos())

            # Only the parent directory is created here; the write itself is faked
            # by the canned executor outcome below.
            chapter_path = (
                temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
            )
            chapter_path.parent.mkdir(parents=True)

            write_call = ToolCall(
                id="write-ch3",
                name="write",
                arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
            )
            stale_todo_call = ToolCall(
                id="todo-stale",
                name="TodoWrite",
                arguments={"todos": pending_todos()},
            )
            canned_outcomes = [
                tool_outcome(
                    tool_call=write_call,
                    output=f"Successfully wrote {chapter_path}",
                    is_error=False,
                ),
                tool_outcome(
                    tool_call=stale_todo_call,
                    output="Todos updated",
                    is_error=False,
                    metadata={"new_todos": pending_todos()},
                ),
            ]
            executor = FakeExecutor(canned_outcomes)

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call, stale_todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert "Create 03-first-website.html" in dod.completed_items
            assert "Create 03-first-website.html" not in dod.pending_items
            assert "Create 04-configuration-basics.html" in dod.pending_items
      
        946
        
        947
        
        948
        @pytest.mark.asyncio
        async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
            temp_dir: Path,
        ) -> None:
            """Check what a successful chapter ``glob`` leaves behind.

            After a ``glob`` over ``chapters/*.html`` succeeds, the test expects no
            persistent or ephemeral steering messages, exactly one tool result message
            on the summary, and no ``"Verified chapter inventory:"`` text in that
            message.

            NOTE(review): the test name says "queues", but every assertion checks that
            nothing is queued or appended - confirm the name still matches the intent.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is ever invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            # On-disk state: two real chapters and an index with an empty list.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            intro_path = chapters_dir / "01-introduction.html"
            setup_path = chapters_dir / "02-setup.html"
            intro_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
            setup_path.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
            index_path = temp_dir / "index.html"
            index_path.write_text("<ul></ul>\n")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = (
                f"Update {index_path} so the chapter links match the sibling files."
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters_dir), "pattern": "*.html"},
            )
            glob_listing = "\n".join(str(found) for found in (intro_path, setup_path))
            executor = FakeExecutor(
                [tool_outcome(tool_call=glob_call, output=glob_listing, is_error=False)]
            )

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages == []
            assert ephemeral_messages == []
            recorded_results = summary.tool_result_messages
            assert len(recorded_results) == 1
            assert "Verified chapter inventory:" not in recorded_results[0].content
      
        1032
        
        1033
        
        1034
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
            temp_dir: Path,
        ) -> None:
            """Check that a successful ``edit`` of a consistent TOC adds no follow-up noise.

            ``index.html`` is written with a chapter list matching the two files under
            ``chapters/`` before the batch runs, and the fake executor reports the edit
            as successful. Afterwards no tool result may contain a
            ``Semantic verification preview:`` section and no steering message
            (persistent or ephemeral) may have been queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run for this scenario")

            task = (
                "Update index.html so every chapter link and title matches the real HTML files in chapters/."
            )

            # Chapter files that the table of contents is expected to mirror.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            chapter_headings = {
                "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
                "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
            }
            for chapter_name, heading in chapter_headings.items():
                (chapters_dir / chapter_name).write_text(heading)

            stale_toc = "".join(
                [
                    '<ul class="chapter-list">\n',
                    '    <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n',
                    '    <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n',
                    "</ul>\n",
                ]
            )
            fresh_toc = "".join(
                [
                    '<ul class="chapter-list">\n',
                    '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
                    '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
                    "</ul>\n",
                ]
            )
            # The file on disk already holds the post-edit content; the executor is faked.
            index_file = temp_dir / "index.html"
            index_file.write_text(fresh_toc)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = task

            # Capture every steering message the runner tries to queue.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_file),
                    "old_string": stale_toc,
                    "new_string": fresh_toc,
                },
            )
            edit_outcome = tool_outcome(
                tool_call=edit_call,
                output=f"Successfully edited {index_file}",
                is_error=False,
            )
            fake_executor = FakeExecutor([edit_outcome])

            turn_summary = TurnSummary(final_response="")
            definition_of_done = create_definition_of_done(task)
            await batch_runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition_of_done,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            for result_message in turn_summary.tool_result_messages:
                assert "Semantic verification preview:" not in result_message.content
            assert not persistent_messages
            assert not ephemeral_messages
      
        1134
        
        1135
        
        1136
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
            temp_dir: Path,
        ) -> None:
            """Check that reading a reference ``index.html`` triggers no TOC follow-up.

            The task prompt asks to study an existing guide, and the only tool call is
            a ``read`` of an ``index.html`` containing a chapter list. After the batch
            no steering message may be queued and no tool result may contain a
            ``Semantic verification preview:`` section.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run for this scenario")

            # Reference guide layout: two chapter files plus an index listing them.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            chapter_headings = {
                "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
                "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
            }
            for chapter_name, heading in chapter_headings.items():
                (chapters_dir / chapter_name).write_text(heading)

            index_html = "".join(
                [
                    "<h2>Table of Contents</h2>\n",
                    '<ul class="chapter-list">\n',
                    '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
                    '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
                    "</ul>\n",
                ]
            )
            index_file = temp_dir / "index.html"
            index_file.write_text(index_html)

            prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = prompt  # type: ignore[attr-defined]

            # Capture every steering message the runner tries to queue.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            read_call = ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_file)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=index_file.read_text(),
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            definition_of_done = create_definition_of_done(prompt)
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition_of_done,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not persistent_messages
            assert not ephemeral_messages
            for result_message in turn_summary.tool_result_messages:
                assert "Semantic verification preview:" not in result_message.content
      
        1228
        
        1229
        
        1230
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
            temp_dir: Path,
        ) -> None:
            """Check the steering queued after a reference read while three todos are pending.

            The implementation plan lists ``chapters/`` before ``index.html``. After a
            successful ``read`` of the reference chapter, the first todo must appear in
            ``dod.completed_items`` and a persistent steering message must point at the
            next pending item and tell the agent to create ``chapters/``. No persistent
            message may mention the reference file name, and nothing ephemeral may be
            queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference chapter that the agent "reads".
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            # Planned (not yet created) output locations, directory listed first.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{chapters_dir}/`\n"
                f"- `{nginx_root / 'index.html'}`\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture every steering message the runner tries to queue.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_file)

            todo_titles = (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "Create the nginx directory structure",
                "Create the nginx index.html file",
            )
            pending_todos = [
                {
                    "content": title,
                    "active_form": f"Working on: {title}",
                    "status": "pending",
                }
                for title in todo_titles
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            def queued(fragment: str) -> bool:
                # True when at least one persistent steering message contains ``fragment``.
                return any(fragment in queued_message for queued_message in persistent_messages)

            assert todo_titles[0] in dod.completed_items
            assert queued(
                "Continue with the next pending item: `Create the nginx directory structure`"
            )
            assert queued("Resume by creating `chapters/` now.")
            assert not queued("01-introduction.html")
            assert not ephemeral_messages
      
        1348
        
        1349
        
        1350
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
            temp_dir: Path,
        ) -> None:
            """Check that steering targets ``chapters/`` even when the plan lists ``index.html`` first.

            The implementation plan names ``index.html`` ahead of the ``chapters/``
            directory, and todos are synced with ``project_root=temp_dir``. After a
            successful reference ``read``, a persistent steering message must point at
            the directory-structure todo and tell the agent to create ``chapters/``.
            No message may say ``Next step: create `index.html`.`` and nothing
            ephemeral may be queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference chapter that the agent "reads".
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            # Planned (not yet created) output locations, file listed before directory.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{nginx_root / 'index.html'}`\n"
                f"- `{chapters_dir}/`\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture every steering message the runner tries to queue.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_file)

            todo_titles = (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "Create the nginx directory structure",
                "Create the nginx index.html file",
            )
            pending_todos = [
                {
                    "content": title,
                    "active_form": f"Working on: {title}",
                    "status": "pending",
                }
                for title in todo_titles
            ]
            sync_todos_to_definition_of_done(
                dod,
                pending_todos,
                project_root=temp_dir,
            )

            read_call = ToolCall(
                id="read-reference-index-first",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            def queued(fragment: str) -> bool:
                # True when at least one persistent steering message contains ``fragment``.
                return any(fragment in queued_message for queued_message in persistent_messages)

            assert len(persistent_messages) > 0
            assert queued(
                "Continue with the next pending item: `Create the nginx directory structure`"
            )
            assert queued("Resume by creating `chapters/` now.")
            assert not queued("Next step: create `index.html`.")
            assert not ephemeral_messages
      
        1469
        
        1470
        
        1471
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
            temp_dir: Path,
        ) -> None:
            """Check the steering queued when a reference ``read`` comes back as a duplicate.

            The first todo is already completed and the executor returns a
            ``ToolExecutionState.DUPLICATE`` outcome for the read. Exactly one
            persistent steering message must be queued. It must tell the agent to
            reuse the earlier observation and to continue with the next pending todo,
            and it must not contain ``Update ```. Nothing ephemeral may be queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call here fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call here fails the test.
                raise AssertionError("Verification should not run for this scenario")

            reference_file = temp_dir / "fortran" / "index.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text("<h1>Fortran Beginner's Guide</h1>\n")

            # Conversation history already holds the observation from the first read.
            earlier_observation = Message(
                role=Role.TOOL,
                content="Observation [read]: Result: " "<h1>Fortran Beginner's Guide</h1>\n",
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[earlier_observation],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )
            runtime_context.session.current_task = prompt

            # Capture every steering message the runner tries to queue.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done(prompt)

            todo_states = (
                (
                    "Examine the existing Fortran guide structure to understand the cadence and format",
                    "completed",
                ),
                ("Create the nginx directory structure", "pending"),
                ("Create the nginx index.html file", "pending"),
            )
            todos = [
                {
                    "content": title,
                    "active_form": f"Working on: {title}",
                    "status": status,
                }
                for title, status in todo_states
            ]
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{reference_file} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            # Built by hand because the outcome needs the DUPLICATE state.
            skip_result = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=skip_notice,
                result_content=skip_notice,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=skip_result,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            fake_executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            steering = persistent_messages[0]
            assert "Reuse the earlier observation instead of repeating it." in steering
            assert (
                "Continue with the next pending item: `Create the nginx directory structure`"
                in steering
            )
            assert "Update `" not in steering
            assert not ephemeral_messages
      
        1593
        
        1594
        
        1595
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check the steering queued after a successful read of a reference chapter.

            The implementation plan names ``index.html`` and a second chapter that
            this test never writes to disk. After the read executes, the persistent
            steering is expected to confirm the "Examine ..." todo and to say
            "Resume by creating `index.html` now.", and it must not fall back to
            the next generic pending todo. No ephemeral steering is expected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if a confidence score is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if action verification is requested.
                raise AssertionError("Verification should not run for this scenario")

            guides_dir = temp_dir / "Loader" / "guides"
            nginx_root = guides_dir / "nginx"
            chapter_dir = nginx_root / "chapters"
            index_file = nginx_root / "index.html"
            first_chapter = chapter_dir / "01-introduction.html"
            fortran_chapter = guides_dir / "fortran" / "chapters" / "01-introduction.html"

            # Only the first nginx chapter is written; index.html is never created.
            chapter_dir.mkdir(parents=True)
            first_chapter.write_text("<html></html>\n")

            fortran_chapter.parent.mkdir(parents=True, exist_ok=True)
            fortran_chapter.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

            # Heading lines, one bullet per planned path, then an empty entry so the
            # joined text ends with a newline.
            planned_entries = (
                f"{nginx_root}/",
                f"{chapter_dir}/",
                index_file,
                first_chapter,
                chapter_dir / "02-installation.html",
            )
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{entry}`" for entry in planned_entries)
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_steering: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_steering.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.touched_files.append(str(first_chapter))

            # Each todo's active form is its content prefixed with "Working on: ".
            todo_contents = (
                "Examine the existing Fortran guide structure to understand the format and cadence",
                "Create each chapter file with appropriate content",
                "Ensure all files follow the same structure and style as the Fortran guide",
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": content,
                        "active_form": f"Working on: {content}",
                        "status": "pending",
                    }
                    for content in todo_contents
                ],
            )

            read_call = ToolCall(
                id="read-reference-chapter",
                name="read",
                arguments={"file_path": str(fortran_chapter)},
            )
            observation = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            result_message = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=observation,
                result_content=observation,
            )
            outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.EXECUTED,
                message=result_message,
                event_content=observation,
                is_error=False,
                result_output=observation,
            )
            executor = FakeExecutor([outcome])

            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            confirmed_progress = (
                "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
            )
            generic_next_item = (
                "Continue with the next pending item: `Create each chapter file with appropriate content`"
            )

            assert queued_steering
            assert any(confirmed_progress in note for note in queued_steering)
            assert any("Resume by creating `index.html` now." in note for note in queued_steering)
            assert all(generic_next_item not in note for note in queued_steering)
            assert not queued_ephemeral
      
        1729
        
        1730
        
        1731
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
            temp_dir: Path,
        ) -> None:
            """Check steering for a duplicate read once every planned file exists.

            All paths named in the implementation plan are created up front. The
            pending items include "Create 07-performance-tuning.html", a file the
            plan does not list. Exactly one persistent steering message is expected;
            it must mention the "Verify all guide files are linked and complete"
            item and must not mention the 07 chapter. No ephemeral steering is
            expected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if a confidence score is requested.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if action verification is requested.
                raise AssertionError("Verification should not run for this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapter_dir.mkdir()

            index_file = nginx_root / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            second_chapter = chapter_dir / "02-installation.html"
            # Write every file the plan below lists, in the same order.
            for artifact, markup in (
                (index_file, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(markup)

            # Heading lines, one bullet per planned path, then an empty entry so the
            # joined text ends with a newline.
            planned_entries = (
                f"{nginx_root}/",
                f"{chapter_dir}/",
                index_file,
                first_chapter,
                second_chapter,
            )
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{entry}`" for entry in planned_entries)
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_steering: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_steering.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Verify all guide files are linked and complete",
                "Complete the requested work",
            ]

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{first_chapter} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            result_message = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=skip_notice,
                result_content=skip_notice,
            )
            outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=result_message,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([outcome])

            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_steering) == 1
            steering = queued_steering[0]
            assert "Verify all guide files are linked and complete" in steering
            assert "Create 07-performance-tuning.html" not in steering
            assert not queued_ephemeral
      
        1845
        
        1846
        
        1847
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Check that a duplicate read after the plan is built hands off to verification.

            All paths named in the implementation plan are created up front and the
            definition of done carries one verification command. The pending items
            are an unplanned "Create 07-performance-tuning.html" and "Complete the
            requested work". Exactly one persistent steering message is expected,
            stating that all planned artifacts exist and to move to verification or
            final confirmation, without mentioning the 07 chapter. No ephemeral
            steering is expected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if a confidence score is requested.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if action verification is requested.
                raise AssertionError("Verification should not run for this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapter_dir.mkdir()

            index_file = nginx_root / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            second_chapter = chapter_dir / "02-installation.html"
            # Write every file the plan below lists, in the same order.
            for artifact, markup in (
                (index_file, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(markup)

            # Heading lines, one bullet per planned path, then an empty entry so the
            # joined text ends with a newline.
            planned_entries = (
                f"{nginx_root}/",
                f"{chapter_dir}/",
                index_file,
                first_chapter,
                second_chapter,
            )
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{entry}`" for entry in planned_entries)
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_steering: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_steering.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.verification_commands = [f"ls -la {nginx_root}"]
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Complete the requested work",
            ]

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{first_chapter} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            result_message = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=skip_notice,
                result_content=skip_notice,
            )
            outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=result_message,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([outcome])

            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_steering) == 1
            steering = queued_steering[0]
            assert "All explicitly planned artifacts already exist on disk." in steering
            assert "Move to verification or final confirmation using the files already on disk." in steering
            assert "Create 07-performance-tuning.html" not in steering
            assert not queued_ephemeral
      
        1965
        
        1966
        
        1967
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
            temp_dir: Path,
        ) -> None:
            """Check that stale "create" todos are not echoed once the plan is built.

            All paths named in the implementation plan are created up front, yet the
            pending items still say "Create 01-getting-started.html" and "Creating
            02-installation.html" for files that are already on disk. Exactly one
            persistent steering message is expected, stating that all planned
            artifacts exist and to move to verification or final confirmation,
            without repeating either stale item. No ephemeral steering is expected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if a confidence score is requested.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if action verification is requested.
                raise AssertionError("Verification should not run for this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapter_dir.mkdir()

            index_file = nginx_root / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            second_chapter = chapter_dir / "02-installation.html"
            # Write every file the plan below lists, in the same order.
            for artifact, markup in (
                (index_file, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(markup)

            # Heading lines, one bullet per planned path, then an empty entry so the
            # joined text ends with a newline.
            planned_entries = (
                f"{nginx_root}/",
                f"{chapter_dir}/",
                index_file,
                first_chapter,
                second_chapter,
            )
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{entry}`" for entry in planned_entries)
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_steering: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_steering.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.verification_commands = [f"ls -la {nginx_root}"]
            dod.pending_items = [
                "Create 01-getting-started.html",
                "Creating 02-installation.html",
                "Complete the requested work",
            ]

            read_call = ToolCall(
                id="read-dup-built-stale",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{first_chapter} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            result_message = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=skip_notice,
                result_content=skip_notice,
            )
            outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=result_message,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([outcome])

            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_steering) == 1
            steering = queued_steering[0]
            assert "All explicitly planned artifacts already exist on disk." in steering
            assert "Move to verification or final confirmation using the files already on disk." in steering
            assert "Create 01-getting-started.html" not in steering
            assert "Creating 02-installation.html" not in steering
            assert not queued_ephemeral
      
        2087
        
        2088
        
        2089
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_read_after_plan_complete_pushes_review_handoff(
            temp_dir: Path,
        ) -> None:
            """A successful read after the plan is fully built queues one review handoff.

            Every path listed in the implementation plan is created on disk before the
            batch runs, and the only tool call is a successful ``read`` of chapter one.
            The test expects no persistent steering message and exactly one ephemeral
            message that names the review todo rather than the stale "create" todo.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Materialize the complete planned output set before the runner is involved.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapter_dir.mkdir()
            index_file = nginx_dir / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            second_chapter = chapter_dir / "02-installation.html"
            for built_file, markup in (
                (index_file, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                built_file.write_text(markup)

            # The plan lists both directories (with a trailing slash) and all three files.
            planned_entries = (
                f"{nginx_dir}/",
                f"{chapter_dir}/",
                str(index_file),
                str(first_chapter),
                str(second_chapter),
            )
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(
                "\n".join(
                    [
                        "# Implementation Plan",
                        "",
                        "## File Changes",
                        *(f"- `{entry}`" for entry in planned_entries),
                        "",
                    ]
                )
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            done_definition.implementation_plan = str(plan_file)
            done_definition.verification_commands = [f"ls -la {nginx_dir}"]
            todo_specs = (
                ("Create 01-getting-started.html", "Creating 01-getting-started.html"),
                (
                    "Ensure all files are properly linked and formatted consistently",
                    "Reviewing guide consistency and linkage",
                ),
            )
            sync_todos_to_definition_of_done(
                done_definition,
                [
                    {"content": content, "active_form": active_form, "status": "pending"}
                    for content, active_form in todo_specs
                ],
            )

            read_call = ToolCall(
                id="read-built-review",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=first_chapter.read_text(),
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner_execute(batch_runner, read_call, turn_summary, done_definition, fake_executor) if False else await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log == []
            assert len(ephemeral_log) == 1
            handoff = ephemeral_log[0]
            for expected_fragment in (
                "All explicitly planned artifacts already exist.",
                "Ensure all files are properly linked and formatted consistently",
                "do not keep broad-rereading the output set",
                "If no specific mismatch remains, move to verification now.",
            ):
                assert expected_fragment in handoff
            assert "Create 01-getting-started.html" not in handoff
      
        2200
        
        2201
        
        2202
        @pytest.mark.asyncio
        async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
            temp_dir: Path,
        ) -> None:
            """A reference read with an "examine" todo pending steers toward the next todo.

            The batch holds one successful ``read`` of a reference chapter. The test
            expects a persistent steering message that names the following pending item
            and tells the model to stop gathering reference material, with nothing
            queued on the ephemeral channel.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            # Both todos use the "Working on: <content>" active form.
            pending_items = (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "Create the nginx index.html file",
            )
            sync_todos_to_definition_of_done(
                done_definition,
                [
                    {
                        "content": item,
                        "active_form": f"Working on: {item}",
                        "status": "pending",
                    }
                    for item in pending_items
                ],
            )

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Each fragment must appear in at least one persistent steering message.
            for expected_fragment in (
                "Continue with the next pending item: `Create the nginx index.html file`",
                "stop gathering more reference material and perform the change now",
            ):
                assert any(expected_fragment in queued for queued in steering_log)
            assert ephemeral_log == []
      
        2294
        
        2295
        
        2296
        @pytest.mark.asyncio
        async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
            temp_dir: Path,
        ) -> None:
            """The handoff after a discovery read is queued persistently, not ephemerally.

            The batch holds one successful ``read`` of a reference chapter while an
            "examine" todo and a "create directory structure" todo are pending. The
            test expects a persistent steering message pointing at the directory todo
            and an empty ephemeral channel.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            # Both todos use the "Working on: <content>" active form.
            pending_items = (
                "First, examine the existing fortran guide structure and content",
                "Create the nginx directory structure",
            )
            sync_todos_to_definition_of_done(
                done_definition,
                [
                    {
                        "content": item,
                        "active_form": f"Working on: {item}",
                        "status": "pending",
                    }
                    for item in pending_items
                ],
            )

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            next_item_fragment = (
                "Continue with the next pending item: `Create the nginx directory structure`"
            )
            assert any(next_item_fragment in queued for queued in steering_log)
            assert ephemeral_log == []
      
        2385
        
        2386
        
        2387
        @pytest.mark.asyncio
        async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
            temp_dir: Path,
        ) -> None:
            """After a successful ``mkdir -p`` the nudge names the next planned file.

            The plan lists the chapters directory and ``index.html``. The batch holds a
            single ``bash`` call whose faked outcome is an empty, non-error output. The
            test expects the last persistent steering message to announce that directory
            setup is complete and to ask for ``index.html``, with nothing queued on the
            ephemeral channel.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # These paths are only named in the plan; the test does not create them.
            nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            planned_entries = (f"{chapter_dir}/", str(nginx_dir / "index.html"))
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(
                "\n".join(
                    [
                        "# Implementation Plan",
                        "",
                        "## File Changes",
                        *(f"- `{entry}`" for entry in planned_entries),
                        "",
                    ]
                )
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            done_definition.implementation_plan = str(plan_file)
            todo_specs = (
                (
                    "Create the nginx directory structure",
                    "Creating the nginx directory structure",
                ),
                (
                    "Develop the main index.html file with proper structure",
                    "Developing the main index.html file with proper structure",
                ),
            )
            sync_todos_to_definition_of_done(
                done_definition,
                [
                    {"content": content, "active_form": active_form, "status": "pending"}
                    for content, active_form in todo_specs
                ],
            )

            mkdir_call = ToolCall(
                id="mkdir-nginx",
                name="bash",
                arguments={"command": f"mkdir -p {chapter_dir}"},
            )
            mkdir_outcome = tool_outcome(
                tool_call=mkdir_call,
                output="",
                is_error=False,
            )
            fake_executor = FakeExecutor([mkdir_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            nudge = steering_log[-1]
            for expected_fragment in (
                "Directory setup is complete.",
                "Next step: create `index.html`.",
                "Write a compact but real initial version of that file now",
            ):
                assert expected_fragment in nudge
            assert ephemeral_log == []
      
        2489
        
        2490
        
        2491
        @pytest.mark.asyncio
        async def test_tool_batch_runner_first_chapter_handoff_stays_persistent_until_substantive_output_exists(
            temp_dir: Path,
        ) -> None:
            """Writing the index yields a persistent handoff that targets the first chapter.

            The chapters directory exists and the plan lists it, ``index.html`` and
            ``01-introduction.html``. The batch holds one ``write`` of the index with a
            faked success outcome. The test expects the last persistent steering message
            to confirm progress, name ``01-introduction.html`` with its resolved path,
            and omit the "compact but real initial version" wording. The ephemeral
            channel must stay empty.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            chapter_dir.mkdir(parents=True)
            index_file = nginx_dir / "index.html"
            first_chapter = chapter_dir / "01-introduction.html"

            planned_entries = (f"{chapter_dir}/", str(index_file), str(first_chapter))
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(
                "\n".join(
                    [
                        "# Implementation Plan",
                        "",
                        "## File Changes",
                        *(f"- `{entry}`" for entry in planned_entries),
                        "",
                    ]
                )
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            done_definition.implementation_plan = str(plan_file)
            todo_specs = (
                (
                    "Create the main index.html file with proper structure",
                    "Creating the main index.html file with proper structure",
                ),
                (
                    "Create each chapter file with appropriate content",
                    "Creating each chapter file with appropriate content",
                ),
            )
            sync_todos_to_definition_of_done(
                done_definition,
                [
                    {"content": content, "active_form": active_form, "status": "pending"}
                    for content, active_form in todo_specs
                ],
            )

            write_call = ToolCall(
                id="write-index",
                name="write",
                arguments={
                    "file_path": str(index_file),
                    "content": "<html></html>\n",
                },
            )
            write_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_file}",
                is_error=False,
            )
            fake_executor = FakeExecutor([write_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            assert ephemeral_log == []
            handoff = steering_log[-1]
            # The handoff is expected to quote the resolved path of the next file.
            resolved_first_chapter = first_chapter.resolve(strict=False)
            for expected_fragment in (
                "Confirmed progress:",
                "Next step: create `01-introduction.html`.",
                f"Prefer one `write(file_path=..., content=...)` call for `{resolved_first_chapter}` now.",
            ):
                assert expected_fragment in handoff
            assert "Write a compact but real initial version of that file now" not in handoff
            assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
      
        2605
        
        2606
        
        2607
        @pytest.mark.asyncio
        async def test_tool_batch_runner_directory_handoff_uses_home_relative_path(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """Check the steering message queued after a directory-only ``mkdir`` batch.

            ``HOME`` is pointed at ``temp_dir`` and the plan lists a chapters directory
            plus ``index.html`` under ``Loader/guides/nginx``.  After a successful
            ``mkdir -p`` outcome, the last persistent steering message must name
            ``index.html`` as the next step and reference it with a ``~``-prefixed path.
            """
            monkeypatch.setenv("HOME", str(temp_dir.resolve(strict=False)))

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this hook fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this hook fails the test.
                raise AssertionError("Verification should not run for this scenario")

            # Planned artifact locations; nothing is created on disk for them here.
            guide_dir = temp_dir.joinpath("Loader", "guides", "nginx")
            chapter_dir = guide_dir / "chapters"
            index_file = guide_dir / "index.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapter_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Create the nginx directory structure",
                        "Creating the nginx directory structure",
                    ),
                    (
                        "Develop the main index.html file with proper structure",
                        "Developing the main index.html file with proper structure",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(done_spec, pending_todos)

            mkdir_call = ToolCall(
                id="mkdir-nginx-home",
                name="bash",
                arguments={"command": f"mkdir -p {chapter_dir}"},
            )
            mkdir_outcome = tool_outcome(
                tool_call=mkdir_call,
                output="",
                is_error=False,
            )
            fake_executor = FakeExecutor([mkdir_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            for expected_fragment in (
                "Next step: create `index.html`.",
                "`~/Loader/guides/nginx/index.html`",
                "Write a compact but real initial version of that file now",
            ):
                assert expected_fragment in latest
      
        2711
        
        2712
        
        2713
        @pytest.mark.asyncio
        async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check the steering message queued when the agent rereads a touched file.

            ``index.html`` exists on disk and is recorded in ``touched_files``; the plan
            also lists ``01-introduction.html``, which does not exist.  After a ``read``
            of ``index.html``, the last persistent steering message must redirect to
            creating ``01-introduction.html`` and no ephemeral message may be queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this hook fails the test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this hook fails the test.
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir.joinpath("guides", "nginx")
            chapter_dir = guide_dir / "chapters"
            chapter_dir.mkdir(parents=True)

            # The index links to two chapters, neither of which exists on disk.
            index_file = guide_dir / "index.html"
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_lines) + "\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_dir}/`",
                f"- `{chapter_dir}/`",
                f"- `{index_file}`",
                f"- `{chapter_dir / '01-introduction.html'}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            done_spec.touched_files.append(str(index_file))
            done_spec.completed_items.append("Develop the main index.html file for the nginx guide")
            done_spec.pending_items.append("Create chapter files for the nginx guide")

            reread_call = ToolCall(
                id="read-index-self-audit",
                name="read",
                arguments={"file_path": str(index_file)},
            )
            reread_outcome = tool_outcome(
                tool_call=reread_call,
                output="1\t<html>\n",
                is_error=False,
            )
            fake_executor = FakeExecutor([reread_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[reread_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            for expected_fragment in (
                "You already have the current contents of `index.html` from the successful write.",
                "Resume by creating `01-introduction.html` now.",
                "Do not spend another turn rereading the file you just wrote or on TodoWrite alone.",
            ):
                assert expected_fragment in latest
            assert ephemeral_log == []
      
        2819
        
        2820
        
        2821
        @pytest.mark.asyncio
        async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt(
            temp_dir: Path,
        ) -> None:
            """Check how the handoff is queued when history holds an empty-response prompt.

            The context is seeded with a user message starting with
            ``[EMPTY ASSISTANT RESPONSE]``.  After a successful ``write`` of
            ``index.html``, no persistent steering message may be queued; the last
            ephemeral message must point at ``01-introduction.html`` and must not
            contain the "compact but real initial version" wording.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this hook fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this hook fails the test.
                raise AssertionError("Verification should not run for this scenario")

            guide_dir = temp_dir.joinpath("guides", "nginx")
            chapter_dir = guide_dir / "chapters"
            chapter_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapter_dir}/`",
                f"- `{index_file}`",
                f"- `{chapter_dir / '01-introduction.html'}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            # Prior conversation: a single user-role prompt about an empty response.
            recovery_prompt = Message(
                role=Role.USER,
                content=(
                    "[EMPTY ASSISTANT RESPONSE]\n"
                    "Respond with that concrete mutation tool call now. Do not return an empty response."
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[recovery_prompt],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Create the main index.html file with proper structure",
                        "Creating the main index.html file with proper structure",
                    ),
                    (
                        "Create each chapter file with appropriate content",
                        "Creating each chapter file with appropriate content",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(done_spec, pending_todos)

            write_call = ToolCall(
                id="write-index-recovered",
                name="write",
                arguments={
                    "file_path": str(index_file),
                    "content": "<html></html>\n",
                },
            )
            write_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_file}",
                is_error=False,
            )
            fake_executor = FakeExecutor([write_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log == []
            assert ephemeral_log
            latest = ephemeral_log[-1]
            assert "Next step: create `01-introduction.html`." in latest
            assert "Write a compact but real initial version of that file now" not in latest
      
        2937
        
        2938
        
        2939
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
            temp_dir: Path,
        ) -> None:
            """Check the steering message queued after a ``TodoWrite`` with an aggregate item.

            The only pending todo is the aggregate "Create chapter files with content
            and structure", while the existing ``index.html`` links to chapter files
            that are not on disk.  The last queued message must name
            ``01-introduction.html`` as the next step and must not echo the aggregate
            todo as the next pending item.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this hook fails the test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this hook fails the test.
                raise AssertionError("Verification should not run in this scenario")

            def _fresh_todos() -> list[dict[str, str]]:
                # Build new list/dict objects on every call so the DoD sync and the
                # tool call never share todo objects.
                return [
                    {
                        "content": "Develop the main index.html file with proper structure",
                        "active_form": "Developing the main index.html file with proper structure",
                        "status": "completed",
                    },
                    {
                        "content": "Create chapter files with content and structure",
                        "active_form": "Creating chapter files with content and structure",
                        "status": "pending",
                    },
                ]

            guide_dir = temp_dir.joinpath("guides", "nginx")
            chapter_dir = guide_dir / "chapters"
            chapter_dir.mkdir(parents=True)

            # The index links to two chapters, neither of which exists on disk.
            index_file = guide_dir / "index.html"
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_lines) + "\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_dir}/`",
                f"- `{chapter_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            done_spec.touched_files.append(str(index_file))
            sync_todos_to_definition_of_done(done_spec, _fresh_todos())

            # The same todo list object is used for the call arguments and metadata.
            submitted_todos = _fresh_todos()
            todo_call = ToolCall(
                id="todo-aggregate",
                name="TodoWrite",
                arguments={"todos": submitted_todos},
            )
            todo_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": submitted_todos},
            )
            fake_executor = FakeExecutor([todo_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            assert "Todo tracking is updated." in latest
            assert "Next step: create `01-introduction.html`." in latest
            aggregate_echo = (
                "Continue with the next pending item: `Create chapter files with content and structure`."
            )
            assert aggregate_echo not in latest
      
        3069
        
        3070
        
        3071
        @pytest.mark.asyncio
        async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
            temp_dir: Path,
        ) -> None:
            """Check that the duplicate-read nudge targets the missing planned chapter.

            The first pending todo is a review-style item, but the plan lists
            ``06-ssl-configuration.html``, which is not on disk.  The prioritisation
            helper must return a truthy value for that artifact, and the queued nudge
            must mention the missing chapter instead of the review item.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this hook fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this hook fails the test.
                raise AssertionError("Verification should not run for this scenario")

            guide_dir = temp_dir.joinpath("guides", "nginx")
            chapter_dir = guide_dir / "chapters"
            chapter_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            # Planned but never written to disk in this test.
            missing_chapter = chapter_dir / "06-ssl-configuration.html"
            first_chapter.write_text("<h1>One</h1>\n")
            index_file.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_file}`",
                f"- `{first_chapter}`",
                f"- `{missing_chapter}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Ensure all files are properly linked and formatted consistently",
                        "Working on: Ensure all files are properly linked and formatted consistently",
                    ),
                    (
                        "Create the final chapter (06-ssl-configuration.html)",
                        "Working on: Create the final chapter (06-ssl-configuration.html)",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(done_spec, pending_todos)

            should_prioritize = tool_batches_should_prioritize_missing_artifact(
                dod=done_spec,
                next_pending=done_spec.pending_items[0],
                missing_artifact=(missing_chapter, False),
                project_root=temp_dir,
            )
            assert should_prioritize

            duplicate_read = ToolCall(
                id="dup-read",
                name="read",
                arguments={"file_path": str(index_file)},
            )
            batch_runner._queue_duplicate_observation_nudge(duplicate_read, dod=done_spec)  # type: ignore[attr-defined]

            assert steering_log
            latest = steering_log[-1]
            assert "06-ssl-configuration.html" in latest
            assert "Do not switch into review or consistency-check mode" in latest
            review_echo = (
                "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
            )
            assert review_echo not in latest
      
        3165
        
        3166
        
        3167
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """Writing the last planned file should queue a hand-off toward verification.

            Every path named in the implementation plan is already on disk when the
            write to chapter two is replayed, while one consistency todo is still
            pending. The messages captured through ``queue_steering_message_callback``
            must announce that the planned artifacts exist, mention the pending todo,
            and point at verification.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if this hook is ever invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if this hook is ever invoked.
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the guide tree with every planned file present.
            nginx_root = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_root / "chapters"
            chapter_dir.mkdir(parents=True)
            index_path = nginx_root / "index.html"
            chapter_one = chapter_dir / "01-getting-started.html"
            chapter_two = chapter_dir / "02-installation.html"
            seeded_files = (
                (index_path, "<a href=\"chapters/01-getting-started.html\">One</a>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            )
            for seeded_path, seeded_body in seeded_files:
                seeded_path.write_text(seeded_body)

            # The plan names exactly the directory and files created above.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapter_dir}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            # Capture both steering queues so the assertions can inspect them.
            steering: list[str] = []
            transient: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering.append
            runtime_context.queue_ephemeral_steering_message_callback = transient.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todos = [
                {
                    "content": "Create the guide files",
                    "active_form": "Working on: Create the guide files",
                    "status": "completed",
                },
                {
                    "content": "Ensure all files are properly linked and formatted consistently",
                    "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            # Replay a successful write of the final chapter through a canned executor.
            write_args = {
                "file_path": str(chapter_two),
                "content": "<h1>Two</h1>\n",
            }
            write_call = ToolCall(id="write-final", name="write", arguments=write_args)
            canned_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_two}",
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Each fragment must show up in at least one queued steering message.
            expected_fragments = (
                "All explicitly planned artifacts now exist on disk.",
                "Ensure all files are properly linked and formatted consistently",
                "Move to verification once no specific mismatch remains.",
            )
            for fragment in expected_fragments:
                assert any(fragment in queued for queued in steering)
      
        3287
        
        3288
        
        3289
        @pytest.mark.asyncio
        async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """After writing the index, the hand-off should name the first missing chapter.

            Only ``index.html`` exists on disk while the plan also lists two chapter
            files. The last message captured through ``queue_steering_message_callback``
            must direct the next step at ``01-getting-started.html``, and nothing may
            be queued through the ephemeral callback.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if this hook is ever invoked.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if this hook is ever invoked.
                raise AssertionError("Verification should not run in this scenario")

            # Create the directories and the index; both chapter files stay absent.
            nginx_root = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapter_dir.mkdir()
            index_path = nginx_root / "index.html"
            index_path.write_text("<html></html>\n")
            chapter_one = chapter_dir / "01-getting-started.html"
            chapter_two = chapter_dir / "02-installation.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            # Capture both steering queues so the assertions can inspect them.
            steering: list[str] = []
            transient: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering.append
            runtime_context.queue_ephemeral_steering_message_callback = transient.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todos = [
                {
                    "content": "Create the main index.html file with proper structure",
                    "active_form": "Working on: Create the main index.html file with proper structure",
                    "status": "pending",
                },
                {
                    "content": "Create each chapter file in sequence, following the established pattern",
                    "active_form": "Working on: Create each chapter file in sequence, following the established pattern",
                    "status": "pending",
                },
                {
                    "content": "Ensure all files are properly linked and formatted consistently",
                    "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            # Replay a successful write of the index through a canned executor.
            write_call = ToolCall(
                id="write-index",
                name="write",
                arguments={"file_path": str(index_path), "content": "<html></html>\n"},
            )
            canned_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            assert transient == []
            latest = steering[-1]

            # Text the final steering message must carry.
            required_fragments = (
                "Next step: create `01-getting-started.html`.",
                "Do not reread reference material or spend the next turn on bookkeeping.",
            )
            # Text the final steering message must not carry.
            forbidden_fragments = (
                "Write a compact but real initial version of that file now",
                "refresh `TodoWrite`",
            )
            for fragment in required_fragments:
                assert fragment in latest
            for fragment in forbidden_fragments:
                assert fragment not in latest
      
        3398
        
        3399
        
        3400
        @pytest.mark.asyncio
        async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
            temp_dir: Path,
        ) -> None:
            """A partially built large plan must get a next-artifact nudge, not a completion claim.

            The plan lists seven chapters, but only the first five exist on disk when
            the write to chapter five is replayed. The runner is expected to queue an
            ephemeral message naming chapter six, and no queued message on either
            steering channel may state that all planned artifacts exist.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if this hook is ever invoked.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if this hook is ever invoked.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")

            chapter_paths = [
                chapters / "01-getting-started.html",
                chapters / "02-installation.html",
                chapters / "03-first-website.html",
                chapters / "04-configuration-basics.html",
                chapters / "05-advanced-configurations.html",
                chapters / "06-performance-tuning.html",
                chapters / "07-security-best-practices.html",
            ]
            # Chapters one through five exist; six and seven are deliberately missing.
            for chapter in chapter_paths[:4]:
                chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
            chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text(
                "\n".join(
                    [
                        "# Implementation Plan",
                        "",
                        "## File Changes",
                        f"- `{guide_root}/`",
                        f"- `{chapters}/`",
                        f"- `{index_path}`",
                        *[f"- `{path}`" for path in chapter_paths],
                        "",
                    ]
                )
            )

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering queues so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": "Create the nginx guide artifacts",
                        "active_form": "Creating nginx guide artifacts",
                        "status": "pending",
                    },
                    {
                        "content": "Verify all guide files are linked and complete",
                        "active_form": "Verifying guide linkage and completeness",
                        "status": "pending",
                    },
                ],
            )
            # Replay a successful write of chapter five through a canned executor.
            tool_call = ToolCall(
                id="write-chapter-05",
                name="write",
                arguments={
                    "file_path": str(chapter_paths[4]),
                    "content": "<h1>Advanced configurations</h1>\n",
                },
            )
            executor = FakeExecutor(
                [
                    tool_outcome(
                        tool_call=tool_call,
                        output=f"Successfully wrote {chapter_paths[4]}",
                        is_error=False,
                    )
                ]
            )

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert any(
                "Next step: create `06-performance-tuning.html`." in message
                for message in ephemeral_messages
            )
            # Inspect both queues: a premature completion claim queued on the
            # persistent channel would otherwise go unnoticed by this test.
            assert not any(
                "All explicitly planned artifacts now exist on disk." in message
                for message in (*persistent_messages, *ephemeral_messages)
            )
      
        3526
        
        3527
        
        3528
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
            temp_dir: Path,
        ) -> None:
            """With most planned files written, the missing-artifact nudge should be compact.

            The index and four of five planned chapters exist and are recorded as
            touched files. After a rewrite of chapter four is replayed, the last
            message captured through ``queue_ephemeral_steering_message_callback``
            must name chapter five as the next step and must not ask for a
            ``TodoWrite`` refresh.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if this hook is ever invoked.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if this hook is ever invoked.
                raise AssertionError("Verification should not run in this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapter_dir.mkdir()
            index_path = nginx_root / "index.html"
            chapter_names = (
                "01-introduction.html",
                "02-installation.html",
                "03-configuration.html",
                "04-basic-usage.html",
                "05-advanced-features.html",
            )
            chapter_paths = [chapter_dir / name for name in chapter_names]

            # Everything except the fifth chapter already exists on disk.
            already_written = [index_path, *chapter_paths[:4]]
            for existing in already_written:
                existing.write_text("<html></html>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapter_dir}/`",
                f"- `{index_path}`",
            ]
            plan_lines.extend(f"- `{planned}`" for planned in chapter_paths)
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            # Capture both steering queues so the assertions can inspect them.
            steering: list[str] = []
            transient: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering.append
            runtime_context.queue_ephemeral_steering_message_callback = transient.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # Record prior progress on the definition of done before syncing todos.
            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.extend(str(existing) for existing in already_written)
            finished_items = [
                "Create the nginx directory structure",
                "Create the main index.html file with proper structure",
            ]
            dod.completed_items.extend(finished_items)
            todos = [
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Creating each chapter file with appropriate content",
                    "status": "pending",
                }
            ]
            sync_todos_to_definition_of_done(dod, todos)

            # Replay a successful rewrite of chapter four through a canned executor.
            write_args = {
                "file_path": str(chapter_paths[3]),
                "content": "<html>updated</html>\n",
            }
            write_call = ToolCall(id="write-chapter-04", name="write", arguments=write_args)
            canned_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_paths[3]}",
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert transient
            latest = transient[-1]
            required_fragments = (
                "Next step: create `05-advanced-features.html`.",
                "Do not reread reference material or spend the next turn on bookkeeping.",
            )
            for fragment in required_fragments:
                assert fragment in latest
            assert "refresh `TodoWrite`" not in latest
      
        3648
        
        3649
        
        3650
        @pytest.mark.asyncio
      
        3651
        async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
      
        3652
            temp_dir: Path,
      
        3653
        ) -> None:
      
        3654
            async def assess_confidence(
      
        3655
                tool_name: str,
      
        3656
                tool_args: dict,
      
        3657
                context: str,
      
        3658
            ) -> ConfidenceAssessment:
      
        3659
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        3660
        
        3661
            async def verify_action(
      
        3662
                tool_name: str,
      
        3663
                tool_args: dict,
      
        3664
                result: str,
      
        3665
                expected: str = "",
      
        3666
            ) -> ActionVerification:
      
        3667
                raise AssertionError("Verification should not run in this scenario")
      
        3668
        
        3669
            guide_root = temp_dir / "guides" / "nginx"
      
        3670
            chapters = guide_root / "chapters"
      
        3671
            guide_root.mkdir(parents=True)
      
        3672
            chapters.mkdir()
      
        3673
            index_path = guide_root / "index.html"
      
        3674
            index_path.write_text("<html></html>\n")
      
        3675
            chapter_one = chapters / "01-getting-started.html"
      
        3676
            chapter_two = chapters / "02-installation.html"
      
        3677
            chapter_one.write_text("<h1>One</h1>\n")
      
        3678
        
        3679
            implementation_plan = temp_dir / "implementation.md"
      
        3680
            implementation_plan.write_text(
      
        3681
                "\n".join(
      
        3682
                    [
      
        3683
                        "# Implementation Plan",
      
        3684
                        "",
      
        3685
                        "## File Changes",
      
        3686
                        f"- `{guide_root}/`",
      
        3687
                        f"- `{chapters}/`",
      
        3688
                        f"- `{index_path}`",
      
        3689
                        f"- `{chapter_one}`",
      
        3690
                        f"- `{chapter_two}`",
      
        3691
                        "",
      
        3692
                    ]
      
        3693
                )
      
        3694
            )
      
        3695
        
        3696
            context = build_context(
      
        3697
                temp_dir=temp_dir,
      
        3698
                messages=[],
      
        3699
                safeguards=FakeSafeguards(),
      
        3700
                assess_confidence=assess_confidence,
      
        3701
                verify_action=verify_action,
      
        3702
                auto_recover=False,
      
        3703
            )
      
        3704
            persistent_messages: list[str] = []
      
        3705
            ephemeral_messages: list[str] = []
      
        3706
            context.queue_steering_message_callback = persistent_messages.append
      
        3707
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        3708
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        3709
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        3710
            dod.implementation_plan = str(implementation_plan)
      
        3711
            sync_todos_to_definition_of_done(
      
        3712
                dod,
      
        3713
                [
      
        3714
                    {
      
        3715
                        "content": "Create 01-getting-started.html",
      
        3716
                        "active_form": "Creating 01-getting-started.html",
      
        3717
                        "status": "completed",
      
        3718
                    },
      
        3719
                    {
      
        3720
                        "content": "Create 02-installation.html",
      
        3721
                        "active_form": "Creating 02-installation.html",
      
        3722
                        "status": "pending",
      
        3723
                    },
      
        3724
                ],
      
        3725
            )
      
        3726
            dod.touched_files.extend([str(index_path), str(chapter_one)])
      
        3727
        
        3728
            tool_call = ToolCall(
      
        3729
                id="todo-only",
      
        3730
                name="TodoWrite",
      
        3731
                arguments={
      
        3732
                    "todos": [
      
        3733
                        {
      
        3734
                            "content": "Create 01-getting-started.html",
      
        3735
                            "active_form": "Creating 01-getting-started.html",
      
        3736
                            "status": "completed",
      
        3737
                        },
      
        3738
                        {
      
        3739
                            "content": "Create 02-installation.html",
      
        3740
                            "active_form": "Creating 02-installation.html",
      
        3741
                            "status": "pending",
      
        3742
                        },
      
        3743
                    ]
      
        3744
                },
      
        3745
            )
      
        3746
            executor = FakeExecutor(
      
        3747
                [
      
        3748
                    tool_outcome(
      
        3749
                        tool_call=tool_call,
      
        3750
                        output="Todos updated",
      
        3751
                        is_error=False,
      
        3752
                        metadata={
      
        3753
                            "new_todos": [
      
        3754
                                {
      
        3755
                                    "content": "Create 01-getting-started.html",
      
        3756
                                    "active_form": "Creating 01-getting-started.html",
      
        3757
                                    "status": "completed",
      
        3758
                                },
      
        3759
                                {
      
        3760
                                    "content": "Create 02-installation.html",
      
        3761
                                    "active_form": "Creating 02-installation.html",
      
        3762
                                    "status": "pending",
      
        3763
                                },
      
        3764
                            ]
      
        3765
                        },
      
        3766
                    )
      
        3767
                ]
      
        3768
            )
      
        3769
        
        3770
            summary = TurnSummary(final_response="")
      
        3771
            await runner.execute_batch(
      
        3772
                tool_calls=[tool_call],
      
        3773
                tool_source="assistant",
      
        3774
                pending_tool_calls_seen=set(),
      
        3775
                emit=_noop_emit,
      
        3776
                summary=summary,
      
        3777
                dod=dod,
      
        3778
                executor=executor,  # type: ignore[arg-type]
      
        3779
                on_confirmation=None,
      
        3780
                on_user_question=None,
      
        3781
                emit_confirmation=None,
      
        3782
                consecutive_errors=0,
      
        3783
            )
      
        3784
        
        3785
            assert persistent_messages
      
        3786
            message = persistent_messages[-1]
      
        3787
            assert "Todo tracking is updated. Next step: create `02-installation.html`." in message
      
        3788
            assert "Prefer one `write(file_path=..., content=...)` call" in message
      
        3789
            assert "Make your next response the concrete mutation tool call itself." in message
      
        3790
            assert ephemeral_messages == []
      
        3791
        
        3792
        
        3793
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Check the steering message queued after a todo-only batch when all planned files exist.

            The index and both chapter files named in the implementation plan are written
            to disk before the batch runs. A single ``TodoWrite`` call is then executed and
            the last queued steering message is inspected: it must announce that the planned
            artifacts exist, mention the verification todo, and leave out the
            "Fortran guide structure" research todo.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook at all counts as a test failure.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook at all counts as a test failure.
                raise AssertionError("Verification should not run in this scenario")

            def fresh_todos() -> list[dict[str, str]]:
                # Build new dicts on every call so the DoD sync, the tool call and the
                # executor metadata never share mutable state with one another.
                return [
                    {
                        "content": content,
                        "active_form": f"Working on: {content}",
                        "status": "pending",
                    }
                    for content in (
                        "First, examine the existing Fortran guide structure to understand the format and content organization",
                        "Verify all guide files are linked and complete",
                    )
                ]

            # Lay out the guide tree with every planned artifact already present.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, html in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            # The plan lists both directories (with trailing slash) and the three files.
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
                f"- `{chapter_one}`\n"
                f"- `{chapter_two}`\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            done_definition.implementation_plan = str(plan_path)
            done_definition.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(
                done_definition,
                fresh_todos(),
                project_root=temp_dir,
            )

            todo_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": fresh_todos()},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": fresh_todos()},
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in latest
            assert "Verify all guide files are linked and complete" in latest
            assert "Move to verification once no specific mismatch remains." in latest
            assert "reopen reference materials" in latest
            assert "Fortran guide structure" not in latest
      
        3935
        
        3936
        
        3937
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
            temp_dir: Path,
        ) -> None:
            """Check the steering message when an in-progress "create" todo arrives after the build.

            The index page (with three anchor links) and both chapter files from the
            implementation plan already exist. A ``TodoWrite`` call carrying one
            ``in_progress`` creation todo is executed, and the last queued steering message
            must contain the "artifacts exist", "repair or verify" and "move to
            verification" fragments.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any invocation of this hook should fail the test.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any invocation of this hook should fail the test.
                raise AssertionError("Verification should not run for this scenario")

            def chapter_todo() -> dict[str, str]:
                # Return a fresh dict each time; three separate consumers receive this payload.
                return {
                    "content": "Create chapter files following the established pattern",
                    "active_form": "Creating chapter files",
                    "status": "in_progress",
                }

            # Guide tree: index page plus two chapters, all present before the batch runs.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"

            index_markup = (
                '<a href="chapters/01-introduction.html">Intro</a>\n'
                '<a href="chapters/02-installation.html">Install</a>\n'
                '<a href="../index.html">Back</a>\n'
            )
            index_path.write_text(index_markup)
            for chapter_file in (chapter_one, chapter_two):
                chapter_file.write_text("<html></html>\n")

            # Implementation plan listing the directories and files above.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(
                    f"- `{entry}`"
                    for entry in (
                        f"{guide_root}/",
                        f"{chapters}/",
                        index_path,
                        chapter_one,
                        chapter_two,
                    )
                ),
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            captured: list[str] = []
            runtime_context.queue_steering_message_callback = captured.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            done_definition.implementation_plan = str(plan_path)
            done_definition.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(
                done_definition,
                [chapter_todo()],
                project_root=temp_dir,
            )

            todo_call = ToolCall(
                id="todo-post-build",
                name="TodoWrite",
                arguments={"todos": [chapter_todo()]},
            )
            fake_executor = FakeExecutor(
                [
                    tool_outcome(
                        tool_call=todo_call,
                        output="Todos updated",
                        is_error=False,
                        metadata={"new_todos": [chapter_todo()]},
                    )
                ]
            )

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert captured
            latest = captured[-1]
            required_fragments = (
                "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
                "Repair or verify the current files instead of expanding the artifact set.",
                "Move to verification or final confirmation using the files already on disk.",
            )
            for fragment in required_fragments:
                assert fragment in latest
      
        4071
        
        4072
        
        4073
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
            temp_dir: Path,
        ) -> None:
            """Check that a todo for an unplanned chapter is absent from the steering message.

            Every file in the implementation plan exists on disk. The ``TodoWrite`` call
            adds a pending todo for ``08-troubleshooting.html``, which the plan does not
            list. The last queued steering message must contain the "artifacts exist",
            "repair or verify" and "move to verification" fragments and must not mention
            the unplanned chapter.
            """

            async def reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any invocation of this hook should fail the test.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any invocation of this hook should fail the test.
                raise AssertionError("Verification should not run for this scenario")

            def todo_entries(form_key: str) -> list[dict[str, str]]:
                # The tool-call arguments use the camelCase "activeForm" key while the
                # executor metadata uses "active_form"; everything else is identical.
                rows = (
                    ("Create index.html for nginx guide", "Creating index.html", "in_progress"),
                    ("Create chapter 01-introduction.html", "Creating chapter 01-introduction.html", "completed"),
                    ("Create chapter 02-installation.html", "Creating chapter 02-installation.html", "completed"),
                    ("Create chapter 08-troubleshooting.html", "Creating chapter 08-troubleshooting.html", "pending"),
                )
                return [
                    {"content": content, form_key: active_text, "status": status}
                    for content, active_text, status in rows
                ]

            # Guide tree with every planned artifact already written.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            index_path.write_text(
                '<a href="chapters/01-introduction.html">Intro</a>\n'
                '<a href="chapters/02-installation.html">Install</a>\n'
                '<a href="../index.html">Back</a>\n'
            )
            chapter_one.write_text("<html></html>\n")
            chapter_two.write_text("<html></html>\n")

            # The plan names only the two directories and the three files created above.
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
                f"- `{chapter_one}`\n"
                f"- `{chapter_two}`\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence,
                verify_action=reject_verification,
                auto_recover=False,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_definition = create_definition_of_done("Create a multi-file nginx guide.")
            done_definition.implementation_plan = str(plan_path)
            done_definition.verification_commands = [f"ls -la {guide_root}"]

            expansion_call = ToolCall(
                id="todo-post-build-expansion",
                name="TodoWrite",
                arguments={"todos": todo_entries("activeForm")},
            )
            scripted_outcome = tool_outcome(
                tool_call=expansion_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todo_entries("active_form")},
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[expansion_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            for fragment in (
                "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
                "Repair or verify the current files instead of expanding the artifact set.",
                "Move to verification or final confirmation using the files already on disk.",
            ):
                assert fragment in latest
            assert "08-troubleshooting.html" not in latest
      
        4227
        
        4228
        
        4229
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite batch with output roots on disk steers to the next file.

            Setup: the guide directory, its ``chapters`` directory and ``index.html``
            already exist, and ``index.html`` links to ``chapters/01-introduction.html``.
            The only pending todo is the introduction chapter.

            Expectation: the last queued steering message names
            ``01-introduction.html`` as the next step and asks for a concrete
            ``write`` call.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Wired in as the confidence hook; any call fails the test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Wired in as the verification hook; any call fails the test.
                raise AssertionError("Verification should not run in this scenario")

            def fresh_todos() -> list[dict[str, str]]:
                # Returns a new list of new dicts on every call so each consumer
                # (DoD sync, tool-call arguments, outcome metadata) gets its own copy.
                return [
                    {
                        "content": "Examine the existing Fortran guide structure",
                        "active_form": "Examining the existing Fortran guide structure",
                        "status": "completed",
                    },
                    {
                        "content": "Create the nginx directory structure",
                        "active_form": "Creating the nginx directory structure",
                        "status": "completed",
                    },
                    {
                        "content": "Write the introduction chapter",
                        "active_form": "Writing the introduction chapter",
                        "status": "pending",
                    },
                ]

            # Output roots and the index page already exist on disk.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_lines = (
                "<!DOCTYPE html>",
                "<html>",
                "<body>",
                '<a href="chapters/01-introduction.html">Introduction</a>',
                "</body>",
                "</html>",
            )
            index_path.write_text("".join(f"{line}\n" for line in index_lines))

            # The plan declares the two directories and the index file.
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
            )
            implementation_plan.write_text("".join(f"{line}\n" for line in plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.append(str(index_path))
            sync_todos_to_definition_of_done(
                dod,
                fresh_todos(),
                project_root=temp_dir,
            )

            tool_call = ToolCall(
                id="todo-next-mutation",
                name="TodoWrite",
                arguments={"todos": fresh_todos()},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": fresh_todos()},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            steering = queued_messages[-1]
            for fragment in (
                "Todo tracking is updated. Next step: create `01-introduction.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
                "Make your next response the concrete mutation tool call itself.",
            ):
                assert fragment in steering
      
        4390
        
        4391
        
        4392
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
            temp_dir: Path,
        ) -> None:
            """A pending ``index.html`` wins over an empty declared output directory.

            Setup: only the ``chapters`` directory exists (empty) and ``index.html``
            has not been written. The plan lists both, and the todos still have the
            index and the first chapter pending.

            Expectation: the last queued steering message names ``index.html`` with
            its resolved path and does not mention ``01-introduction.html``.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Wired in as the confidence hook; any call fails the test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Wired in as the verification hook; any call fails the test.
                raise AssertionError("Verification should not run in this scenario")

            def planned_todos() -> list[dict[str, str]]:
                # Returns a new list of new dicts on every call.
                return [
                    {
                        "content": "Examine the existing Fortran guide structure to understand the format and depth",
                        "active_form": "Examining the existing Fortran guide structure",
                        "status": "completed",
                    },
                    {
                        "content": "Create the new nginx guide directory structure",
                        "active_form": "Creating the new nginx guide directory structure",
                        "status": "completed",
                    },
                    {
                        "content": "Create a new index.html for the nginx guide",
                        "active_form": "Creating a new index.html for the nginx guide",
                        "status": "pending",
                    },
                    {
                        "content": "Create the first chapter for the nginx guide",
                        "active_form": "Creating the first chapter for the nginx guide",
                        "status": "pending",
                    },
                ]

            # Only the (empty) chapters directory is created; index.html is not written.
            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"

            implementation_plan = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
            )
            implementation_plan.write_text("".join(f"{line}\n" for line in plan_lines))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            sync_todos_to_definition_of_done(
                dod,
                planned_todos(),
                project_root=temp_dir,
            )

            queued_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # One list object is shared by the tool-call arguments and the outcome metadata.
            todos = planned_todos()
            tool_call = ToolCall(
                id="todo-index-before-chapter",
                name="TodoWrite",
                arguments={"todos": todos},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todos},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            steering = queued_messages[-1]
            resolved_index = index_path.resolve(strict=False)
            assert "Todo tracking is updated. Next step: create `index.html`." in steering
            assert f"Prefer one `write(file_path=..., content=...)` call for `{resolved_index}`" in steering
            assert "01-introduction.html" not in steering
      
        4528
        
        4529
        
        4530
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite batch names the next missing file linked from the index.

            Setup: ``index.html`` exists and links to ``chapters/introduction.html``
            and ``chapters/installation.html``; neither chapter file is written. The
            definition of done has "Write the introduction chapter" pending.

            Expectation: the last queued steering message names
            ``introduction.html`` as the next step and asks for a concrete
            ``write`` call.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Wired in as the confidence hook; any call fails the test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Wired in as the verification hook; any call fails the test.
                raise AssertionError("Verification should not run in this scenario")

            # Guide root, empty chapters directory, and an index linking two chapters.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_lines = (
                "<html>",
                '<a href="chapters/introduction.html">Introduction</a>',
                '<a href="chapters/installation.html">Installation</a>',
                "</html>",
            )
            index_path.write_text("".join(f"{line}\n" for line in index_lines))

            implementation_plan = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
            )
            implementation_plan.write_text("".join(f"{line}\n" for line in plan_lines))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items = [
                "Write the introduction chapter",
                "Complete the requested work",
            ]
            dod.touched_files.append(str(index_path))

            queued_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The tool-call arguments use the "activeForm" key while the outcome
            # metadata uses "active_form"; the two spellings are kept distinct.
            requested_todo = {
                "content": "Write the introduction chapter",
                "activeForm": "Writing the introduction chapter",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            }
            tool_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            steering = queued_messages[-1]
            for fragment in (
                "Todo tracking is updated. Next step: create `introduction.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
                "Make your next response the concrete mutation tool call itself.",
            ):
                assert fragment in steering
      
        4653
        
        4654
        
        4655
        @pytest.mark.asyncio
      
        4656
        async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
      
        4657
            temp_dir: Path,
      
        4658
        ) -> None:
      
        4659
            async def assess_confidence(
      
        4660
                tool_name: str,
      
        4661
                tool_args: dict,
      
        4662
                context: str,
      
        4663
            ) -> ConfidenceAssessment:
      
        4664
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        4665
        
        4666
            async def verify_action(
      
        4667
                tool_name: str,
      
        4668
                tool_args: dict,
      
        4669
                result: str,
      
        4670
                expected: str = "",
      
        4671
            ) -> ActionVerification:
      
        4672
                raise AssertionError("Verification should not run in this scenario")
      
        4673
        
        4674
            guide_root = temp_dir / "guides" / "nginx"
      
        4675
            chapters = guide_root / "chapters"
      
        4676
            guide_root.mkdir(parents=True)
      
        4677
            chapters.mkdir()
      
        4678
            index_path = guide_root / "index.html"
      
        4679
            chapter_one = chapters / "01-introduction.html"
      
        4680
            index_path.write_text(
      
        4681
                "\n".join(
      
        4682
                    [
      
        4683
                        "<html>",
      
        4684
                        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
      
        4685
                        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
      
        4686
                        "</html>",
      
        4687
                    ]
      
        4688
                )
      
        4689
                + "\n"
      
        4690
            )
      
        4691
            chapter_one.write_text("<html></html>\n")
      
        4692
        
        4693
            implementation_plan = temp_dir / "implementation.md"
      
        4694
            implementation_plan.write_text(
      
        4695
                "\n".join(
      
        4696
                    [
      
        4697
                        "# Implementation Plan",
      
        4698
                        "",
      
        4699
                        "## File Changes",
      
        4700
                        f"- `{guide_root}/`",
      
        4701
                        f"- `{chapters}/`",
      
        4702
                        f"- `{index_path}`",
      
        4703
                        "",
      
        4704
                    ]
      
        4705
                )
      
        4706
            )
      
        4707
        
        4708
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        4709
            dod.implementation_plan = str(implementation_plan)
      
        4710
            dod.pending_items = [
      
        4711
                "Creating Chapter 2: Installation and Setup",
      
        4712
                "Complete the requested work",
      
        4713
            ]
      
        4714
            dod.touched_files.extend([str(index_path), str(chapter_one)])
      
        4715
        
        4716
            queued_messages: list[str] = []
      
        4717
            context = build_context(
      
        4718
                temp_dir=temp_dir,
      
        4719
                messages=[],
      
        4720
                safeguards=FakeSafeguards(),
      
        4721
                assess_confidence=assess_confidence,
      
        4722
                verify_action=verify_action,
      
        4723
                auto_recover=False,
      
        4724
            )
      
        4725
            context.queue_steering_message_callback = queued_messages.append
      
        4726
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        4727
        
        4728
            tool_call = ToolCall(
      
        4729
                id="todo-1",
      
        4730
                name="TodoWrite",
      
        4731
                arguments={
      
        4732
                    "todos": [
      
        4733
                        {
      
        4734
                            "content": "Creating Chapter 2: Installation and Setup",
      
        4735
                            "activeForm": "Creating Chapter 2: Installation and Setup",
      
        4736
                            "status": "pending",
      
        4737
                        }
      
        4738
                    ]
      
        4739
                },
      
        4740
            )
      
        4741
            executor = FakeExecutor(
      
        4742
                [
      
        4743
                    tool_outcome(
      
        4744
                        tool_call=tool_call,
      
        4745
                        output="Todos updated",
      
        4746
                        is_error=False,
      
        4747
                        metadata={
      
        4748
                            "new_todos": [
      
        4749
                                {
      
        4750
                                    "content": "Creating Chapter 2: Installation and Setup",
      
        4751
                                    "active_form": "Creating Chapter 2: Installation and Setup",
      
        4752
                                    "status": "pending",
      
        4753
                                }
      
        4754
                            ]
      
        4755
                        },
      
        4756
                    )
      
        4757
                ]
      
        4758
            )
      
        4759
        
        4760
            summary = TurnSummary(final_response="")
      
        4761
            await runner.execute_batch(
      
        4762
                tool_calls=[tool_call],
      
        4763
                tool_source="assistant",
      
        4764
                pending_tool_calls_seen=set(),
      
        4765
                emit=_noop_emit,
      
        4766
                summary=summary,
      
        4767
                dod=dod,
      
        4768
                executor=executor,  # type: ignore[arg-type]
      
        4769
                on_confirmation=None,
      
        4770
                on_user_question=None,
      
        4771
                emit_confirmation=None,
      
        4772
                consecutive_errors=0,
      
        4773
            )
      
        4774
        
        4775
            assert queued_messages
      
        4776
            message = queued_messages[-1]
      
        4777
            assert "Todo tracking is updated. Next step: create `02-installation.html`." in message
      
        4778
            assert "Prefer one `write(file_path=..., content=...)` call" in message
      
        4779
            assert "Make your next response the concrete mutation tool call itself" in message
      
        4780
        
        4781
        
        4782
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite update should steer toward creating `01-introduction.html`.

            Setup: the nginx guide only has `index.html` on disk, the plan declares
            the guide root, the chapters directory and the index, and the message
            history contains a `read` of a sibling guide's `01-introduction.html`.
            The last queued steering message must name that file as the next step
            and recommend a single `write(...)` call.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this stub means the runner scored confidence unexpectedly.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this stub means the runner verified an action unexpectedly.
                raise AssertionError("Verification should not run in this scenario")

            # Sibling guide that the assistant has already read from.
            sibling_chapters = temp_dir / "fortran" / "chapters"
            sibling_chapters.mkdir(parents=True)
            sibling_intro = sibling_chapters / "01-introduction.html"
            sibling_intro.write_text("<h1>Introduction</h1>\n")

            # Target guide: directories and index exist, no chapter files yet.
            nginx_root = temp_dir / "guides" / "nginx"
            nginx_chapters = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            nginx_chapters.mkdir()
            nginx_index = nginx_root / "index.html"
            nginx_index.write_text("<html></html>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{nginx_chapters}/`",
                f"- `{nginx_index}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            todo_content = "Write the introduction chapter"
            todo_active_form = "Writing the introduction chapter"

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_path)
            definition.pending_items = [
                todo_content,
                "Complete the requested work",
            ]
            definition.touched_files.append(str(nginx_index))

            prior_read = ToolCall(
                id="read-ref-1",
                name="read",
                arguments={"file_path": str(sibling_intro)},
            )
            history = [
                Message(
                    role=Role.ASSISTANT,
                    content="",
                    tool_calls=[prior_read],
                )
            ]

            steering: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            todo_call = ToolCall(
                id="todo-observed-1",
                name="TodoWrite",
                arguments={
                    "todos": [
                        {
                            "content": todo_content,
                            "activeForm": todo_active_form,
                            "status": "pending",
                        }
                    ]
                },
            )
            todo_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": todo_content,
                            "active_form": todo_active_form,
                            "status": "pending",
                        }
                    ]
                },
            )
            fake_executor = FakeExecutor([todo_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            latest = steering[-1]
            expected_fragments = (
                "Todo tracking is updated. Next step: create `01-introduction.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        4910
        
        4911
        
        4912
        @pytest.mark.asyncio
        async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
            temp_dir: Path,
        ) -> None:
            """A working note must not distract from a declared artifact that is missing.

            Setup: the plan declares `01-getting-started.html` and
            `02-installation.html`, but only the first exists on disk and the synced
            todos still list the second as pending. After a `notepad_write_working`
            call, the last queued steering message must tell the model to resume by
            creating `02-installation.html` with a concrete mutation tool call.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this stub means the runner scored confidence unexpectedly.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this stub means the runner verified an action unexpectedly.
                raise AssertionError("Verification should not run in this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            nginx_chapters = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            nginx_chapters.mkdir()
            nginx_index = nginx_root / "index.html"
            existing_chapter = nginx_chapters / "01-getting-started.html"
            # Declared in the plan below but deliberately never written to disk.
            missing_chapter = nginx_chapters / "02-installation.html"
            nginx_index.write_text("<html></html>\n")
            existing_chapter.write_text("<h1>One</h1>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{nginx_chapters}/`",
                f"- `{nginx_index}`",
                f"- `{existing_chapter}`",
                f"- `{missing_chapter}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            steering: list[str] = []
            runtime_context.queue_steering_message_callback = steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_path)
            todo_state = [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "completed",
                },
                {
                    "content": "Create 02-installation.html",
                    "active_form": "Creating 02-installation.html",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(
                definition,
                todo_state,
                project_root=temp_dir,
            )
            definition.touched_files.extend([str(nginx_index), str(existing_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Creating the second chapter file: Installation"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            fake_executor = FakeExecutor([note_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            latest = steering[-1]
            expected_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
                "Make your next response the concrete mutation tool call itself",
                "refresh `TodoWrite`",
                "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        5026
        
        5027
        
        5028
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
            temp_dir: Path,
        ) -> None:
            """A working note must not jump ahead of a discovery-first pending item.

            Setup: the plan declares the nginx index and chapters paths, none of
            which exist on disk, and the first pending item is an "examine the
            existing fortran guide" step. After a `notepad_write_working` call, the
            last queued steering message must continue with that pending item and
            ask for an evidence-gathering tool call, not for creating `index.html`.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this stub means the runner scored confidence unexpectedly.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this stub means the runner verified an action unexpectedly.
                raise AssertionError("Verification should not run in this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{nginx_root / 'chapters'}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            steering: list[str] = []
            runtime_context.queue_steering_message_callback = steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_path)
            outstanding_steps = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create the nginx directory structure",
                "Develop the main index.html file for the nginx guide",
            ]
            definition.pending_items.extend(outstanding_steps)

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            fake_executor = FakeExecutor([note_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            latest = steering[-1]
            continue_with_discovery = "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
            assert continue_with_discovery in latest
            assert "one concrete evidence-gathering tool call" in latest
            # The artifact-resume wording must not appear while discovery is pending.
            assert "Resume by creating `index.html` now." not in latest
      
        5120
        
        5121
        
        5122
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
            temp_dir: Path,
        ) -> None:
            """A missing output should win over a stale discovery item after a working note.

            Setup: `index.html` links to three chapter files but only
            `01-introduction.html` exists; both existing files are recorded as
            touched, and the pending list still starts with an "examine the fortran
            guide" item. After a `notepad_write_working` call, the last queued
            steering message must say to resume by creating `02-installation.html`
            and must not fall back to the discovery item.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this stub means the runner scored confidence unexpectedly.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this stub means the runner verified an action unexpectedly.
                raise AssertionError("Verification should not run in this scenario")

            nginx_root = temp_dir / "guides" / "nginx"
            nginx_chapters = nginx_root / "chapters"
            nginx_chapters.mkdir(parents=True)
            nginx_index = nginx_root / "index.html"
            intro_chapter = nginx_chapters / "01-introduction.html"

            # The index references chapters 02 and 03, which are never written.
            index_links = [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-configuration.html">Configuration</a>',
            ]
            nginx_index.write_text("\n".join(index_links))
            intro_chapter.write_text("<h1>Introduction</h1>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{nginx_chapters}/`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            steering: list[str] = []
            runtime_context.queue_steering_message_callback = steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_path)
            outstanding_steps = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create chapter files following the established pattern",
            ]
            definition.pending_items.extend(outstanding_steps)
            definition.touched_files.extend([str(nginx_index), str(intro_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Created index and first chapter; next is chapter 2"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            fake_executor = FakeExecutor([note_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering
            latest = steering[-1]
            expected_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
            )
            for fragment in expected_fragments:
                assert fragment in latest
            # The stale discovery item must not be offered as the next step.
            assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in latest
      
        5227
        
        5228
        
        5229
        @pytest.mark.asyncio
        async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
            temp_dir: Path,
        ) -> None:
            """A directory-only glob result must not queue a steering message.

            The faked glob output lists only the fortran guide root and its
            ``chapters`` directory. After the batch runs, the steering queue is
            expected to be empty.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # Reference material that the glob call inspects.
            reference_root = temp_dir / "Loader" / "guides" / "fortran"
            reference_chapters = reference_root / "chapters"
            reference_chapters.mkdir(parents=True)

            # The plan names nginx artifacts; this test never creates them.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{nginx_root / 'index.html'}`\n"
                f"- `{nginx_root / 'chapters'}`\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering: list[str] = []
            runtime_context.queue_steering_message_callback = steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file nginx guide.")
            definition.implementation_plan = str(plan_path)
            outstanding = [
                "First, examine the existing fortran guide structure and content",
                "Create the nginx directory structure",
                "Develop the main index.html file for nginx guide",
            ]
            definition.pending_items.extend(outstanding)

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"pattern": "**", "path": str(reference_root)},
            )
            glob_outcome = tool_outcome(
                tool_call=glob_call,
                output="\n".join([str(reference_root), str(reference_chapters)]),
                is_error=False,
            )
            executor = FakeExecutor([glob_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not steering
      
        5318
        
        5319
        
        5320
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
            temp_dir: Path,
        ) -> None:
            """A blocked identical-string edit on an already-correct TOC queues nothing.

            The index file on disk already links to both chapter files with their
            headings as link text. The executor reports the edit as BLOCKED, and
            the test expects the steering queue to stay empty.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            task_prompt = (
                "Have a look at ~/Loader/guides/fortran/index.html, then "
                "~/Loader/guides/fortran/chapters. The table of contents links in "
                "index.html are inaccurate and the href’s are wrong. Let’s update the "
                "links and their link texts to be correct."
            )

            # Two chapter files whose headings match the TOC link texts below.
            chapter_dir = temp_dir / "chapters"
            chapter_dir.mkdir()
            intro_page = chapter_dir / "01-introduction.html"
            intro_page.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
            setup_page = chapter_dir / "02-setup.html"
            setup_page.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")

            toc_block = (
                "<h2>Table of Contents</h2>\n"
                '        <ul class="chapter-list">\n'
                '            <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '            <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                "        </ul>\n"
            )
            index_file = temp_dir / "index.html"
            index_file.write_text(toc_block)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = task_prompt  # type: ignore[attr-defined]
            steering: list[str] = []
            runtime_context.queue_steering_message_callback = steering.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The edit replaces the block with itself, so nothing would change.
            edit_arguments = {
                "file_path": str(index_file),
                "old_string": toc_block,
                "new_string": toc_block,
            }
            edit_call = ToolCall(id="edit-1", name="edit", arguments=edit_arguments)
            blocked_outcome = tool_outcome(
                tool_call=edit_call,
                output=(
                    "[Blocked - old_string and new_string are identical - no change "
                    "would occur] Suggestion: Provide different old and new strings"
                ),
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
            executor = FakeExecutor([blocked_outcome])

            turn_summary = TurnSummary(final_response="")
            definition = create_definition_of_done(task_prompt)
            await batch_runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=definition,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not steering
      
        5413
        
        5414
        
        5415
        def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
            temp_dir: Path,
        ) -> None:
            """The blocked-edit nudge must keep pointing at the active repair target.

            The conversation already carries a "Repair focus" assistant message
            naming the repair target. The nudge queued for a blocked identical
            edit is expected to mention that path and the phrases asserted below.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
            missing_chapter = repair_target.parent / "05-advanced-topics.html"
            repair_focus = (
                "Repair focus:\n"
                f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
                f"- Immediate next step: edit `{repair_target}`.\n"
                f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued: list[str] = []
            runtime_context.queue_steering_message_callback = queued.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            definition = create_definition_of_done("Repair a guide page.")

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(repair_target),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_message = (
                "[Blocked - old_string and new_string are identical - no change "
                "would occur] Suggestion: Provide different old and new strings"
            )
            batch_runner._queue_blocked_html_edit_nudge(noop_edit, blocked_message, dod=definition)

            assert queued
            nudge = queued[0]
            assert str(repair_target) in nudge
            assert "no on-disk change" in nudge
            assert "replace the surrounding block" in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        5475
        
        5476
        
        5477
        def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """Once every planned file exists, the nudge should point at verification.

            Both files listed in the implementation plan are written to disk and
            recorded as touched. The nudge for a blocked identical edit is then
            expected to say the artifacts exist and must not suggest replacing
            the surrounding block.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # Build the complete guide on disk before the nudge is requested.
            guide_root = temp_dir / "guide"
            chapter_dir = guide_root / "chapters"
            chapter_dir.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapter_dir / "01-introduction.html"
            for planned_file in (index_path, chapter_one):
                planned_file.write_text("<html></html>\n")

            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{index_path}`\n"
                f"- `{chapter_one}`\n"
            )

            repair_focus = (
                "Repair focus:\n"
                f"- Confirm the final guide state in `{index_path}`.\n"
                f"- Immediate next step: verify `{index_path}` if no concrete mismatch remains.\n"
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued: list[str] = []
            runtime_context.queue_steering_message_callback = queued.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a multi-file guide.")
            definition.implementation_plan = str(plan_path)
            definition.touched_files.extend([str(index_path), str(chapter_one)])
            definition.verification_commands = [f"ls -la {guide_root}"]

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_path),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_message = (
                "[Blocked - old_string and new_string are identical - no change "
                "would occur] Suggestion: Provide different old and new strings"
            )
            batch_runner._queue_blocked_html_edit_nudge(noop_edit, blocked_message, dod=definition)

            assert queued
            nudge = queued[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "Move to verification or final confirmation using the files already on disk." in nudge
            assert "replace the surrounding block" not in nudge
      
        5560
        
        5561
        
        5562
        async def _noop_emit(event: AgentEvent) -> None:
            """Accept an event and discard it; for tests that ignore emitted events."""
      
        5564
        
        5565
        
        5566
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful write should leave the definition of done in the "planned" state.

            After the batch, the test checks the verification fields on the
            definition of done and the last workflow timeline entry, including
            the first verification attempt's id and number.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
            )
            write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
            executor = FakeExecutor([write_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Update README and verify it still works.")
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await batch_runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done.
            assert dod.last_verification_result == "planned"
            assert dod.verification_commands
            assert "Collect verification evidence" in dod.pending_items
            assert dod.active_verification_attempt_id == "verification-attempt-1"
            assert dod.active_verification_attempt_number == 1

            # State recorded on the most recent timeline entry.
            latest_entry = summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_planned"
            assert latest_entry.policy_outcome == "planned"
            first_observation = latest_entry.verification_observations[0]
            assert first_observation.status == "planned"
            assert first_observation.attempt_id == "verification-attempt-1"
            assert first_observation.attempt_number == 1
      
        5637
        
        5638
        
        5639
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
            temp_dir: Path,
        ) -> None:
            """A lone ``mkdir -p`` for a planned directory must not plan verification.

            The test expects the verification result to stay unset, no
            "Collect verification evidence" pending item, and no
            ``verification_planned`` entry in the workflow timeline.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapter_dir = nginx_root / "chapters"
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{chapter_dir}/`\n"
                f"- `{nginx_root / 'index.html'}`\n"
            )

            mkdir_call = ToolCall(
                id="mkdir-1",
                name="bash",
                arguments={"command": f"mkdir -p {chapter_dir}"},
            )
            mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
            executor = FakeExecutor([mkdir_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
            dod.implementation_plan = str(plan_path)
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await batch_runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert dod.last_verification_result is None
            assert "Collect verification evidence" not in dod.pending_items
            planned_entries = [
                entry
                for entry in summary.workflow_timeline
                if entry.reason_code == "verification_planned"
            ]
            assert not planned_entries
      
        5717
        
        5718
        
        5719
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
            temp_dir: Path,
        ) -> None:
            """Writing the index while a chapter item is still pending must not plan verification.

            The test expects the verification result to stay unset, the chapter
            pending item to remain, and no ``verification_planned`` entry in the
            workflow timeline.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The directories already exist; only the index file is written in this batch.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapter_dir = nginx_root / "chapters"
            chapter_dir.mkdir(parents=True)
            index_path = nginx_root / "index.html"
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{nginx_root}/`\n"
                f"- `{chapter_dir}/`\n"
                f"- `{index_path}`\n"
            )

            write_call = ToolCall(
                id="write-index",
                name="write",
                arguments={"file_path": str(index_path), "content": "<html></html>\n"},
            )
            write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
            executor = FakeExecutor([write_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            outstanding = [
                "Develop the main index.html file with proper structure",
                "Create first nginx chapter",
            ]
            dod.pending_items.extend(outstanding)
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await batch_runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert dod.last_verification_result is None
            assert "Collect verification evidence" not in dod.pending_items
            assert "Create first nginx chapter" in dod.pending_items
            planned_entries = [
                entry
                for entry in summary.workflow_timeline
                if entry.reason_code == "verification_planned"
            ]
            assert not planned_entries
      
        5807
        
        5808
        
        5809
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``write`` after a passed verification marks that result stale.

            The definition of done is seeded with a passed verification attempt and one
            piece of evidence.  After the batch executes a single ``write`` call the
            test expects the result to read ``"stale"``, the evidence list to be empty,
            the evidence item to be pending again, a second attempt to be active, and
            the last workflow-timeline entry to describe the superseded attempt.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: the test fails if the runner requests confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: the test fails if the runner requests action verification.
                raise AssertionError("Verification should not run for this scenario")

            verification_command = "uv run pytest -q"
            evidence_item = "Collect verification evidence"
            first_attempt_id = "verification-attempt-1"
            second_attempt_id = "verification-attempt-2"

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            tool_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(readme_path), "content": "updated\n"},
            )
            write_outcome = tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)
            executor = FakeExecutor([write_outcome])
            summary = TurnSummary(final_response="")

            # Seed a definition of done that already holds a passed verification.
            dod = create_definition_of_done("Update README and verify it still works.")
            dod.verification_commands = [verification_command]
            dod.last_verification_result = "passed"
            dod.verification_attempt_counter = 1
            dod.active_verification_attempt_id = first_attempt_id
            dod.active_verification_attempt_number = 1
            dod.evidence = [
                VerificationEvidence(
                    command=verification_command,
                    passed=True,
                    stdout="401 passed",
                    kind="test",
                )
            ]
            dod.completed_items.append(evidence_item)

            events: list[AgentEvent] = []

            async def record_event(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=record_event,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done.
            assert dod.last_verification_result == "stale"
            assert dod.evidence == []
            assert evidence_item in dod.pending_items
            assert evidence_item not in dod.completed_items
            assert dod.active_verification_attempt_id == second_attempt_id
            assert dod.active_verification_attempt_number == 2

            # Most recent workflow-timeline entry and its first observation.
            latest = summary.workflow_timeline[-1]
            assert latest.reason_code == "verification_stale"
            assert latest.policy_outcome == "stale"
            observation = latest.verification_observations[0]
            assert observation.status == "stale"
            assert observation.attempt_id == first_attempt_id
            assert observation.attempt_number == 1
            assert observation.supersedes_attempt_id == second_attempt_id
            assert observation.command == verification_command
      
        5903
        
        5904
        
        5905
        def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
            """The blocked-active-repair nudge mentions the paths from the repair focus.

            An assistant message carrying a "Repair focus" section is placed in the
            context; the queued steering message must name the repair target, the
            missing chapter path, and warn against reopening unrelated references.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: the test fails if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: the test fails if action verification is requested.
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            repair_target = guide_dir / "index.html"
            missing_chapter = guide_dir / "chapters" / "01-getting-started.html"
            repair_focus = (
                "Repair focus:\n"
                f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
                f"- Immediate next step: edit `{repair_target}`.\n"
                f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            runner._queue_blocked_active_repair_nudge(
                "[Blocked - active repair scope: verification already identified the repair target.]"
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert str(missing_chapter) in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        5951
        
        5952
        
        5953
        def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
            temp_dir: Path,
        ) -> None:
            """The blocked-repair-mutation nudge lists the paths named in the repair focus.

            The queued steering message must contain both the repair target and the
            stylesheet path from the assistant's "Repair focus" message, plus the
            "before widening the change set" wording.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: the test fails if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: the test fails if action verification is requested.
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            repair_target = guide_dir / "chapters" / "05-advanced-configurations.html"
            stylesheet = guide_dir / "styles.css"
            repair_focus = (
                "Repair focus:\n"
                f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
                f"- Immediate next step: edit `{repair_target}`.\n"
                f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            runner._queue_blocked_active_repair_mutation_nudge(
                "[Blocked - active repair mutation scope: verification already identified the repair target.]"
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert str(stylesheet) in nudge
            assert "before widening the change set" in nudge
      
        6002
        
        6003
        
        6004
        def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """The late-reference-drift nudge names the planned file that is still missing.

            The implementation plan lists four guide files but only three are written
            to disk; the queued steering message must mention the absent
            ``03-first-website.html`` and the "older reference materials" wording.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: the test fails if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: the test fails if action verification is requested.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            dod_store = DefinitionOfDoneStore(temp_dir)
            dod = create_definition_of_done("Create a multi-file guide from a reference")

            plan_text = (
                "# File Changes\n"
                "- `guide/index.html`\n"
                "- `guide/chapters/01-getting-started.html`\n"
                "- `guide/chapters/02-installation.html`\n"
                "- `guide/chapters/03-first-website.html`\n"
            )
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(plan_text)
            dod.implementation_plan = str(plan_path)

            # Create every planned artifact except the third chapter.
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True, exist_ok=True)
            existing_artifacts = (
                (guide_dir / "index.html", "index"),
                (chapters_dir / "01-getting-started.html", "one"),
                (chapters_dir / "02-installation.html", "two"),
            )
            for artifact, body in existing_artifacts:
                artifact.write_text(body)

            runner = ToolBatchRunner(runtime_context, dod_store)

            runner._queue_blocked_late_reference_drift_nudge(
                "[Blocked - late reference drift: several planned artifacts already exist.]",
                dod=dod,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "03-first-website.html" in nudge
            assert "older reference materials" in nudge
      
        6056
        
        6057
        
        6058
        def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """The completed-artifact-scope nudge steers toward the pending verify todo.

            Every path listed in the implementation plan exists on disk and one
            pending todo has been synced into the definition of done.  After the
            nudge the context's workflow mode must be ``"verify"`` and the queued
            message must contain the todo text and the verification wording asserted
            below.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: the test fails if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: the test fails if action verification is requested.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"

            # Materialise every artifact the plan is going to list.
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            for artifact, body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact.write_text(body)

            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(
                f"- `{planned}`"
                for planned in (guide_root, chapters, index_path, chapter_one, chapter_two)
            )
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file guide from a reference")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            pending_todo = {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(dod, [pending_todo], project_root=temp_dir)

            runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
                dod=dod,
            )

            assert steering_messages
            assert runtime_context.workflow_mode == "verify"
            nudge = steering_messages[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "Verify all guide files are linked and complete" in nudge
            assert "Do not reopen earlier reference materials." in nudge
            assert "Verification should run next" in nudge
      
        6140
        
        6141
        
        6142
        def test_tool_batch_runner_blocked_post_build_audit_nudge_switches_to_verify(
            temp_dir: Path,
        ) -> None:
            """A post-build audit block switches the workflow mode to ``"verify"``.

            Every path listed in the implementation plan exists on disk and no todos
            are synced.  After the completed-artifact-scope nudge is queued with a
            post-build audit message, the context must be in verify mode and the
            steering message must contain the two phrases asserted below.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: the test fails if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: the test fails if action verification is requested.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"

            # Materialise every artifact the plan is going to list.
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            for artifact, body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact.write_text(body)

            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(
                f"- `{planned}`"
                for planned in (guide_root, chapters, index_path, chapter_one, chapter_two)
            )
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file guide from a reference")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]

            runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]",
                dod=dod,
            )

            assert steering_messages
            assert runtime_context.workflow_mode == "verify"
            nudge = steering_messages[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "move to verification or final confirmation" in nudge
      
        6211
        
        6212
        
        6213
        def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
            temp_dir: Path,
        ) -> None:
            """The HTML declared-target nudge surfaces the closest declared target.

            The blocked message includes a "Closest declared local targets" hint; the
            queued steering message must name the file being written, quote
            ``chapters/02-installation.html`` in backticks, and say "same file now".
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: the test fails if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: the test fails if action verification is requested.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            chapter_path = temp_dir / "guide" / "chapters" / "01-introduction.html"
            blocked_call = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": str(chapter_path)},
            )
            blocked_message = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
                "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
                "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
            )

            runner._queue_blocked_html_declared_target_nudge(blocked_call, blocked_message)

            assert steering_messages
            nudge = steering_messages[0]
            assert str(chapter_path) in nudge
            assert "`chapters/02-installation.html`" in nudge
            assert "same file now" in nudge
      
        6261
        
        6262
        
        6263
        def test_tool_batch_runner_blocked_html_declared_target_nudge_without_close_match(
            temp_dir: Path,
        ) -> None:
            """Without a closest-target hint the nudge falls back to the declared set.

            The blocked message lists already-declared targets but no "Closest
            declared local targets" clause; the queued steering message must carry
            the remove-or-keep wording, quote ``chapters/installation.html``, and
            must not contain "closest declared target(s)".
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: the test fails if confidence scoring is requested.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: the test fails if action verification is requested.
                raise AssertionError("Verification should not run in this scenario")

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            chapter_path = temp_dir / "guide" / "chapters" / "introduction.html"
            blocked_call = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": str(chapter_path)},
            )
            blocked_message = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare; remove or replace "
                "undeclared hrefs like: troubleshooting.html. "
                "Already-declared local targets include: chapters/introduction.html, chapters/installation.html, "
                "chapters/configuration.html."
            )

            runner._queue_blocked_html_declared_target_nudge(blocked_call, blocked_message)

            assert steering_messages
            nudge = steering_messages[0]
            assert "Remove the invented hrefs or keep local links within the declared target set" in nudge
            assert "`chapters/installation.html`" in nudge
            assert "closest declared target(s)" not in nudge
      
        6312
        
        6313
        
        6314
        def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_root(
            temp_dir: Path,
        ) -> None:
            """Check the nudge queued when creating an undeclared HTML page is blocked.

            The queued steering message must mention an update, contain the resolved
            guide-root path, name the blocked page, and ask for the file creation to
            be retried.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            steering_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
            )
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            definition_of_done = create_definition_of_done("Create a guide.")

            # Nothing is created on disk here; the paths only feed the tool call
            # and the blocked-result text.
            guide_root_index = (temp_dir / "guide" / "index.html").resolve(strict=False)
            blocked_page = temp_dir / "guide" / "chapters" / "troubleshooting.html"

            blocked_write = ToolCall(
                id="write-troubleshooting",
                name="write",
                arguments={"file_path": str(blocked_page)},
            )
            blocked_message = (
                "[Blocked - HTML file creation falls outside the current declared artifact set] "
                "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
                f"update the guide root `{guide_root_index}` "
                "before creating undeclared sibling pages, for example: chapters/troubleshooting.html. "
                "Already-declared local targets include: chapters/advanced-topics.html, "
                "chapters/basic-usage.html, chapters/configuration.html"
            )

            batch_runner._queue_blocked_html_declared_file_creation_nudge(
                blocked_write,
                blocked_message,
                dod=definition_of_done,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "update" in nudge.lower()
            for expected_fragment in (
                str(guide_root_index),
                "`chapters/troubleshooting.html`",
                "retry the file creation",
            ):
                assert expected_fragment in nudge
      
        6367
        
        6368
        
        6369
        def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
            temp_dir: Path,
        ) -> None:
            """Check the file-creation nudge once every planned artifact is on disk.

            The guide index and both planned chapters are written before the blocked
            write of an extra chapter is reported. The queued message must say the
            planned artifacts exist, forbid adding the extra page, point to
            verification, and must not contain "update the guide root".
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            guide = temp_dir / "guide"
            chapters = guide / "chapters"
            for directory in (guide, chapters):
                directory.mkdir()

            index = guide / "index.html"
            intro_page = chapters / "01-introduction.html"
            install_page = chapters / "02-installation.html"
            planned_artifacts = (index, intro_page, install_page)

            # The index links to both planned chapters plus a parent link.
            index.write_text(
                '<a href="chapters/01-introduction.html">Intro</a>\n'
                '<a href="chapters/02-installation.html">Install</a>\n'
                '<a href="../index.html">Back</a>\n'
            )
            for chapter_page in (intro_page, install_page):
                chapter_page.write_text("<html></html>\n")

            # The plan's "File Changes" section lists exactly the files written above.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{artifact}`" for artifact in planned_artifacts),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            steering_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
            )
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition_of_done = create_definition_of_done("Create a guide.")
            definition_of_done.implementation_plan = str(implementation_plan)
            definition_of_done.verification_commands = [f"ls -la {guide}"]
            definition_of_done.touched_files = [str(artifact) for artifact in planned_artifacts]

            extra_page = chapters / "08-advanced-configuration.html"
            blocked_write = ToolCall(
                id="write-extra",
                name="write",
                arguments={"file_path": str(extra_page)},
            )
            blocked_message = (
                "[Blocked - HTML file creation falls outside the current declared artifact set] "
                "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
                f"update the guide root `{index.resolve(strict=False)}` before creating undeclared sibling pages, "
                "for example: chapters/08-advanced-configuration.html."
            )

            batch_runner._queue_blocked_html_declared_file_creation_nudge(
                blocked_write,
                blocked_message,
                dod=definition_of_done,
            )

            assert steering_messages
            nudge = steering_messages[0]
            for expected_fragment in (
                "All explicitly planned artifacts already exist on disk.",
                "Do not expand the output set with `chapters/08-advanced-configuration.html`.",
                "Move to verification or final confirmation using the files already on disk.",
            ):
                assert expected_fragment in nudge
            assert "update the guide root" not in nudge
      
        6456
        
        6457
        
        6458
        def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
            temp_dir: Path,
        ) -> None:
            """Check the missing-target nudge once every planned artifact is on disk.

            The guide index and both planned chapters are written before a blocked
            edit of the index (linking to a missing chapter) is reported. The queued
            message must say the planned artifacts exist, forbid new local-link
            targets, and ask for the existing generated files to be repaired.
            """

            async def reject_confidence_scoring(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            guide = temp_dir / "guide"
            chapters = guide / "chapters"
            for directory in (guide, chapters):
                directory.mkdir()

            index = guide / "index.html"
            intro_page = chapters / "01-introduction.html"
            install_page = chapters / "02-installation.html"
            planned_artifacts = (index, intro_page, install_page)

            # The index links to both planned chapters plus a parent link.
            index.write_text(
                '<a href="chapters/01-introduction.html">Intro</a>\n'
                '<a href="chapters/02-installation.html">Install</a>\n'
                '<a href="../index.html">Back</a>\n'
            )
            for chapter_page in (intro_page, install_page):
                chapter_page.write_text("<html></html>\n")

            # The plan's "File Changes" section lists exactly the files written above.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{artifact}`" for artifact in planned_artifacts),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            steering_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=reject_confidence_scoring,
                verify_action=reject_verification,
            )
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition_of_done = create_definition_of_done("Create a guide.")
            definition_of_done.implementation_plan = str(implementation_plan)
            definition_of_done.verification_commands = [f"ls -la {guide}"]
            definition_of_done.touched_files = [str(artifact) for artifact in planned_artifacts]

            blocked_edit = ToolCall(
                id="edit-root",
                name="edit",
                arguments={"file_path": str(index)},
            )
            blocked_message = (
                "[Blocked - Edited HTML links point to files that do not exist] "
                "Suggestion: Use only existing local targets for href values and avoid introducing missing links, "
                "for example fix: chapters/08-advanced-configuration.html"
            )

            batch_runner._queue_blocked_html_missing_target_nudge(
                blocked_edit,
                blocked_message,
                dod=definition_of_done,
            )

            assert steering_messages
            nudge = steering_messages[0]
            for expected_fragment in (
                "All explicitly planned artifacts already exist on disk.",
                "Do not introduce new local-link targets beyond the current output set.",
                "Repair the existing generated files instead of expanding the guide.",
            ):
                assert expected_fragment in nudge
      
        6542
        
        6543
        
        6544
        @pytest.mark.asyncio
      
        6545
        async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
      
        6546
            temp_dir: Path,
      
        6547
        ) -> None:
      
        6548
            async def assess_confidence(
      
        6549
                tool_name: str,
      
        6550
                tool_args: dict,
      
        6551
                context: str,
      
        6552
            ) -> ConfidenceAssessment:
      
        6553
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        6554
        
        6555
            async def verify_action(
      
        6556
                tool_name: str,
      
        6557
                tool_args: dict,
      
        6558
                result: str,
      
        6559
                expected: str = "",
      
        6560
            ) -> ActionVerification:
      
        6561
                raise AssertionError("Verification should not run in this scenario")
      
        6562
        
        6563
            guide_root = temp_dir / "guides" / "nginx"
      
        6564
            chapters = guide_root / "chapters"
      
        6565
            chapters.mkdir(parents=True)
      
        6566
            index_path = guide_root / "index.html"
      
        6567
            chapter_one = chapters / "01-introduction.html"
      
        6568
            chapter_two = chapters / "02-installation.html"
      
        6569
            index_path.write_text("<html></html>\n")
      
        6570
            chapter_one.write_text("<h1>Intro</h1>\n")
      
        6571
        
        6572
            implementation_plan = temp_dir / "implementation.md"
      
        6573
            implementation_plan.write_text(
      
        6574
                "\n".join(
      
        6575
                    [
      
        6576
                        "# Implementation Plan",
      
        6577
                        "",
      
        6578
                        "## File Changes",
      
        6579
                        f"- `{index_path}`",
      
        6580
                        f"- `{chapter_one}`",
      
        6581
                        f"- `{chapter_two}`",
      
        6582
                        "",
      
        6583
                    ]
      
        6584
                )
      
        6585
            )
      
        6586
        
        6587
            context = build_context(
      
        6588
                temp_dir=temp_dir,
      
        6589
                messages=[],
      
        6590
                safeguards=FakeSafeguards(),
      
        6591
                assess_confidence=assess_confidence,
      
        6592
                verify_action=verify_action,
      
        6593
                auto_recover=False,
      
        6594
            )
      
        6595
            queued: list[str] = []
      
        6596
            context.queue_steering_message_callback = queued.append
      
        6597
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        6598
            tool_call = ToolCall(
      
        6599
                id="write-2",
      
        6600
                name="write",
      
        6601
                arguments={"file_path": "", "content": "<html></html>\n"},
      
        6602
            )
      
        6603
            blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
      
        6604
            executor = FakeExecutor(
      
        6605
                [
      
        6606
                    ToolExecutionOutcome(
      
        6607
                        tool_call=tool_call,
      
        6608
                        state=ToolExecutionState.BLOCKED,
      
        6609
                        message=Message.tool_result_message(
      
        6610
                            tool_call_id=tool_call.id,
      
        6611
                            display_content=blocked_message,
      
        6612
                            result_content=blocked_message,
      
        6613
                            is_error=True,
      
        6614
                        ),
      
        6615
                        event_content=blocked_message,
      
        6616
                        is_error=True,
      
        6617
                        result_output=blocked_message,
      
        6618
                    )
      
        6619
                ]
      
        6620
            )
      
        6621
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        6622
            dod.implementation_plan = str(implementation_plan)
      
        6623
            dod.touched_files.extend([str(index_path), str(chapter_one)])
      
        6624
            dod.pending_items.append("Creating Chapter 2: Installation and Setup")
      
        6625
        
        6626
            await runner.execute_batch(
      
        6627
                tool_calls=[tool_call],
      
        6628
                tool_source="assistant",
      
        6629
                pending_tool_calls_seen=set(),
      
        6630
                emit=_noop_emit,
      
        6631
                summary=TurnSummary(final_response=""),
      
        6632
                dod=dod,
      
        6633
                executor=executor,  # type: ignore[arg-type]
      
        6634
                on_confirmation=None,
      
        6635
                on_user_question=None,
      
        6636
                emit_confirmation=None,
      
        6637
                consecutive_errors=0,
      
        6638
            )
      
        6639
        
        6640
            assert queued
      
        6641
            assert "did not provide a valid `file_path`" in queued[0]
      
        6642
            assert "Resume by creating `02-installation.html` now." in queued[0]
      
        6643
            assert (
      
        6644
                f"Prefer one `write` call for `{display_runtime_path(chapter_two)}` instead of more rereads."
      
        6645
                in queued[0]
      
        6646
            )
      
        6647
            assert context.recovery_context is not None
      
        6648
            assert context.recovery_context.attempts[-1].error == blocked_message