loader Public

Watch 0 Fork 0 Star 0
Python · 185615 bytes Raw Blame History
  
        1
        """Tests for tool-batch execution on RuntimeContext."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        from types import SimpleNamespace
      
        7
        
        8
        import pytest
      
        9
        
        10
        from loader.llm.base import Message, Role, ToolCall
      
        11
        from loader.runtime.context import RuntimeContext
      
        12
        from loader.runtime.dod import (
      
        13
            DefinitionOfDoneStore,
      
        14
            VerificationEvidence,
      
        15
            create_definition_of_done,
      
        16
        )
      
        17
        from loader.runtime.events import AgentEvent, TurnSummary
      
        18
        from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
      
        19
        from loader.runtime.permissions import (
      
        20
            PermissionMode,
      
        21
            build_permission_policy,
      
        22
            load_permission_rules,
      
        23
        )
      
        24
        from loader.runtime.reasoning_types import (
      
        25
            ActionVerification,
      
        26
            ConfidenceAssessment,
      
        27
            ConfidenceLevel,
      
        28
        )
      
        29
        from loader.runtime.recovery import RecoveryContext
      
        30
        from loader.runtime.tool_batches import (
      
        31
            ToolBatchRunner,
      
        32
        )
      
        33
        from loader.runtime.tool_batches import (
      
        34
            _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
      
        35
        )
      
        36
        from loader.runtime.workflow import sync_todos_to_definition_of_done
      
        37
        from loader.tools.base import ToolResult as RegistryToolResult
      
        38
        from loader.tools.base import create_default_registry
      
        39
        from tests.helpers.runtime_harness import ScriptedBackend
      
        40
        
        41
        
        42
        class FakeSession:
            """Minimal in-memory stand-in for the runtime session used by these tests.

            The conversation is kept in ``messages`` and workflow timeline entries in
            ``workflow_timeline``; both are plain lists that tests inspect directly.
            """

            def __init__(self, messages: list[Message]) -> None:
                # Copy the input so later appends never mutate the caller's list.
                self.messages = list(messages)
                self.workflow_timeline = []

            def append(self, message: Message) -> None:
                """Add ``message`` to the end of the recorded conversation."""
                self.messages.append(message)

            def append_workflow_timeline_entry(self, entry) -> None:
                """Add ``entry`` to the end of the recorded workflow timeline."""
                self.workflow_timeline.append(entry)
      
        52
        
        53
        
        54
        class FakeCodeFilter:
            """Code-filter stub exposing only the ``reset`` hook; it holds no state."""

            def reset(self) -> None:
                """Do nothing; there is no filter state to clear."""
                return None
      
        57
        
        58
        
        59
        class FakeSafeguards:
            """Pass-through safeguards stub for the runtime context.

            Content filters return their input unchanged, steering is always off and
            text-loop detection always reports no loop. Only ``detect_loop`` is
            configurable, through the ``detect_loop_result`` constructor argument.
            """

            def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
                # Opaque placeholders: these tests only need the attributes to exist.
                self.action_tracker = object()
                self.validator = object()
                self.code_filter = FakeCodeFilter()
                self._detect_loop_result = detect_loop_result

            def filter_stream_chunk(self, content: str) -> str:
                """Return the streamed chunk unchanged."""
                return content

            def filter_complete_content(self, content: str) -> str:
                """Return the completed content unchanged."""
                return content

            def should_steer(self) -> bool:
                """Report that no steering is needed."""
                return False

            def get_steering_message(self) -> str | None:
                """Return ``None``; this stub never produces a steering message."""
                return None

            def record_response(self, content: str) -> None:
                """Ignore the response; nothing is recorded."""
                return None

            def detect_text_loop(self, content: str) -> tuple[bool, str]:
                """Always report that no text loop was detected."""
                return False, ""

            def detect_loop(self) -> tuple[bool, str]:
                """Return the canned loop-detection result supplied at construction."""
                return self._detect_loop_result
      
        86
        
        87
        
        88
        class FakeExecutor:
            """Executor stub that replays pre-queued outcomes in FIFO order.

            Every tool call passed to ``execute_tool_call`` is recorded in ``calls``
            so tests can assert which calls were (or were not) executed.
            """

            def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
                # Copy so draining the queue does not mutate the caller's list.
                self._outcomes = list(outcomes)
                self.calls: list[ToolCall] = []

            async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
                """Record ``tool_call`` and return the next queued outcome.

                Extra keyword arguments are accepted and ignored.

                Raises:
                    AssertionError: if no queued outcome is left. The call is
                        still recorded before the error is raised.
                """
                self.calls.append(tool_call)
                if not self._outcomes:
                    raise AssertionError("No fake tool outcome queued")
                return self._outcomes.pop(0)
      
        98
        
        99
        
        100
        def build_context(
            *,
            temp_dir: Path,
            messages: list[Message],
            safeguards: FakeSafeguards,
            assess_confidence,
            verify_action,
            recovery_context: RecoveryContext | None = None,
            confidence_scoring: bool = False,
            verification: bool = False,
            auto_recover: bool = True,
            min_confidence_for_action: int = 3,
        ) -> RuntimeContext:
            """Assemble a ``RuntimeContext`` wired with test doubles.

            The default tool registry and the permission rules are both rooted at
            ``temp_dir``; the permission policy is built in ``WORKSPACE_WRITE`` mode.
            The session is a ``FakeSession`` seeded with ``messages`` and the backend
            is a ``ScriptedBackend``.

            Args:
                temp_dir: Directory used as project root and workspace root.
                messages: Initial conversation for the fake session.
                safeguards: Safeguards stub installed on the context.
                assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
                verify_action: Callable exposed as ``reasoning.verify_action``.
                recovery_context: Optional pre-existing recovery context.
                confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
                verification: Value for ``config.reasoning.verification``.
                auto_recover: Value for ``config.auto_recover``.
                min_confidence_for_action: Value for
                    ``config.reasoning.min_confidence_for_action``.

            Returns:
                The constructed ``RuntimeContext``.
            """
            registry = create_default_registry(temp_dir)
            registry.configure_workspace_root(temp_dir)
            rule_status = load_permission_rules(temp_dir)
            policy = build_permission_policy(
                active_mode=PermissionMode.WORKSPACE_WRITE,
                workspace_root=temp_dir,
                tool_requirements=registry.get_tool_requirements(),
                rules=rule_status.rules,
            )
            # SimpleNamespace objects stand in for the real config and reasoning
            # objects; only the attributes set below are available on them.
            reasoning_config = SimpleNamespace(
                rollback=False,
                show_rollback_plan=False,
                completion_check=True,
                max_continuation_prompts=5,
                self_critique=False,
                confidence_scoring=confidence_scoring,
                min_confidence_for_action=min_confidence_for_action,
                verification=verification,
            )
            config = SimpleNamespace(
                force_react=False,
                max_recovery_attempts=2,
                auto_recover=auto_recover,
                reasoning=reasoning_config,
            )
            return RuntimeContext(
                project_root=temp_dir,
                backend=ScriptedBackend(),
                registry=registry,
                session=FakeSession(messages),  # type: ignore[arg-type]
                config=config,
                capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
                project_context=None,
                permission_policy=policy,
                permission_config_status=rule_status,
                workflow_mode="execute",
                safeguards=safeguards,
                reasoning=SimpleNamespace(
                    assess_confidence=assess_confidence,
                    verify_action=verify_action,
                ),
                recovery_context=recovery_context,
            )
      
        155
        
        156
        
        157
        def tool_outcome(
            *,
            tool_call: ToolCall,
            output: str,
            is_error: bool,
            state: ToolExecutionState = ToolExecutionState.EXECUTED,
            metadata: dict[str, object] | None = None,
        ) -> ToolExecutionOutcome:
            """Build a ``ToolExecutionOutcome`` whose text fields all carry ``output``.

            Args:
                tool_call: The call this outcome answers; its ``id`` links the
                    tool-result message to the call.
                output: Text used for the message display/result content, the event
                    content, the result output and the registry result output.
                is_error: Error flag copied onto the message, the outcome and the
                    registry result.
                state: Execution state to report (``EXECUTED`` by default).
                metadata: Optional registry-result metadata; a falsy value
                    (``None`` or an empty dict) becomes a fresh empty dict.

            Returns:
                The assembled outcome.
            """
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=output,
                result_content=output,
                is_error=is_error,
            )
            registry_result = RegistryToolResult(
                output=output,
                is_error=is_error,
                metadata=metadata or {},
            )
            return ToolExecutionOutcome(
                tool_call=tool_call,
                state=state,
                message=result_message,
                event_content=output,
                is_error=is_error,
                result_output=output,
                registry_result=registry_result,
            )
      
        183
        
        184
        
        185
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
            """A LOW confidence assessment skips the tool call and warns in the session.

            With confidence scoring enabled, the test expects no executed actions,
            no executor calls, a user-role ``[LOW CONFIDENCE WARNING]`` message at
            the end of the session, and a ``confidence`` event.
            """
            # Holds the context string handed to assess_confidence by the runner.
            captured: dict[str, str] = {}

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                captured["context"] = context
                return ConfidenceAssessment(
                    action=f"{tool_name} with {tool_args}",
                    tool_name=tool_name,
                    tool_args=tool_args,
                    level=ConfidenceLevel.LOW,
                    reasoning="Need to inspect the target first.",
                    risks=["Unknown target file"],
                )

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for skipped actions")

            context = build_context(
                temp_dir=temp_dir,
                messages=[
                    Message(role=Role.USER, content="Please inspect the project."),
                    Message(role=Role.ASSISTANT, content="I will read the file next."),
                ],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                confidence_scoring=True,
                min_confidence_for_action=3,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            # An outcome is queued, but the assertions below expect it to stay unused.
            executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="unused", is_error=False)])
            result = await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert result.actions_taken == []
            assert executor.calls == []
            assert "Please inspect the project." in captured["context"]
            assert context.session.messages[-1].role == Role.USER
            assert "[LOW CONFIDENCE WARNING]" in context.session.messages[-1].content
            event_types = [event.type for event in events]
            assert "confidence" in event_types
      
        244
        
        245
        
        246
        @pytest.mark.asyncio
        async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
            """A failing tool call with ``auto_recover=True`` starts recovery tracking.

            The test expects a recovery context to be set on the runtime context,
            the tool result to be the last session message, and a ``recovery`` event.
            """

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for failed actions")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=True,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
            executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="command failed", is_error=True)])
            summary = TurnSummary(final_response="")
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=create_definition_of_done("Run tests"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is not None
            assert summary.tool_result_messages
            assert context.session.messages[-1] == summary.tool_result_messages[-1]
            assert any(event.type == "recovery" for event in events)
      
        289
        
        290
        
        291
        @pytest.mark.asyncio
        async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
            """The ``tool_result`` event carries the registry result's metadata."""

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "python -m http.server 8000", "background": True},
            )
            metadata = {
                "job_id": "bash-1",
                "status": "running",
                "background": True,
            }
            executor = FakeExecutor(
                [
                    tool_outcome(
                        tool_call=tool_call,
                        output="Started bash job bash-1",
                        is_error=False,
                        metadata=metadata,
                    )
                ]
            )
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Launch a preview server"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Use a default: a bare next() on an exhausted generator raises
            # StopIteration, which inside a coroutine is re-raised as an opaque
            # "coroutine raised StopIteration" RuntimeError.
            tool_result = next((event for event in events if event.type == "tool_result"), None)
            assert tool_result is not None, "no tool_result event was emitted"
            assert tool_result.tool_metadata == metadata
      
        349
        
        350
        
        351
        @pytest.mark.asyncio
        async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
            """With verification enabled, a successful call is passed to ``verify_action``.

            The test expects exactly one verification call with the tool output, the
            existing recovery context to be kept and to record the successful step,
            the tool result to be the last session message, and a ``verification``
            event.
            """
            # Collects the `result` argument of every verify_action invocation.
            verification_calls: list[str] = []

            async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
                verification_calls.append(result)
                return ActionVerification(
                    tool_name=tool_name,
                    tool_args=tool_args,
                    expected_outcome="Success",
                    actual_result=result,
                    verified=False,
                    discrepancies=["File contents did not match"],
                    needs_correction=True,
                    correction_suggestion="Read the file before editing again.",
                )

            existing_recovery = RecoveryContext(
                original_tool="edit",
                original_args={"file_path": "README.md"},
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                verification=True,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="file contents", is_error=False)])
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert verification_calls == ["file contents"]
            assert context.recovery_context is existing_recovery
            assert existing_recovery.successful_steps == [
                ("read", {"file_path": "README.md"})
            ]
            assert context.session.messages[-1].role == Role.TOOL
            assert context.session.messages[-1].content == "file contents"
            assert any(event.type == "verification" for event in events)
      
        414
        
        415
        
        416
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
            temp_dir: Path,
        ) -> None:
            """A successful ``bash`` listing keeps the existing recovery context.

            The recovery context already holds one failed ``read`` attempt; after the
            batch the test expects the same object to remain on the runtime context
            with the ``bash`` call recorded as its only successful step.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args={"file_path": "chapters/04-data-types.html"},
            )
            existing_recovery.add_attempt(
                "read",
                {"file_path": "chapters/04-data-types.html"},
                "File not found",
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "ls chapters"},
            )
            executor = FakeExecutor(
                [tool_outcome(tool_call=tool_call, output="01-introduction.html", is_error=False)]
            )

            summary = TurnSummary(final_response="")
            # NOTE(review): `_noop_emit` is not defined in this block; it is expected
            # to come from elsewhere in this module -- confirm.
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is existing_recovery
            assert existing_recovery.successful_steps == [
                ("bash", {"command": "ls chapters"})
            ]
      
        482
        
        483
        
        484
        @pytest.mark.asyncio
        async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``patch`` call clears the existing recovery context.

            The recovery context already holds one failed ``read`` attempt; after the
            batch the test expects ``context.recovery_context`` to be ``None``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args={"file_path": "chapters/04-data-types.html"},
            )
            existing_recovery.add_attempt(
                "read",
                {"file_path": "chapters/04-data-types.html"},
                "File not found",
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(
                id="patch-1",
                name="patch",
                arguments={
                    "file_path": "index.html",
                    "hunks": [{"old_start": 1, "old_lines": 1, "new_start": 1, "new_lines": 1, "lines": ["-a", "+b"]}],
                },
            )
            executor = FakeExecutor(
                [tool_outcome(tool_call=tool_call, output="Patched index.html", is_error=False)]
            )

            summary = TurnSummary(final_response="")
            # NOTE(review): `_noop_emit` is not defined in this block; it is expected
            # to come from elsewhere in this module -- confirm.
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is None
      
        550
        
        551
        
        552
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_duplicate_observation_nudge(
            temp_dir: Path,
        ) -> None:
            """A duplicate ``read`` outcome queues one persistent steering nudge.

            The fake executor reports a ``DUPLICATE`` outcome for a reread of
            ``index.html``. The definition of done points at an implementation plan
            that lists ``chapters/04-variables.html``, which this test never creates.
            The test expects exactly one persistent steering message containing the
            reuse hint and the missing-artifact guidance, and no ephemeral messages.
            """

            async def _fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: this hook must not be reached in this scenario.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: this hook must not be reached in this scenario.
                raise AssertionError("Verification should not run for this scenario")

            index_path = temp_dir / "index.html"
            chapters_dir = temp_dir / "chapters"

            # Prior conversation: a glob listing, one chapter read with its result,
            # and an earlier read of the index.
            glob_observation = (
                "Observation [glob]: Result: "
                f"{temp_dir}/chapters/01-introduction.html\n"
                f"{temp_dir}/chapters/02-setup.html\n"
                f"{temp_dir}/chapters/03-basics.html"
            )
            chapter_one_heading = "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            history = [
                Message(role=Role.TOOL, content=glob_observation, tool_results=[]),
                Message(
                    role=Role.ASSISTANT,
                    content="I already inspected the first chapter title.",
                    tool_calls=[
                        ToolCall(
                            id="read-ch1",
                            name="read",
                            arguments={"file_path": str(chapters_dir / "01-introduction.html")},
                        )
                    ],
                ),
                Message.tool_result_message(
                    tool_call_id="read-ch1",
                    display_content=chapter_one_heading,
                    result_content=chapter_one_heading,
                ),
                Message(
                    role=Role.ASSISTANT,
                    content="I should update the index now.",
                    tool_calls=[
                        ToolCall(
                            id="read-index",
                            name="read",
                            arguments={"file_path": str(index_path)},
                        )
                    ],
                ),
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=_fail_on_confidence,
                verify_action=_fail_on_verification,
                auto_recover=False,
            )

            # On-disk fixture: the index plus chapters 01-03 (04 is left absent).
            chapters_dir.mkdir()
            index_path.write_text("<ul></ul>\n")
            for filename, html in (
                ("01-introduction.html", "<h1>Intro</h1>\n"),
                ("02-setup.html", "<h1>Setup</h1>\n"),
                ("03-basics.html", "<h1>Basics</h1>\n"),
            ):
                (chapters_dir / filename).write_text(html)

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{temp_dir / 'index.html'}`",
                f"- `{temp_dir / 'chapters' / '01-introduction.html'}`",
                f"- `{temp_dir / 'chapters' / '02-setup.html'}`",
                f"- `{temp_dir / 'chapters' / '03-basics.html'}`",
                f"- `{temp_dir / 'chapters' / '04-variables.html'}`",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            context.session.current_task = (
                f"Update {temp_dir / 'index.html'} with the right chapter links."
            )

            # Capture steering messages instead of delivering them.
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            duplicate_read = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{temp_dir / 'index.html'} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=duplicate_read,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=duplicate_read.id,
                    display_content=skip_notice,
                    result_content=skip_notice,
                ),
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            fake_executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Fix the chapter links")
            dod.implementation_plan = str(plan_path)
            dod.pending_items.append("Create the remaining chapter files")

            await runner.execute_batch(
                tool_calls=[duplicate_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_nudges) == 1
            nudge = persistent_nudges[0]
            expected_fragments = (
                "Reuse the earlier observation instead of repeating it.",
                "A declared output artifact is still missing.",
                "Resume by creating `04-variables.html` now.",
                f"Prefer one `write` call for `{temp_dir / 'chapters' / '04-variables.html'}` instead of more rereads.",
            )
            for fragment in expected_fragments:
                assert fragment in nudge
            assert not ephemeral_nudges
      
        700
        
        701
        
        702
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
            temp_dir: Path,
        ) -> None:
            """A stale ``TodoWrite`` must not move a finished file todo back to pending.

            The batch holds a ``write`` for ``03-first-website.html`` followed by a
            ``TodoWrite`` that still lists that item as ``pending``. After the batch,
            the chapter-03 item is expected in ``dod.completed_items`` and absent from
            ``dod.pending_items``, while the chapter-04 item stays pending.
            """

            async def _fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: this hook must not be reached in this scenario.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def _fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: this hook must not be reached in this scenario.
                raise AssertionError("Verification should not run for this scenario")

            def _all_pending_todos() -> list[dict[str, str]]:
                # Fresh dicts on every call so the DoD sync, the tool-call arguments,
                # and the outcome metadata never share mutable state.
                return [
                    {
                        "content": "Create 03-first-website.html",
                        "active_form": "Creating 03-first-website.html",
                        "status": "pending",
                    },
                    {
                        "content": "Create 04-configuration-basics.html",
                        "active_form": "Creating 04-configuration-basics.html",
                        "status": "pending",
                    },
                ]

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_fail_on_confidence,
                verify_action=_fail_on_verification,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            sync_todos_to_definition_of_done(dod, _all_pending_todos())

            # Only the parent directory is created; the write itself is faked below.
            chapters_dir = temp_dir / "guides" / "nginx" / "chapters"
            chapter_path = chapters_dir / "03-first-website.html"
            chapters_dir.mkdir(parents=True)

            write_call = ToolCall(
                id="write-ch3",
                name="write",
                arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
            )
            stale_todo_call = ToolCall(
                id="todo-stale",
                name="TodoWrite",
                arguments={"todos": _all_pending_todos()},
            )

            write_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_path}",
                is_error=False,
            )
            stale_todo_outcome = tool_outcome(
                tool_call=stale_todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": _all_pending_todos()},
            )
            fake_executor = FakeExecutor([write_outcome, stale_todo_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call, stale_todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            finished_item = "Create 03-first-website.html"
            assert finished_item in dod.completed_items
            assert finished_item not in dod.pending_items
            assert "Create 04-configuration-basics.html" in dod.pending_items
      
        819
        
        820
        
        821
        @pytest.mark.asyncio
        async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
            temp_dir: Path,
        ) -> None:
            """A successful chapter ``glob`` adds no steering or inventory text.

            NOTE(review): the test name suggests an inventory is queued, but the
            assertions check the opposite: both steering queues stay empty, exactly
            one tool result message is recorded, and it does not contain
            "Verified chapter inventory:". Confirm whether the name is stale.
            """

            async def _fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: this hook must not be reached in this scenario.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: this hook must not be reached in this scenario.
                raise AssertionError("Verification should not run for this scenario")

            # On-disk fixture: two chapter files and an index with an empty list.
            chapters = temp_dir / "chapters"
            chapters.mkdir()
            intro_path = chapters / "01-introduction.html"
            setup_path = chapters / "02-setup.html"
            intro_path.write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            setup_path.write_text(
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
            )
            (temp_dir / "index.html").write_text("<ul></ul>\n")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_fail_on_confidence,
                verify_action=_fail_on_verification,
                auto_recover=False,
            )
            context.session.current_task = (
                f"Update {temp_dir / 'index.html'} so the chapter links match the sibling files."
            )

            # Capture steering messages instead of delivering them.
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters), "pattern": "*.html"},
            )
            glob_listing = "\n".join([str(intro_path), str(setup_path)])
            fake_executor = FakeExecutor(
                [tool_outcome(tool_call=glob_call, output=glob_listing, is_error=False)]
            )

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_nudges == []
            assert ephemeral_nudges == []
            recorded = turn_summary.tool_result_messages
            assert len(recorded) == 1
            assert "Verified chapter inventory:" not in recorded[0].content
      
        905
        
        906
        
        907
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
            temp_dir: Path,
        ) -> None:
            """A successful TOC ``edit`` adds no preview text or steering messages.

            ``index.html`` is written with the corrected chapter list up front, and
            the fake executor reports the ``edit`` as successful. The assertions
            check that no tool result message contains
            "Semantic verification preview:" and that both steering queues stay empty.

            NOTE(review): the name mentions marking completion, but nothing here
            inspects completion state. Confirm whether the name is stale.
            """

            async def _fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard: this hook must not be reached in this scenario.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard: this hook must not be reached in this scenario.
                raise AssertionError("Verification should not run for this scenario")

            task = "Update index.html so every chapter link and title matches the real HTML files in chapters/."

            # On-disk fixture: two real chapter files.
            chapters = temp_dir / "chapters"
            chapters.mkdir()
            (chapters / "01-introduction.html").write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            (chapters / "02-setup.html").write_text(
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
            )

            stale_toc = (
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
                '    <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
                "</ul>\n"
            )
            fresh_toc = (
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                "</ul>\n"
            )
            # The executor is fake, so the file is written in its post-edit form.
            index_path = temp_dir / "index.html"
            index_path.write_text(fresh_toc)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_fail_on_confidence,
                verify_action=_fail_on_verification,
                auto_recover=False,
            )
            context.session.current_task = task

            # Capture steering messages instead of delivering them.
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_path),
                    "old_string": stale_toc,
                    "new_string": fresh_toc,
                },
            )
            edit_outcome = tool_outcome(
                tool_call=edit_call,
                output=f"Successfully edited {index_path}",
                is_error=False,
            )
            fake_executor = FakeExecutor([edit_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=create_definition_of_done(task),
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not any(
                "Semantic verification preview:" in message.content
                for message in turn_summary.tool_result_messages
            )
            assert persistent_nudges == []
            assert ephemeral_nudges == []
      
        1007
        
        1008
        
        1009
        @pytest.mark.asyncio
      
        1010
        async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
      
        1011
            temp_dir: Path,
      
        1012
        ) -> None:
      
        1013
            async def assess_confidence(
      
        1014
                tool_name: str,
      
        1015
                tool_args: dict,
      
        1016
                context: str,
      
        1017
            ) -> ConfidenceAssessment:
      
        1018
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        1019
        
        1020
            async def verify_action(
      
        1021
                tool_name: str,
      
        1022
                tool_args: dict,
      
        1023
                result: str,
      
        1024
                expected: str = "",
      
        1025
            ) -> ActionVerification:
      
        1026
                raise AssertionError("Verification should not run for this scenario")
      
        1027
        
        1028
            chapters = temp_dir / "chapters"
      
        1029
            chapters.mkdir()
      
        1030
            (chapters / "01-introduction.html").write_text(
      
        1031
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
      
        1032
            )
      
        1033
            (chapters / "02-setup.html").write_text(
      
        1034
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
      
        1035
            )
      
        1036
            index_path = temp_dir / "index.html"
      
        1037
            index_path.write_text(
      
        1038
                "<h2>Table of Contents</h2>\n"
      
        1039
                '<ul class="chapter-list">\n'
      
        1040
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
      
        1041
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
      
        1042
                "</ul>\n"
      
        1043
            )
      
        1044
        
        1045
            prompt = (
      
        1046
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
      
        1047
                "for the structure and cadence of the guide. We are going to make an all "
      
        1048
                "new equally thorough guide on how to use the nginx tool."
      
        1049
            )
      
        1050
        
        1051
            context = build_context(
      
        1052
                temp_dir=temp_dir,
      
        1053
                messages=[],
      
        1054
                safeguards=FakeSafeguards(),
      
        1055
                assess_confidence=assess_confidence,
      
        1056
                verify_action=verify_action,
      
        1057
                auto_recover=False,
      
        1058
            )
      
        1059
            context.session.current_task = prompt  # type: ignore[attr-defined]
      
        1060
            persistent_messages: list[str] = []
      
        1061
            ephemeral_messages: list[str] = []
      
        1062
            context.queue_steering_message_callback = persistent_messages.append
      
        1063
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        1064
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        1065
            tool_call = ToolCall(
      
        1066
                id="read-index",
      
        1067
                name="read",
      
        1068
                arguments={"file_path": str(index_path)},
      
        1069
            )
      
        1070
            executor = FakeExecutor(
      
        1071
                [
      
        1072
                    tool_outcome(
      
        1073
                        tool_call=tool_call,
      
        1074
                        output=index_path.read_text(),
      
        1075
                        is_error=False,
      
        1076
                    )
      
        1077
                ]
      
        1078
            )
      
        1079
        
        1080
            summary = TurnSummary(final_response="")
      
        1081
            await runner.execute_batch(
      
        1082
                tool_calls=[tool_call],
      
        1083
                tool_source="assistant",
      
        1084
                pending_tool_calls_seen=set(),
      
        1085
                emit=_noop_emit,
      
        1086
                summary=summary,
      
        1087
                dod=create_definition_of_done(prompt),
      
        1088
                executor=executor,  # type: ignore[arg-type]
      
        1089
                on_confirmation=None,
      
        1090
                on_user_question=None,
      
        1091
                emit_confirmation=None,
      
        1092
                consecutive_errors=0,
      
        1093
            )
      
        1094
        
        1095
            assert persistent_messages == []
      
        1096
            assert ephemeral_messages == []
      
        1097
            assert all(
      
        1098
                "Semantic verification preview:" not in message.content
      
        1099
                for message in summary.tool_result_messages
      
        1100
            )
      
        1101
        
        1102
        
        1103
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
            temp_dir: Path,
        ) -> None:
            """A successful reference read marks the discovery todo done and steers onward.

            The implementation plan lists the ``chapters/`` directory before
            ``index.html``. After one successful ``read`` of the Fortran reference
            chapter the test expects the first todo in ``dod.completed_items``, a
            persistent steering message naming the next pending todo and telling the
            agent to create ``chapters/``, no mention of the reference file name, and
            no ephemeral steering messages.
            """

            # Both hooks fail loudly: this scenario must never reach them.
            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Reference chapter that the fake read targets.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            # Plan file: directory entry first, index file second.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture everything the runner queues as steering.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_path)
            discovery_todo = (
                "Examine the existing Fortran guide structure to understand the cadence and format"
            )
            todos = [
                {
                    "content": discovery_todo,
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Working on: Create the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            scripted_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert discovery_todo in dod.completed_items

            next_item_hint = (
                "Continue with the next pending item: `Create the nginx directory structure`"
            )
            assert any(next_item_hint in queued for queued in persistent)
            assert any("Resume by creating `chapters/` now." in queued for queued in persistent)
            # The reference file name must not appear in any steering message.
            assert not any("01-introduction.html" in queued for queued in persistent)
            assert ephemeral == []
      
        1221
        
        1222
        
        1223
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
            temp_dir: Path,
        ) -> None:
            """Steering still points at ``chapters/`` when the plan lists ``index.html`` first.

            The implementation plan puts the index file ahead of the chapters
            directory. After one successful ``read`` of the reference chapter the
            test expects persistent steering that names the next pending todo and
            says to create ``chapters/``, never "Next step: create `index.html`.",
            and no ephemeral steering messages.
            """

            # Both hooks fail loudly: this scenario must never reach them.
            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Reference chapter that the fake read targets.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            # Plan file: index file first, directory entry second.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{chapters}/`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture everything the runner queues as steering.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_path)
            todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Working on: Create the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(
                dod,
                todos,
                project_root=temp_dir,
            )

            read_call = ToolCall(
                id="read-reference-index-first",
                name="read",
                arguments={"file_path": str(reference)},
            )
            scripted_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent

            next_item_hint = (
                "Continue with the next pending item: `Create the nginx directory structure`"
            )
            assert any(next_item_hint in queued for queued in persistent)
            assert any("Resume by creating `chapters/` now." in queued for queued in persistent)
            # The index file must not be suggested as the next step.
            assert not any("Next step: create `index.html`." in queued for queued in persistent)
            assert ephemeral == []
      
        1342
        
        1343
        
        1344
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
            temp_dir: Path,
        ) -> None:
            """A duplicate reference read yields one steering message aimed at the next todo.

            The executor reports the ``read`` as ``ToolExecutionState.DUPLICATE``
            while the discovery todo is already completed. The test expects exactly
            one persistent steering message that tells the agent to reuse the earlier
            observation and continue with the next pending todo, contains no
            "Update `" text, and is accompanied by no ephemeral steering messages.
            """

            # Both hooks fail loudly: this scenario must never reach them.
            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            reference = temp_dir / "fortran" / "index.html"
            reference.parent.mkdir(parents=True)
            reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")

            # Prior conversation already holds the observation from an earlier read.
            earlier_observation = Message(
                role=Role.TOOL,
                content=(
                    "Observation [read]: Result: "
                    "<h1>Fortran Beginner's Guide</h1>\n"
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[earlier_observation],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )
            runtime_context.session.current_task = prompt

            # Capture everything the runner queues as steering.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done(prompt)
            todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "completed",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Working on: Create the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(reference)},
            )
            duplicate_message = (
                "[Skipped - duplicate action: Already read "
                f"{reference} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            # The same text is used for the tool message, the event and the raw output.
            skipped_result = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=duplicate_message,
                result_content=duplicate_message,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=skipped_result,
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent) == 1
            steering = persistent[0]
            assert "Reuse the earlier observation instead of repeating it." in steering
            assert (
                "Continue with the next pending item: `Create the nginx directory structure`"
                in steering
            )
            assert "Update `" not in steering
            assert ephemeral == []
      
        1466
        
        1467
        
        1468
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """A planned-but-missing file outranks the next todo in post-read steering.

            The nginx guide already has ``chapters/01-introduction.html`` on disk,
            while the planned ``index.html`` does not exist. After one successful
            ``read`` of the Fortran reference chapter the test expects persistent
            steering that confirms the discovery todo, says to create ``index.html``,
            does not fall back to the generic "next pending item" hint for the
            chapter todo, and is accompanied by no ephemeral steering messages.
            """

            # Both hooks fail loudly: this scenario must never reach them.
            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Partially built guide: first chapter exists, index.html is never written.
            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            chapter_one = chapters / "01-introduction.html"
            chapter_one.write_text("<html></html>\n")
            index_path = guide_root / "index.html"

            reference = (
                temp_dir / "Loader" / "guides" / "fortran" / "chapters" / "01-introduction.html"
            )
            reference.parent.mkdir(parents=True, exist_ok=True)
            reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapters / '02-installation.html'}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture everything the runner queues as steering.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.touched_files.append(str(chapter_one))
            todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the format and cadence",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the format and cadence",
                    "status": "pending",
                },
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Working on: Create each chapter file with appropriate content",
                    "status": "pending",
                },
                {
                    "content": "Ensure all files follow the same structure and style as the Fortran guide",
                    "active_form": "Working on: Ensure all files follow the same structure and style as the Fortran guide",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-reference-chapter",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_output = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            # The same text is used for the tool message, the event and the raw output.
            read_result = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=read_output,
                result_content=read_output,
            )
            executed_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.EXECUTED,
                message=read_result,
                event_content=read_output,
                is_error=False,
                result_output=read_output,
            )
            executor = FakeExecutor([executed_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent

            confirmed_hint = (
                "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
            )
            assert any(confirmed_hint in queued for queued in persistent)
            assert any("Resume by creating `index.html` now." in queued for queued in persistent)

            # The generic next-todo hint must not appear for the chapter todo.
            generic_hint = (
                "Continue with the next pending item: `Create each chapter file with appropriate content`"
            )
            assert not any(generic_hint in queued for queued in persistent)
            assert ephemeral == []
      
        1602
        
        1603
        
        1604
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
            temp_dir: Path,
        ) -> None:
            """Check steering after a duplicate read once every planned file is on disk.

            Every path listed in the implementation plan is created up front, while
            the pending items also mention a chapter that the plan never lists. A
            read reported as DUPLICATE must queue exactly one persistent steering
            message that names the verification item and does not mention the
            unplanned chapter; nothing may be queued on the ephemeral channel.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook means the runner took an unexpected path.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook means the runner took an unexpected path.
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the guide tree so that every planned artifact already exists.
            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            guide_dir.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = guide_dir / "index.html"
            first_chapter = chapters_dir / "01-getting-started.html"
            second_chapter = chapters_dir / "02-installation.html"
            for artifact, html in (
                (index_file, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            # The plan lists the two directories followed by the three files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{directory}/`" for directory in (guide_dir, chapters_dir)]
            plan_lines += [
                f"- `{artifact}`" for artifact in (index_file, first_chapter, second_chapter)
            ]
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Verify all guide files are linked and complete",
                "Complete the requested work",
            ]

            duplicate_read = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {first_chapter} recently "
                "without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skip_result = Message.tool_result_message(
                tool_call_id=duplicate_read.id,
                display_content=skip_notice,
                result_content=skip_notice,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=duplicate_read,
                state=ToolExecutionState.DUPLICATE,
                message=skip_result,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[duplicate_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_persistent) == 1
            steering = queued_persistent[0]
            assert "Verify all guide files are linked and complete" in steering
            assert "Create 07-performance-tuning.html" not in steering
            assert queued_ephemeral == []
      
        1718
        
        1719
        
        1720
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Check that a duplicate read on a finished plan hands off to verification.

            All planned artifacts exist on disk and the definition of done carries
            a verification command, but no pending item mentions verification. A
            read reported as DUPLICATE must queue exactly one persistent steering
            message saying that the planned artifacts exist and that the next step
            is verification or final confirmation. The message must not mention
            the unplanned chapter, and the ephemeral channel must stay empty.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook means the runner took an unexpected path.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook means the runner took an unexpected path.
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the guide tree so that every planned artifact already exists.
            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            guide_dir.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = guide_dir / "index.html"
            first_chapter = chapters_dir / "01-getting-started.html"
            second_chapter = chapters_dir / "02-installation.html"
            for artifact, html in (
                (index_file, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            # The plan lists the two directories followed by the three files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{directory}/`" for directory in (guide_dir, chapters_dir)]
            plan_lines += [
                f"- `{artifact}`" for artifact in (index_file, first_chapter, second_chapter)
            ]
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.verification_commands = [f"ls -la {guide_dir}"]
            dod.pending_items = [
                "Create 07-performance-tuning.html",
                "Complete the requested work",
            ]

            duplicate_read = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {first_chapter} recently "
                "without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skip_result = Message.tool_result_message(
                tool_call_id=duplicate_read.id,
                display_content=skip_notice,
                result_content=skip_notice,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=duplicate_read,
                state=ToolExecutionState.DUPLICATE,
                message=skip_result,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[duplicate_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_persistent) == 1
            steering = queued_persistent[0]
            assert "All explicitly planned artifacts already exist." in steering
            assert (
                "Move to verification or final confirmation using the files already on disk."
                in steering
            )
            assert "Create 07-performance-tuning.html" not in steering
            assert queued_ephemeral == []
      
        1838
        
        1839
        
        1840
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
            temp_dir: Path,
        ) -> None:
            """Check that stale "create" todos for existing files are not resurfaced.

            The pending items still name two chapter files that are already on
            disk and listed in the implementation plan. After a read reported as
            DUPLICATE, the single persistent steering message must say that the
            planned artifacts exist and point to verification or final
            confirmation. It must quote neither stale creation item, and the
            ephemeral channel must stay empty.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook means the runner took an unexpected path.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook means the runner took an unexpected path.
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the guide tree so that every planned artifact already exists.
            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            guide_dir.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = guide_dir / "index.html"
            first_chapter = chapters_dir / "01-getting-started.html"
            second_chapter = chapters_dir / "02-installation.html"
            for artifact, html in (
                (index_file, "<html></html>\n"),
                (first_chapter, "<h1>One</h1>\n"),
                (second_chapter, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(html)

            # The plan lists the two directories followed by the three files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{directory}/`" for directory in (guide_dir, chapters_dir)]
            plan_lines += [
                f"- `{artifact}`" for artifact in (index_file, first_chapter, second_chapter)
            ]
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.verification_commands = [f"ls -la {guide_dir}"]
            # Both creation items refer to files that were written above.
            dod.pending_items = [
                "Create 01-getting-started.html",
                "Creating 02-installation.html",
                "Complete the requested work",
            ]

            duplicate_read = ToolCall(
                id="read-dup-built-stale",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            skip_notice = (
                f"[Skipped - duplicate action: Already read {first_chapter} recently "
                "without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skip_result = Message.tool_result_message(
                tool_call_id=duplicate_read.id,
                display_content=skip_notice,
                result_content=skip_notice,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=duplicate_read,
                state=ToolExecutionState.DUPLICATE,
                message=skip_result,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[duplicate_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_persistent) == 1
            steering = queued_persistent[0]
            assert "All explicitly planned artifacts already exist." in steering
            assert (
                "Move to verification or final confirmation using the files already on disk."
                in steering
            )
            assert "Create 01-getting-started.html" not in steering
            assert "Creating 02-installation.html" not in steering
            assert queued_ephemeral == []
      
        1960
        
        1961
        
        1962
        @pytest.mark.asyncio
        async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
            temp_dir: Path,
        ) -> None:
            """Check that a successful reference read steers toward the next creation todo.

            Two pending todos are synced into the definition of done: an
            "examine" item followed by a "create index.html" item. After a
            successful read of the reference chapter, some persistent steering
            message must name the creation item as the next pending item, and
            some persistent message must tell the agent to stop gathering
            reference material and perform the change. The ephemeral channel
            must stay empty.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook means the runner took an unexpected path.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook means the runner took an unexpected path.
                raise AssertionError("Verification should not run for this scenario")

            # The same HTML is written to disk and returned as the fake read output.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")

            # Each todo starts as pending; its active form is the content with a fixed prefix.
            todo_contents = (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "Create the nginx index.html file",
            )
            pending_todos = [
                {
                    "content": content,
                    "active_form": f"Working on: {content}",
                    "status": "pending",
                }
                for content in todo_contents
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            reference_read = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_outcome = tool_outcome(
                tool_call=reference_read,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[reference_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            next_item_hint = "Continue with the next pending item: `Create the nginx index.html file`"
            act_now_hint = "stop gathering more reference material and perform the change now"
            assert any(next_item_hint in queued for queued in queued_persistent)
            assert any(act_now_hint in queued for queued in queued_persistent)
            assert queued_ephemeral == []
      
        2054
        
        2055
        
        2056
        @pytest.mark.asyncio
      
        2057
        async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
      
        2058
            temp_dir: Path,
      
        2059
        ) -> None:
      
        2060
            async def assess_confidence(
      
        2061
                tool_name: str,
      
        2062
                tool_args: dict,
      
        2063
                context: str,
      
        2064
            ) -> ConfidenceAssessment:
      
        2065
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        2066
        
        2067
            async def verify_action(
      
        2068
                tool_name: str,
      
        2069
                tool_args: dict,
      
        2070
                result: str,
      
        2071
                expected: str = "",
      
        2072
            ) -> ActionVerification:
      
        2073
                raise AssertionError("Verification should not run for this scenario")
      
        2074
        
        2075
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
      
        2076
            reference.parent.mkdir(parents=True)
      
        2077
            reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")
      
        2078
        
        2079
            context = build_context(
      
        2080
                temp_dir=temp_dir,
      
        2081
                messages=[],
      
        2082
                safeguards=FakeSafeguards(),
      
        2083
                assess_confidence=assess_confidence,
      
        2084
                verify_action=verify_action,
      
        2085
                auto_recover=False,
      
        2086
            )
      
        2087
            persistent_messages: list[str] = []
      
        2088
            ephemeral_messages: list[str] = []
      
        2089
            context.queue_steering_message_callback = persistent_messages.append
      
        2090
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        2091
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        2092
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        2093
            sync_todos_to_definition_of_done(
      
        2094
                dod,
      
        2095
                [
      
        2096
                    {
      
        2097
                        "content": "First, examine the existing fortran guide structure and content",
      
        2098
                        "active_form": "Working on: First, examine the existing fortran guide structure and content",
      
        2099
                        "status": "pending",
      
        2100
                    },
      
        2101
                    {
      
        2102
                        "content": "Create the nginx directory structure",
      
        2103
                        "active_form": "Working on: Create the nginx directory structure",
      
        2104
                        "status": "pending",
      
        2105
                    },
      
        2106
                ],
      
        2107
            )
      
        2108
            tool_call = ToolCall(
      
        2109
                id="read-reference",
      
        2110
                name="read",
      
        2111
                arguments={"file_path": str(reference)},
      
        2112
            )
      
        2113
            executor = FakeExecutor(
      
        2114
                [
      
        2115
                    tool_outcome(
      
        2116
                        tool_call=tool_call,
      
        2117
                        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
      
        2118
                        is_error=False,
      
        2119
                    )
      
        2120
                ]
      
        2121
            )
      
        2122
        
        2123
            summary = TurnSummary(final_response="")
      
        2124
            await runner.execute_batch(
      
        2125
                tool_calls=[tool_call],
      
        2126
                tool_source="assistant",
      
        2127
                pending_tool_calls_seen=set(),
      
        2128
                emit=_noop_emit,
      
        2129
                summary=summary,
      
        2130
                dod=dod,
      
        2131
                executor=executor,  # type: ignore[arg-type]
      
        2132
                on_confirmation=None,
      
        2133
                on_user_question=None,
      
        2134
                emit_confirmation=None,
      
        2135
                consecutive_errors=0,
      
        2136
            )
      
        2137
        
        2138
            assert persistent_messages
      
        2139
            assert any(
      
        2140
                "Continue with the next pending item: `Create the nginx directory structure`"
      
        2141
                in message
      
        2142
                for message in persistent_messages
      
        2143
            )
      
        2144
            assert ephemeral_messages == []
      
        2145
        
        2146
        
        2147
        @pytest.mark.asyncio
        async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
            temp_dir: Path,
        ) -> None:
            """A successful setup ``mkdir -p`` queues a persistent nudge naming the next file.

            The implementation plan lists the chapters directory and ``index.html``.
            After the bash call succeeds, the last persistent steering message must
            say directory setup is complete, point at the next pending todo, and
            tell the model to create ``index.html``. Nothing may be queued as an
            ephemeral message.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if the runner consults confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if the runner tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # Planned artifacts; none of them exist on disk before the batch runs.
            guide_dir = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{guide_dir / 'index.html'}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Create the nginx directory structure",
                        "Creating the nginx directory structure",
                    ),
                    (
                        "Develop the main index.html file with proper structure",
                        "Developing the main index.html file with proper structure",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            mkdir_call = ToolCall(
                id="mkdir-nginx",
                name="bash",
                arguments={"command": f"mkdir -p {chapters_dir}"},
            )
            scripted_outcome = tool_outcome(
                tool_call=mkdir_call,
                output="",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_persistent
            nudge = queued_persistent[-1]
            for expected_fragment in (
                "Directory setup is complete.",
                "Continue with the next pending item: `Develop the main index.html file with proper structure`.",
                "Resume by creating `index.html` now.",
            ):
                assert expected_fragment in nudge
            assert queued_ephemeral == []
      
        2249
        
        2250
        
        2251
        @pytest.mark.asyncio
        async def test_tool_batch_runner_first_chapter_handoff_becomes_ephemeral_after_first_file(
            temp_dir: Path,
        ) -> None:
            """Writing ``index.html`` hands off to the first chapter via an ephemeral message.

            The plan lists the chapters directory, ``index.html`` and
            ``chapters/01-introduction.html``. After the scripted successful
            ``write`` of the index, no persistent steering message may be queued,
            and the last ephemeral message must confirm progress and direct a
            single ``write`` call at the resolved first-chapter path.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if the runner consults confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if the runner tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # Only the chapters directory exists up front; both files are still missing.
            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"
            first_chapter = chapters_dir / "01-introduction.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{first_chapter}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Create the main index.html file with proper structure",
                        "Creating the main index.html file with proper structure",
                    ),
                    (
                        "Create each chapter file with appropriate content",
                        "Creating each chapter file with appropriate content",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            write_call = ToolCall(
                id="write-index",
                name="write",
                arguments={
                    "file_path": str(index_file),
                    "content": "<html></html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_file}",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_persistent == []
            assert queued_ephemeral
            handoff = queued_ephemeral[-1]
            # The message is expected to quote the resolved (absolute) chapter path.
            resolved_chapter = first_chapter.resolve(strict=False)
            for expected_fragment in (
                "Confirmed progress:",
                "Next step: create `01-introduction.html`.",
                f"Prefer one `write(file_path=..., content=...)` call for `{resolved_chapter}` now.",
                "Do not reread reference material or spend the next turn on bookkeeping.",
            ):
                assert expected_fragment in handoff
      
        2364
        
        2365
        
        2366
        @pytest.mark.asyncio
        async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Rereading a just-written file is redirected to the next missing artifact.

            ``index.html`` already exists on disk and is recorded in the DoD's
            touched files. When the model issues a ``read`` of that same file, the
            last persistent steering message must say its contents are already
            known, tell the model to create ``01-introduction.html``, and warn
            against another reread or a TodoWrite-only turn. Nothing may be queued
            as an ephemeral message.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if the runner consults confidence scoring.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if the runner tries to verify the action.
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"
            # The index links to two chapter files that do not exist yet.
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_lines) + "\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_dir}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{chapters_dir / '01-introduction.html'}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # Seed the DoD directly: index already touched, chapter work still pending.
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.touched_files.append(str(index_file))
            dod.completed_items.append("Develop the main index.html file for the nginx guide")
            dod.pending_items.append("Create chapter files for the nginx guide")

            reread_call = ToolCall(
                id="read-index-self-audit",
                name="read",
                arguments={"file_path": str(index_file)},
            )
            scripted_outcome = tool_outcome(
                tool_call=reread_call,
                output="1\t<html>\n",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[reread_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_persistent
            redirect = queued_persistent[-1]
            for expected_fragment in (
                "You already have the current contents of `index.html` from the successful write.",
                "Resume by creating `01-introduction.html` now.",
                "Do not spend another turn rereading the file you just wrote or on TodoWrite alone.",
            ):
                assert expected_fragment in redirect
            assert queued_ephemeral == []
      
        2472
        
        2473
        
        2474
        @pytest.mark.asyncio
        async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt(
            temp_dir: Path,
        ) -> None:
            """The first-file handoff stays ephemeral when history holds a recovery prompt.

            The conversation is seeded with a user message starting with
            ``[EMPTY ASSISTANT RESPONSE]``. After the scripted successful ``write``
            of ``index.html``, no persistent steering message may be queued, and
            the last ephemeral message must name ``01-introduction.html`` as the
            next step.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: the test fails if the runner consults confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: the test fails if the runner tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # Only the chapters directory exists up front; both files are still missing.
            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{chapters_dir / '01-introduction.html'}`",
                "",
            ]
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            # Prior turn: the prompt sent after the assistant returned an empty response.
            recovery_prompt = Message(
                role=Role.USER,
                content=(
                    "[EMPTY ASSISTANT RESPONSE]\n"
                    "Respond with that concrete mutation tool call now. Do not return an empty response."
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[recovery_prompt],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so the assertions can tell them apart.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            pending_todos = [
                {"content": content, "active_form": active_form, "status": "pending"}
                for content, active_form in (
                    (
                        "Create the main index.html file with proper structure",
                        "Creating the main index.html file with proper structure",
                    ),
                    (
                        "Create each chapter file with appropriate content",
                        "Creating each chapter file with appropriate content",
                    ),
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            write_call = ToolCall(
                id="write-index-recovered",
                name="write",
                arguments={
                    "file_path": str(index_file),
                    "content": "<html></html>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_file}",
                is_error=False,
            )
            fake_executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_persistent == []
            assert queued_ephemeral
            handoff = queued_ephemeral[-1]
            assert "Next step: create `01-introduction.html`." in handoff
      
        2589
        
        2590
        
        2591
        @pytest.mark.asyncio
      
        2592
        async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
      
        2593
            temp_dir: Path,
      
        2594
        ) -> None:
      
        2595
            async def assess_confidence(
      
        2596
                tool_name: str,
      
        2597
                tool_args: dict,
      
        2598
                context: str,
      
        2599
            ) -> ConfidenceAssessment:
      
        2600
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        2601
        
        2602
            async def verify_action(
      
        2603
                tool_name: str,
      
        2604
                tool_args: dict,
      
        2605
                result: str,
      
        2606
                expected: str = "",
      
        2607
            ) -> ActionVerification:
      
        2608
                raise AssertionError("Verification should not run in this scenario")
      
        2609
        
        2610
            guide_root = temp_dir / "guides" / "nginx"
      
        2611
            chapters = guide_root / "chapters"
      
        2612
            chapters.mkdir(parents=True)
      
        2613
            index_path = guide_root / "index.html"
      
        2614
            index_path.write_text(
      
        2615
                "\n".join(
      
        2616
                    [
      
        2617
                        "<html>",
      
        2618
                        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
      
        2619
                        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
      
        2620
                        "</html>",
      
        2621
                    ]
      
        2622
                )
      
        2623
                + "\n"
      
        2624
            )
      
        2625
        
        2626
            implementation_plan = temp_dir / "implementation.md"
      
        2627
            implementation_plan.write_text(
      
        2628
                "\n".join(
      
        2629
                    [
      
        2630
                        "# Implementation Plan",
      
        2631
                        "",
      
        2632
                        "## File Changes",
      
        2633
                        f"- `{guide_root}/`",
      
        2634
                        f"- `{chapters}/`",
      
        2635
                        f"- `{index_path}`",
      
        2636
                        "",
      
        2637
                    ]
      
        2638
                )
      
        2639
            )
      
        2640
        
        2641
            context = build_context(
      
        2642
                temp_dir=temp_dir,
      
        2643
                messages=[],
      
        2644
                safeguards=FakeSafeguards(),
      
        2645
                assess_confidence=assess_confidence,
      
        2646
                verify_action=verify_action,
      
        2647
            )
      
        2648
            queued_messages: list[str] = []
      
        2649
            context.queue_steering_message_callback = queued_messages.append
      
        2650
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        2651
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        2652
            dod.implementation_plan = str(implementation_plan)
      
        2653
            dod.touched_files.append(str(index_path))
      
        2654
            sync_todos_to_definition_of_done(
      
        2655
                dod,
      
        2656
                [
      
        2657
                    {
      
        2658
                        "content": "Develop the main index.html file with proper structure",
      
        2659
                        "active_form": "Developing the main index.html file with proper structure",
      
        2660
                        "status": "completed",
      
        2661
                    },
      
        2662
                    {
      
        2663
                        "content": "Create chapter files with content and structure",
      
        2664
                        "active_form": "Creating chapter files with content and structure",
      
        2665
                        "status": "pending",
      
        2666
                    },
      
        2667
                ],
      
        2668
            )
      
        2669
        
        2670
            todos = [
      
        2671
                {
      
        2672
                    "content": "Develop the main index.html file with proper structure",
      
        2673
                    "active_form": "Developing the main index.html file with proper structure",
      
        2674
                    "status": "completed",
      
        2675
                },
      
        2676
                {
      
        2677
                    "content": "Create chapter files with content and structure",
      
        2678
                    "active_form": "Creating chapter files with content and structure",
      
        2679
                    "status": "pending",
      
        2680
                },
      
        2681
            ]
      
        2682
            tool_call = ToolCall(
      
        2683
                id="todo-aggregate",
      
        2684
                name="TodoWrite",
      
        2685
                arguments={"todos": todos},
      
        2686
            )
      
        2687
            executor = FakeExecutor(
      
        2688
                [
      
        2689
                    tool_outcome(
      
        2690
                        tool_call=tool_call,
      
        2691
                        output="Todos updated",
      
        2692
                        is_error=False,
      
        2693
                        metadata={"new_todos": todos},
      
        2694
                    )
      
        2695
                ]
      
        2696
            )
      
        2697
        
        2698
            summary = TurnSummary(final_response="")
      
        2699
            await runner.execute_batch(
      
        2700
                tool_calls=[tool_call],
      
        2701
                tool_source="assistant",
      
        2702
                pending_tool_calls_seen=set(),
      
        2703
                emit=_noop_emit,
      
        2704
                summary=summary,
      
        2705
                dod=dod,
      
        2706
                executor=executor,  # type: ignore[arg-type]
      
        2707
                on_confirmation=None,
      
        2708
                on_user_question=None,
      
        2709
                emit_confirmation=None,
      
        2710
                consecutive_errors=0,
      
        2711
            )
      
        2712
        
        2713
            assert queued_messages
      
        2714
            message = queued_messages[-1]
      
        2715
            assert "Continue with the next concrete output: `01-introduction.html`." in message
      
        2716
            assert "Resume by creating `01-introduction.html` now." in message
      
        2717
            assert (
      
        2718
                "Continue with the next pending item: `Create chapter files with content and structure`."
      
        2719
                not in message
      
        2720
            )
      
        2721
        
        2722
        
        2723
        @pytest.mark.asyncio
        async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
            temp_dir: Path,
        ) -> None:
            """Duplicate-read nudge names the missing planned file, not the review todo.

            The plan lists three files; only the index and the first chapter are
            written to disk. The first pending todo is a review/consistency item,
            yet the persistent steering message must mention
            ``06-ssl-configuration.html`` and must not tell the agent to continue
            with the review item.
            """

            # Both reasoning hooks fail loudly if the runner ever invokes them.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # On-disk layout: the index and chapter one exist, the SSL chapter does not.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            missing_chapter = chapters / "06-ssl-configuration.html"
            chapter_one.write_text("<h1>One</h1>\n")
            index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{missing_chapter}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels; only the persistent one is asserted on.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todo_items = [
                {
                    "content": "Ensure all files are properly linked and formatted consistently",
                    "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                    "status": "pending",
                },
                {
                    "content": "Create the final chapter (06-ssl-configuration.html)",
                    "active_form": "Working on: Create the final chapter (06-ssl-configuration.html)",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todo_items)

            # Sanity-check the helper directly before going through the runner.
            assert tool_batches_should_prioritize_missing_artifact(
                dod=dod,
                next_pending=dod.pending_items[0],
                missing_artifact=(missing_chapter, False),
                project_root=temp_dir,
            )

            duplicate_read = ToolCall(
                id="dup-read",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            runner._queue_duplicate_observation_nudge(duplicate_read, dod=dod)  # type: ignore[attr-defined]

            assert persistent_messages
            nudge = persistent_messages[-1]
            assert "06-ssl-configuration.html" in nudge
            assert "Do not switch into review or consistency-check mode" in nudge
            unwanted = (
                "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
            )
            assert unwanted not in nudge
      
        2817
        
        2818
        
        2819
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """Once every planned file exists, the runner steers toward verification.

            All files named in the implementation plan are already on disk when a
            successful ``write`` of the second chapter is executed. The persistent
            steering messages must state that all planned artifacts exist, mention
            the remaining review todo, and tell the agent to move to verification.
            """

            # Both reasoning hooks fail loudly if the runner ever invokes them.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Every artifact named in the plan is present on disk.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')
            chapter_one.write_text("<h1>One</h1>\n")
            chapter_two.write_text("<h1>Two</h1>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels; only the persistent one is asserted on.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todo_items = [
                {
                    "content": "Create the guide files",
                    "active_form": "Working on: Create the guide files",
                    "status": "completed",
                },
                {
                    "content": "Ensure all files are properly linked and formatted consistently",
                    "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todo_items)

            final_write = ToolCall(
                id="write-final",
                name="write",
                arguments={
                    "file_path": str(chapter_two),
                    "content": "<h1>Two</h1>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=final_write,
                output=f"Successfully wrote {chapter_two}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[final_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Each fragment must appear in at least one persistent steering message.
            expected_fragments = (
                "All explicitly planned artifacts now exist.",
                "Ensure all files are properly linked and formatted consistently",
                "Move to verification once no specific mismatch remains.",
            )
            for fragment in expected_fragments:
                assert any(fragment in queued for queued in persistent_messages)
      
        2939
        
        2940
        
        2941
        @pytest.mark.asyncio
        async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """After writing the index, the ephemeral nudge names the next missing file.

            The plan lists the index plus two chapters, and only the index exists.
            Executing a successful ``write`` of the index must queue nothing on the
            persistent channel. The last ephemeral message must point at
            ``01-getting-started.html`` without asking for a ``TodoWrite`` refresh.
            """

            # Both reasoning hooks fail loudly if the runner ever invokes them.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # Only the index file is written; both chapter files stay absent.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture the two steering channels separately so each can be checked.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todo_items = [
                {
                    "content": "Create the main index.html file with proper structure",
                    "active_form": "Working on: Create the main index.html file with proper structure",
                    "status": "pending",
                },
                {
                    "content": "Create each chapter file in sequence, following the established pattern",
                    "active_form": "Working on: Create each chapter file in sequence, following the established pattern",
                    "status": "pending",
                },
                {
                    "content": "Ensure all files are properly linked and formatted consistently",
                    "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todo_items)

            index_write = ToolCall(
                id="write-index",
                name="write",
                arguments={"file_path": str(index_path), "content": "<html></html>\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=index_write,
                output=f"Successfully wrote {index_path}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[index_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Nothing persistent is queued; the hand-off lives on the ephemeral channel.
            assert not persistent_messages
            assert ephemeral_messages
            handoff = ephemeral_messages[-1]
            assert "Next step: create `01-getting-started.html`." in handoff
            assert "refresh `TodoWrite`" not in handoff
            assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
      
        3049
        
        3050
        
        3051
        @pytest.mark.asyncio
        async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
            temp_dir: Path,
        ) -> None:
            """A partly finished seven-chapter plan must not be reported as complete.

            Chapters one through five exist on disk and chapters six and seven do
            not. After a successful ``write`` of chapter five, some ephemeral
            message must point at ``06-performance-tuning.html``, and none may
            claim that all planned artifacts exist.
            """

            # Both reasoning hooks fail loudly if the runner ever invokes them.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")

            chapter_names = (
                "01-getting-started.html",
                "02-installation.html",
                "03-first-website.html",
                "04-configuration-basics.html",
                "05-advanced-configurations.html",
                "06-performance-tuning.html",
                "07-security-best-practices.html",
            )
            chapter_paths = [chapters / name for name in chapter_names]
            # Chapters 1-4 get a heading derived from the file stem; chapter 5 gets
            # the same content that the scripted write call carries below.
            for existing in chapter_paths[:4]:
                existing.write_text(f"<h1>{existing.stem}</h1>\n")
            fifth_chapter = chapter_paths[4]
            fifth_chapter.write_text("<h1>Advanced configurations</h1>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
            ]
            plan_lines.extend(f"- `{path}`" for path in chapter_paths)
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels; only the ephemeral one is asserted on.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            todo_items = [
                {
                    "content": "Create the nginx guide artifacts",
                    "active_form": "Creating nginx guide artifacts",
                    "status": "pending",
                },
                {
                    "content": "Verify all guide files are linked and complete",
                    "active_form": "Verifying guide linkage and completeness",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todo_items)

            chapter_write = ToolCall(
                id="write-chapter-05",
                name="write",
                arguments={
                    "file_path": str(fifth_chapter),
                    "content": "<h1>Advanced configurations</h1>\n",
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=chapter_write,
                output=f"Successfully wrote {fifth_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[chapter_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # The next missing chapter is named, and completion is never claimed.
            assert any(
                "Next step: create `06-performance-tuning.html`." in queued
                for queued in ephemeral_messages
            )
            assert all(
                "All explicitly planned artifacts now exist." not in queued
                for queued in ephemeral_messages
            )
      
        3177
        
        3178
        
        3179
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
            temp_dir: Path,
        ) -> None:
            """Expect the compact missing-artifact nudge once most planned files exist.

            The plan names an index page and five chapters; the index and the first
            four chapters are on disk and only chapter 05 is missing. After a faked
            successful write to chapter 04, the newest ephemeral steering message
            must name chapter 05 as the next step, tell the model to skip rereading
            and bookkeeping, and must not mention refreshing `TodoWrite`.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # --- Filesystem fixture: guide tree with every artifact except chapter 05.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_paths = [
                chapters / filename
                for filename in (
                    "01-introduction.html",
                    "02-installation.html",
                    "03-configuration.html",
                    "04-basic-usage.html",
                    "05-advanced-features.html",
                )
            ]
            already_written = [index_path, *chapter_paths[:4]]
            for artifact in already_written:
                artifact.write_text("<html></html>\n")

            # --- Implementation plan listing both directories and all six files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{guide_root}/`", f"- `{chapters}/`", f"- `{index_path}`"]
            plan_lines.extend(f"- `{chapter}`" for chapter in chapter_paths)
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            # --- Runtime wiring: capture both steering channels in plain lists.
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # --- Definition of done reflecting the progress made so far.
            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.extend(map(str, already_written))
            dod.completed_items.extend(
                [
                    "Create the nginx directory structure",
                    "Create the main index.html file with proper structure",
                ]
            )
            remaining_todo = {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Creating each chapter file with appropriate content",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(dod, [remaining_todo])

            # --- The batch under test: one write that targets the existing chapter 04.
            rewritten_chapter = chapter_paths[3]
            tool_call = ToolCall(
                id="write-chapter-04",
                name="write",
                arguments={
                    "file_path": str(rewritten_chapter),
                    "content": "<html>updated</html>\n",
                },
            )
            canned_outcome = tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {rewritten_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([canned_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert ephemeral_messages
            nudge = ephemeral_messages[-1]
            for expected_fragment in (
                "Next step: create `05-advanced-features.html`.",
                "Do not reread reference material or spend the next turn on bookkeeping.",
            ):
                assert expected_fragment in nudge
            assert "refresh `TodoWrite`" not in nudge
      
        3299
        
        3300
        
        3301
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
            temp_dir: Path,
        ) -> None:
            """Expect a persistent resume instruction after a TodoWrite-only batch.

            The plan names two chapters but only chapter 01 is on disk. The batch
            consists of a single faked `TodoWrite` call. The newest persistent
            steering message must say an artifact is still missing, name
            `02-installation.html` as the file to create, and warn against spending
            the next turn on TodoWrite alone; nothing may reach the ephemeral
            channel.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            def planned_todos() -> list[dict[str, str]]:
                # Built fresh per call so each consumer receives its own dicts.
                return [
                    {
                        "content": f"Create {filename}",
                        "active_form": f"Creating {filename}",
                        "status": status,
                    }
                    for filename, status in (
                        ("01-getting-started.html", "completed"),
                        ("02-installation.html", "pending"),
                    )
                ]

            # --- Filesystem fixture: index and chapter 01 exist, chapter 02 does not.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            chapter_one.write_text("<h1>One</h1>\n")

            # --- Implementation plan declaring both chapters as outputs.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(
                f"- `{entry}`"
                for entry in (
                    f"{guide_root}/",
                    f"{chapters}/",
                    index_path,
                    chapter_one,
                    chapter_two,
                )
            )
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            # --- Runtime wiring: capture both steering channels in plain lists.
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # --- Definition of done: todos are synced first, then touched files recorded.
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            sync_todos_to_definition_of_done(dod, planned_todos())
            dod.touched_files.extend([str(index_path), str(chapter_one)])

            # --- The batch under test: a lone TodoWrite repeating the same todo list.
            tool_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": planned_todos()},
            )
            canned_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": planned_todos()},
            )
            executor = FakeExecutor([canned_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages
            resume_message = persistent_messages[-1]
            for expected_fragment in (
                "Todo tracking is updated. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
                "refresh `TodoWrite`",
                "Do not spend the next turn on TodoWrite alone",
            ):
                assert expected_fragment in resume_message
            assert ephemeral_messages == []
      
        3443
        
        3444
        
        3445
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Expect a verification handoff when a TodoWrite lands after all files exist.

            Every file named in the plan is already on disk and the definition of
            done carries one verification command. After a single faked `TodoWrite`
            call, the newest queued steering message must state that all planned
            artifacts exist, quote the verification todo, point towards
            verification, and must not echo the stale "Fortran guide structure"
            research todo.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Stub: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Stub: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            todo_titles = (
                "First, examine the existing Fortran guide structure to understand the format and content organization",
                "Verify all guide files are linked and complete",
            )

            def pending_todos() -> list[dict[str, str]]:
                # Built fresh per call so each consumer receives its own dicts.
                return [
                    {
                        "content": title,
                        "active_form": f"Working on: {title}",
                        "status": "pending",
                    }
                    for title in todo_titles
                ]

            # --- Filesystem fixture: every planned artifact already exists.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, markup in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                artifact.write_text(markup)

            # --- Implementation plan declaring the index and both chapters.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(
                f"- `{entry}`"
                for entry in (
                    f"{guide_root}/",
                    f"{chapters}/",
                    index_path,
                    chapter_one,
                    chapter_two,
                )
            )
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            # --- Runtime wiring: only the persistent steering channel is captured.
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # --- Definition of done with a verification command and two pending todos.
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(
                dod,
                pending_todos(),
                project_root=temp_dir,
            )

            # --- The batch under test: a lone TodoWrite repeating the same todo list.
            tool_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": pending_todos()},
            )
            canned_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": pending_todos()},
            )
            executor = FakeExecutor([canned_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            handoff = queued_messages[-1]
            for expected_fragment in (
                "Todo tracking is updated. All explicitly planned artifacts now exist.",
                "Verify all guide files are linked and complete",
                "Move to verification once no specific mismatch remains.",
                "reopen reference materials",
            ):
                assert expected_fragment in handoff
            assert "Fortran guide structure" not in handoff
      
        3587
        
        3588
        
        3589
        @pytest.mark.asyncio
      
        3590
        async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
      
        3591
            temp_dir: Path,
      
        3592
        ) -> None:
      
        3593
            async def assess_confidence(
      
        3594
                tool_name: str,
      
        3595
                tool_args: dict,
      
        3596
                context: str,
      
        3597
            ) -> ConfidenceAssessment:
      
        3598
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        3599
        
        3600
            async def verify_action(
      
        3601
                tool_name: str,
      
        3602
                tool_args: dict,
      
        3603
                result: str,
      
        3604
                expected: str = "",
      
        3605
            ) -> ActionVerification:
      
        3606
                raise AssertionError("Verification should not run in this scenario")
      
        3607
        
        3608
            guide_root = temp_dir / "guides" / "nginx"
      
        3609
            chapters = guide_root / "chapters"
      
        3610
            guide_root.mkdir(parents=True)
      
        3611
            chapters.mkdir()
      
        3612
            index_path = guide_root / "index.html"
      
        3613
            index_path.write_text(
      
        3614
                "\n".join(
      
        3615
                    [
      
        3616
                        "<!DOCTYPE html>",
      
        3617
                        "<html>",
      
        3618
                        "<body>",
      
        3619
                        '<a href="chapters/01-introduction.html">Introduction</a>',
      
        3620
                        "</body>",
      
        3621
                        "</html>",
      
        3622
                        "",
      
        3623
                    ]
      
        3624
                )
      
        3625
            )
      
        3626
        
        3627
            implementation_plan = temp_dir / "implementation.md"
      
        3628
            implementation_plan.write_text(
      
        3629
                "\n".join(
      
        3630
                    [
      
        3631
                        "# Implementation Plan",
      
        3632
                        "",
      
        3633
                        "## File Changes",
      
        3634
                        f"- `{guide_root}/`",
      
        3635
                        f"- `{chapters}/`",
      
        3636
                        f"- `{index_path}`",
      
        3637
                        "",
      
        3638
                    ]
      
        3639
                )
      
        3640
            )
      
        3641
        
        3642
            context = build_context(
      
        3643
                temp_dir=temp_dir,
      
        3644
                messages=[],
      
        3645
                safeguards=FakeSafeguards(),
      
        3646
                assess_confidence=assess_confidence,
      
        3647
                verify_action=verify_action,
      
        3648
                auto_recover=False,
      
        3649
            )
      
        3650
            queued_messages: list[str] = []
      
        3651
            context.queue_steering_message_callback = queued_messages.append
      
        3652
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        3653
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        3654
            dod.implementation_plan = str(implementation_plan)
      
        3655
            dod.touched_files.append(str(index_path))
      
        3656
            sync_todos_to_definition_of_done(
      
        3657
                dod,
      
        3658
                [
      
        3659
                    {
      
        3660
                        "content": "Examine the existing Fortran guide structure",
      
        3661
                        "active_form": "Examining the existing Fortran guide structure",
      
        3662
                        "status": "completed",
      
        3663
                    },
      
        3664
                    {
      
        3665
                        "content": "Create the nginx directory structure",
      
        3666
                        "active_form": "Creating the nginx directory structure",
      
        3667
                        "status": "completed",
      
        3668
                    },
      
        3669
                    {
      
        3670
                        "content": "Write the introduction chapter",
      
        3671
                        "active_form": "Writing the introduction chapter",
      
        3672
                        "status": "pending",
      
        3673
                    },
      
        3674
                ],
      
        3675
                project_root=temp_dir,
      
        3676
            )
      
        3677
        
        3678
            tool_call = ToolCall(
      
        3679
                id="todo-next-mutation",
      
        3680
                name="TodoWrite",
      
        3681
                arguments={
      
        3682
                    "todos": [
      
        3683
                        {
      
        3684
                            "content": "Examine the existing Fortran guide structure",
      
        3685
                            "active_form": "Examining the existing Fortran guide structure",
      
        3686
                            "status": "completed",
      
        3687
                        },
      
        3688
                        {
      
        3689
                            "content": "Create the nginx directory structure",
      
        3690
                            "active_form": "Creating the nginx directory structure",
      
        3691
                            "status": "completed",
      
        3692
                        },
      
        3693
                        {
      
        3694
                            "content": "Write the introduction chapter",
      
        3695
                            "active_form": "Writing the introduction chapter",
      
        3696
                            "status": "pending",
      
        3697
                        },
      
        3698
                    ]
      
        3699
                },
      
        3700
            )
      
        3701
            executor = FakeExecutor(
      
        3702
                [
      
        3703
                    tool_outcome(
      
        3704
                        tool_call=tool_call,
      
        3705
                        output="Todos updated",
      
        3706
                        is_error=False,
      
        3707
                        metadata={
      
        3708
                            "new_todos": [
      
        3709
                                {
      
        3710
                                    "content": "Examine the existing Fortran guide structure",
      
        3711
                                    "active_form": "Examining the existing Fortran guide structure",
      
        3712
                                    "status": "completed",
      
        3713
                                },
      
        3714
                                {
      
        3715
                                    "content": "Create the nginx directory structure",
      
        3716
                                    "active_form": "Creating the nginx directory structure",
      
        3717
                                    "status": "completed",
      
        3718
                                },
      
        3719
                                {
      
        3720
                                    "content": "Write the introduction chapter",
      
        3721
                                    "active_form": "Writing the introduction chapter",
      
        3722
                                    "status": "pending",
      
        3723
                                },
      
        3724
                            ]
      
        3725
                        },
      
        3726
                    )
      
        3727
                ]
      
        3728
            )
      
        3729
        
        3730
            summary = TurnSummary(final_response="")
      
        3731
            await runner.execute_batch(
      
        3732
                tool_calls=[tool_call],
      
        3733
                tool_source="assistant",
      
        3734
                pending_tool_calls_seen=set(),
      
        3735
                emit=_noop_emit,
      
        3736
                summary=summary,
      
        3737
                dod=dod,
      
        3738
                executor=executor,  # type: ignore[arg-type]
      
        3739
                on_confirmation=None,
      
        3740
                on_user_question=None,
      
        3741
                emit_confirmation=None,
      
        3742
                consecutive_errors=0,
      
        3743
            )
      
        3744
        
        3745
            assert queued_messages
      
        3746
            message = queued_messages[-1]
      
        3747
            assert "Todo tracking is updated. A declared output artifact is still missing." in message
      
        3748
            assert "Continue with the next pending item: `Write the introduction chapter`." in message
      
        3749
            assert "Resume by creating `01-introduction.html` now." in message
      
        3750
            assert "Prefer one `write` call for `" in message
      
        3751
            assert "01-introduction.html` instead of more rereads." in message
      
        3752
            assert "Do not spend the next turn on TodoWrite alone" in message
      
        3753
        
        3754
        
        3755
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
            temp_dir: Path,
        ) -> None:
            """Check that the post-TodoWrite steering message targets ``index.html``.

            The implementation plan lists an existing but empty ``chapters/``
            directory and an ``index.html`` that has not been written. A successful
            ``TodoWrite`` call whose first pending item is the index page is run
            through ``ToolBatchRunner.execute_batch``; the last queued steering
            message must name ``index.html`` and must not mention
            ``01-introduction.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard stub: fail loudly if confidence scoring is ever reached.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard stub: fail loudly if action verification is ever reached.
                raise AssertionError("Verification should not run in this scenario")

            # (content, active_form, status) rows shared by every todo payload below.
            todo_rows = (
                (
                    "Examine the existing Fortran guide structure to understand the format and depth",
                    "Examining the existing Fortran guide structure",
                    "completed",
                ),
                (
                    "Create the new nginx guide directory structure",
                    "Creating the new nginx guide directory structure",
                    "completed",
                ),
                (
                    "Create a new index.html for the nginx guide",
                    "Creating a new index.html for the nginx guide",
                    "pending",
                ),
                (
                    "Create the first chapter for the nginx guide",
                    "Creating the first chapter for the nginx guide",
                    "pending",
                ),
            )

            def fresh_todos() -> list[dict[str, str]]:
                # Build new dict objects on every call so callers never share state.
                return [
                    {"content": content, "active_form": active_form, "status": status}
                    for content, active_form, status in todo_rows
                ]

            # On-disk layout: the chapters directory exists, index.html does not.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = nginx_root / "index.html"

            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

            steering_log: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The same list object backs both the call arguments and the outcome metadata.
            todos = fresh_todos()
            todo_call = ToolCall(
                id="todo-index-before-chapter",
                name="TodoWrite",
                arguments={"todos": todos},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todos},
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            steering = steering_log[-1]
            assert "Continue with the next pending item: `Create a new index.html for the nginx guide`." in steering
            assert "Resume by creating `index.html` now." in steering
            assert f"Prefer one `write` call for `{index_file.resolve(strict=False)}`" in steering
            assert "01-introduction.html" not in steering
      
        3892
        
        3893
        
        3894
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
            temp_dir: Path,
        ) -> None:
            """Check that the steering message names the first linked-but-missing chapter.

            ``index.html`` exists and links to ``chapters/introduction.html`` and
            ``chapters/installation.html``; neither chapter file is created. After a
            successful ``TodoWrite`` with the pending item "Write the introduction
            chapter", the last queued steering message must tell the model to create
            ``introduction.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard stub: fail loudly if confidence scoring is ever reached.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard stub: fail loudly if action verification is ever reached.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: guide root, empty chapters directory, and an index page.
            nginx_root = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = nginx_root / "index.html"
            index_lines = [
                "<html>",
                '<a href="chapters/introduction.html">Introduction</a>',
                '<a href="chapters/installation.html">Installation</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_lines) + "\n")

            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.pending_items = [
                "Write the introduction chapter",
                "Complete the requested work",
            ]
            dod.touched_files.append(str(index_file))

            steering_log: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The call arguments use the camelCase ``activeForm`` key while the
            # scripted outcome metadata uses snake_case ``active_form``.
            requested_todo = {
                "content": "Write the introduction chapter",
                "activeForm": "Writing the introduction chapter",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            }
            todo_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            steering = steering_log[-1]
            assert "Todo tracking is updated. A declared output artifact is still missing." in steering
            assert "Continue with the next pending item: `Write the introduction chapter`." in steering
            assert "Resume by creating `introduction.html` now." in steering
            assert "Prefer one `write` call for `" in steering
            assert "introduction.html` instead of more rereads." in steering
            assert "Do not spend the next turn on TodoWrite alone" in steering
      
        4020
        
        4021
        
        4022
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """Check that the steering message names the next chapter once earlier ones exist.

            ``index.html`` links to ``chapters/01-introduction.html`` (written to
            disk) and ``chapters/02-installation.html`` (not written). After a
            successful ``TodoWrite`` with the pending item "Creating Chapter 2:
            Installation and Setup", the last queued steering message must point at
            the resolved path of ``02-installation.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Guard stub: fail loudly if confidence scoring is ever reached.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Guard stub: fail loudly if action verification is ever reached.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: index page plus chapter one; chapter two is absent.
            nginx_root = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = nginx_root / "index.html"
            first_chapter = chapters_dir / "01-introduction.html"
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_lines) + "\n")
            first_chapter.write_text("<html></html>\n")

            plan_file = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file.write_text("\n".join(plan_lines))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            dod.pending_items = [
                "Creating Chapter 2: Installation and Setup",
                "Complete the requested work",
            ]
            dod.touched_files.extend([str(index_file), str(first_chapter)])

            steering_log: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The call arguments use the camelCase ``activeForm`` key while the
            # scripted outcome metadata uses snake_case ``active_form``.
            requested_todo = {
                "content": "Creating Chapter 2: Installation and Setup",
                "activeForm": "Creating Chapter 2: Installation and Setup",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Creating Chapter 2: Installation and Setup",
                "active_form": "Creating Chapter 2: Installation and Setup",
                "status": "pending",
            }
            todo_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            scripted_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([scripted_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            steering = steering_log[-1]
            assert "Todo tracking is updated. A declared output artifact is still missing." in steering
            assert "Continue with the next pending item: `Creating Chapter 2: Installation and Setup`." in steering
            assert "Resume by creating `02-installation.html` now." in steering
            expected_write_hint = (
                f"Prefer one `write` call for `{(chapters_dir / '02-installation.html').resolve(strict=False)}` "
                "instead of more rereads."
            )
            assert expected_write_hint in steering
            assert "Make your next response the concrete mutation tool call itself" in steering
      
        4153
        
        4154
        
        4155
        @pytest.mark.asyncio
      
        4156
        async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
      
        4157
            temp_dir: Path,
      
        4158
        ) -> None:
      
        4159
            async def assess_confidence(
      
        4160
                tool_name: str,
      
        4161
                tool_args: dict,
      
        4162
                context: str,
      
        4163
            ) -> ConfidenceAssessment:
      
        4164
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        4165
        
        4166
            async def verify_action(
      
        4167
                tool_name: str,
      
        4168
                tool_args: dict,
      
        4169
                result: str,
      
        4170
                expected: str = "",
      
        4171
            ) -> ActionVerification:
      
        4172
                raise AssertionError("Verification should not run in this scenario")
      
        4173
        
        4174
            reference_chapters = temp_dir / "fortran" / "chapters"
      
        4175
            reference_chapters.mkdir(parents=True)
      
        4176
            (reference_chapters / "01-introduction.html").write_text("<h1>Introduction</h1>\n")
      
        4177
        
        4178
            guide_root = temp_dir / "guides" / "nginx"
      
        4179
            chapters = guide_root / "chapters"
      
        4180
            guide_root.mkdir(parents=True)
      
        4181
            chapters.mkdir()
      
        4182
            index_path = guide_root / "index.html"
      
        4183
            index_path.write_text("<html></html>\n")
      
        4184
        
        4185
            implementation_plan = temp_dir / "implementation.md"
      
        4186
            implementation_plan.write_text(
      
        4187
                "\n".join(
      
        4188
                    [
      
        4189
                        "# Implementation Plan",
      
        4190
                        "",
      
        4191
                        "## File Changes",
      
        4192
                        f"- `{guide_root}/`",
      
        4193
                        f"- `{chapters}/`",
      
        4194
                        f"- `{index_path}`",
      
        4195
                        "",
      
        4196
                    ]
      
        4197
                )
      
        4198
            )
      
        4199
        
        4200
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        4201
            dod.implementation_plan = str(implementation_plan)
      
        4202
            dod.pending_items = [
      
        4203
                "Write the introduction chapter",
      
        4204
                "Complete the requested work",
      
        4205
            ]
      
        4206
            dod.touched_files.append(str(index_path))
      
        4207
        
        4208
            queued_messages: list[str] = []
      
        4209
            context = build_context(
      
        4210
                temp_dir=temp_dir,
      
        4211
                messages=[
      
        4212
                    Message(
      
        4213
                        role=Role.ASSISTANT,
      
        4214
                        content="",
      
        4215
                        tool_calls=[
      
        4216
                            ToolCall(
      
        4217
                                id="read-ref-1",
      
        4218
                                name="read",
      
        4219
                                arguments={"file_path": str(reference_chapters / "01-introduction.html")},
      
        4220
                            )
      
        4221
                        ],
      
        4222
                    )
      
        4223
                ],
      
        4224
                safeguards=FakeSafeguards(),
      
        4225
                assess_confidence=assess_confidence,
      
        4226
                verify_action=verify_action,
      
        4227
                auto_recover=False,
      
        4228
            )
      
        4229
            context.queue_steering_message_callback = queued_messages.append
      
        4230
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        4231
        
        4232
            tool_call = ToolCall(
      
        4233
                id="todo-observed-1",
      
        4234
                name="TodoWrite",
      
        4235
                arguments={
      
        4236
                    "todos": [
      
        4237
                        {
      
        4238
                            "content": "Write the introduction chapter",
      
        4239
                            "activeForm": "Writing the introduction chapter",
      
        4240
                            "status": "pending",
      
        4241
                        }
      
        4242
                    ]
      
        4243
                },
      
        4244
            )
      
        4245
            executor = FakeExecutor(
      
        4246
                [
      
        4247
                    tool_outcome(
      
        4248
                        tool_call=tool_call,
      
        4249
                        output="Todos updated",
      
        4250
                        is_error=False,
      
        4251
                        metadata={
      
        4252
                            "new_todos": [
      
        4253
                                {
      
        4254
                                    "content": "Write the introduction chapter",
      
        4255
                                    "active_form": "Writing the introduction chapter",
      
        4256
                                    "status": "pending",
      
        4257
                                }
      
        4258
                            ]
      
        4259
                        },
      
        4260
                    )
      
        4261
                ]
      
        4262
            )
      
        4263
        
        4264
            summary = TurnSummary(final_response="")
      
        4265
            await runner.execute_batch(
      
        4266
                tool_calls=[tool_call],
      
        4267
                tool_source="assistant",
      
        4268
                pending_tool_calls_seen=set(),
      
        4269
                emit=_noop_emit,
      
        4270
                summary=summary,
      
        4271
                dod=dod,
      
        4272
                executor=executor,  # type: ignore[arg-type]
      
        4273
                on_confirmation=None,
      
        4274
                on_user_question=None,
      
        4275
                emit_confirmation=None,
      
        4276
                consecutive_errors=0,
      
        4277
            )
      
        4278
        
        4279
            assert queued_messages
      
        4280
            message = queued_messages[-1]
      
        4281
            assert "Todo tracking is updated. A declared output artifact is still missing." in message
      
        4282
            assert "Continue with the next pending item: `Write the introduction chapter`." in message
      
        4283
            assert "Resume by creating `01-introduction.html` now." in message
      
        4284
            assert (
      
        4285
                "It mirrors the observed filename pattern from another `chapters/` directory "
      
        4286
                "you already inspected."
      
        4287
                in message
      
        4288
            )
      
        4289
            assert "01-introduction.html` instead of more rereads." in message
      
        4290
        
        4291
        
        4292
        @pytest.mark.asyncio
        async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
            temp_dir: Path,
        ) -> None:
            """A working note recorded while a planned chapter is absent queues a resume nudge.

            The implementation plan lists two chapter files but only the first one is
            written to disk, so the queued steering message is expected to name
            ``02-installation.html`` as the file to create next.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is invoked.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: the index and chapter one exist, chapter two is only planned.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapter_dir.mkdir()
            index_file = nginx_dir / "index.html"
            present_chapter = chapter_dir / "01-getting-started.html"
            absent_chapter = chapter_dir / "02-installation.html"
            index_file.write_text("<html></html>\n")
            present_chapter.write_text("<h1>One</h1>\n")

            # Plan document: directories carry a trailing slash, files do not.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{directory}/`" for directory in (nginx_dir, chapter_dir)]
            plan_lines += [
                f"- `{artifact}`"
                for artifact in (index_file, present_chapter, absent_chapter)
            ]
            plan_lines.append("")
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            todo_snapshot = [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "completed",
                },
                {
                    "content": "Create 02-installation.html",
                    "active_form": "Creating 02-installation.html",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todo_snapshot, project_root=temp_dir)
            dod.touched_files.extend([str(index_file), str(present_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Creating the second chapter file: Installation"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([note_outcome])

            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            expected_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
                "Make your next response the concrete mutation tool call itself",
                "refresh `TodoWrite`",
                "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
            )
            for fragment in expected_fragments:
                assert fragment in latest
      
        4406
        
        4407
        
        4408
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
            temp_dir: Path,
        ) -> None:
            """A working note keeps the discovery-first pending item ahead of file creation.

            The first pending item asks to examine an existing guide, so the queued
            steering message is expected to point at that item and must not tell the
            agent to create ``index.html`` yet.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is invoked.
                raise AssertionError("Verification should not run in this scenario")

            # Only the plan file is written; none of the declared outputs exist on disk.
            nginx_dir = temp_dir / "guides" / "nginx"
            plan_text = "\n".join(
                (
                    "# Implementation Plan",
                    "",
                    "## File Changes",
                    f"- `{nginx_dir / 'index.html'}`",
                    f"- `{nginx_dir / 'chapters'}`",
                    "",
                )
            )
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(plan_text)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            outstanding_steps = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create the nginx directory structure",
                "Develop the main index.html file for the nginx guide",
            ]
            dod.pending_items.extend(outstanding_steps)

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([note_outcome])

            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            required_fragments = (
                "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`.",
                "one concrete evidence-gathering tool call",
            )
            for fragment in required_fragments:
                assert fragment in latest
            assert "Resume by creating `index.html` now." not in latest
      
        4500
        
        4501
        
        4502
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
            temp_dir: Path,
        ) -> None:
            """A working note steers to the next missing chapter instead of a stale discovery item.

            The index file links three chapters but only the first exists on disk, and
            both the index and that chapter are already recorded as touched. The queued
            steering message is expected to ask for ``02-installation.html`` and must
            not fall back to the discovery-first pending item.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is invoked.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: index with three chapter links, only chapter one written.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            chapter_dir.mkdir(parents=True)
            index_file = nginx_dir / "index.html"
            written_chapter = chapter_dir / "01-introduction.html"
            toc_links = (
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-configuration.html">Configuration</a>',
            )
            index_file.write_text("\n".join(toc_links))
            written_chapter.write_text("<h1>Introduction</h1>\n")

            # The plan declares the index file and the chapters directory only.
            plan_text = "\n".join(
                (
                    "# Implementation Plan",
                    "",
                    "## File Changes",
                    f"- `{nginx_dir / 'index.html'}`",
                    f"- `{chapter_dir}/`",
                    "",
                )
            )
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(plan_text)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            outstanding_steps = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create chapter files following the established pattern",
            ]
            dod.pending_items.extend(outstanding_steps)
            dod.touched_files.extend([str(index_file), str(written_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Created index and first chapter; next is chapter 2"},
            )
            note_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([note_outcome])

            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            latest = steering_log[-1]
            required_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
            )
            for fragment in required_fragments:
                assert fragment in latest
            assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in latest
      
        4607
        
        4608
        
        4609
        @pytest.mark.asyncio
        async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
            temp_dir: Path,
        ) -> None:
            """A glob whose output lists only directories queues no steering message.

            The first pending item is a discovery step, and the glob result contains
            just the reference guide root and its ``chapters`` directory, so nothing is
            expected on the steering queue after the batch runs.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if confidence scoring is invoked.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if action verification is invoked.
                raise AssertionError("Verification should not run in this scenario")

            # Reference guide exists only as empty directories; the nginx guide is planned.
            guides_dir = temp_dir / "Loader" / "guides"
            reference_root = guides_dir / "fortran"
            reference_chapters = reference_root / "chapters"
            reference_chapters.mkdir(parents=True)

            plan_text = "\n".join(
                (
                    "# Implementation Plan",
                    "",
                    "## File Changes",
                    f"- `{guides_dir / 'nginx' / 'index.html'}`",
                    f"- `{guides_dir / 'nginx' / 'chapters'}`",
                    "",
                )
            )
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text(plan_text)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            context.queue_steering_message_callback = steering_log.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            outstanding_steps = [
                "First, examine the existing fortran guide structure and content",
                "Create the nginx directory structure",
                "Develop the main index.html file for nginx guide",
            ]
            dod.pending_items.extend(outstanding_steps)

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"pattern": "**", "path": str(reference_root)},
            )
            glob_outcome = tool_outcome(
                tool_call=glob_call,
                output=f"{reference_root}\n{reference_chapters}",
                is_error=False,
            )
            executor = FakeExecutor([glob_outcome])

            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not steering_log
      
        4698
        
        4699
        
        4700
        @pytest.mark.asyncio
      
        4701
        async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
      
        4702
            temp_dir: Path,
      
        4703
        ) -> None:
      
        4704
            async def assess_confidence(
      
        4705
                tool_name: str,
      
        4706
                tool_args: dict,
      
        4707
                context: str,
      
        4708
            ) -> ConfidenceAssessment:
      
        4709
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        4710
        
        4711
            async def verify_action(
      
        4712
                tool_name: str,
      
        4713
                tool_args: dict,
      
        4714
                result: str,
      
        4715
                expected: str = "",
      
        4716
            ) -> ActionVerification:
      
        4717
                raise AssertionError("Verification should not run in this scenario")
      
        4718
        
        4719
            prompt = (
      
        4720
                "Have a look at ~/Loader/guides/fortran/index.html, then "
      
        4721
                "~/Loader/guides/fortran/chapters. The table of contents links in "
      
        4722
                "index.html are inaccurate and the href’s are wrong. Let’s update the "
      
        4723
                "links and their link texts to be correct."
      
        4724
            )
      
        4725
            chapters = temp_dir / "chapters"
      
        4726
            chapters.mkdir()
      
        4727
            (chapters / "01-introduction.html").write_text(
      
        4728
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
      
        4729
            )
      
        4730
            (chapters / "02-setup.html").write_text(
      
        4731
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
      
        4732
            )
      
        4733
            current_block = (
      
        4734
                "<h2>Table of Contents</h2>\n"
      
        4735
                '        <ul class="chapter-list">\n'
      
        4736
                '            <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
      
        4737
                '            <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
      
        4738
                "        </ul>\n"
      
        4739
            )
      
        4740
            index_path = temp_dir / "index.html"
      
        4741
            index_path.write_text(current_block)
      
        4742
        
        4743
            context = build_context(
      
        4744
                temp_dir=temp_dir,
      
        4745
                messages=[],
      
        4746
                safeguards=FakeSafeguards(),
      
        4747
                assess_confidence=assess_confidence,
      
        4748
                verify_action=verify_action,
      
        4749
                auto_recover=False,
      
        4750
            )
      
        4751
            context.session.current_task = prompt  # type: ignore[attr-defined]
      
        4752
            queued_messages: list[str] = []
      
        4753
            context.queue_steering_message_callback = queued_messages.append
      
        4754
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        4755
            tool_call = ToolCall(
      
        4756
                id="edit-1",
      
        4757
                name="edit",
      
        4758
                arguments={
      
        4759
                    "file_path": str(index_path),
      
        4760
                    "old_string": current_block,
      
        4761
                    "new_string": current_block,
      
        4762
                },
      
        4763
            )
      
        4764
            executor = FakeExecutor(
      
        4765
                [
      
        4766
                    tool_outcome(
      
        4767
                        tool_call=tool_call,
      
        4768
                        output=(
      
        4769
                            "[Blocked - old_string and new_string are identical - no change "
      
        4770
                            "would occur] Suggestion: Provide different old and new strings"
      
        4771
                        ),
      
        4772
                        is_error=True,
      
        4773
                        state=ToolExecutionState.BLOCKED,
      
        4774
                    )
      
        4775
                ]
      
        4776
            )
      
        4777
        
        4778
            await runner.execute_batch(
      
        4779
                tool_calls=[tool_call],
      
        4780
                tool_source="assistant",
      
        4781
                pending_tool_calls_seen=set(),
      
        4782
                emit=_noop_emit,
      
        4783
                summary=TurnSummary(final_response=""),
      
        4784
                dod=create_definition_of_done(prompt),
      
        4785
                executor=executor,  # type: ignore[arg-type]
      
        4786
                on_confirmation=None,
      
        4787
                on_user_question=None,
      
        4788
                emit_confirmation=None,
      
        4789
                consecutive_errors=0,
      
        4790
            )
      
        4791
        
        4792
            assert queued_messages == []
      
        4793
        
        4794
        
        4795
        def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
            temp_dir: Path,
        ) -> None:
            """Blocked no-op edit on the repair target queues a nudge that names it.

            The context carries a "Repair focus" assistant message pointing at a
            chapter file. The runner is then told that an identical-string edit to
            that same file was blocked, and the queued steering text is inspected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises when awaited, so any use of confidence scoring fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises when awaited, so any use of verification fails the test.
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
            repair_focus = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '05-advanced-topics.html'}`; otherwise remove or replace `05-advanced-topics.html`.\n"
                ),
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[repair_focus],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )

            # Capture every steering message the runner queues.
            steering_messages: list[str] = []
            context.queue_steering_message_callback = steering_messages.append
            store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, store)

            # An edit whose old and new strings are the same text.
            blocked_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(repair_target),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
            runner._queue_blocked_html_edit_nudge(blocked_edit, blocked_output)

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert "no on-disk change" in nudge
            assert "replace the surrounding block" in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        4853
        
        4854
        
        4855
        async def _noop_emit(event: AgentEvent) -> None:
            """Accept an agent event and discard it; used where no event capture is needed."""
            return
      
        4857
        
        4858
        
        4859
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful `write` call leaves the definition of done with a planned verification.

            Runs a single-call batch through the runner and then checks both the
            definition-of-done fields and the last workflow timeline entry.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises when awaited, so any use of confidence scoring fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises when awaited, so any use of verification fails the test.
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, store)

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output="wrote file",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Update README and verify it still works.")
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                # Record every emitted event in order.
                events.append(event)

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done itself.
            assert dod.last_verification_result == "planned"
            assert dod.verification_commands
            assert "Collect verification evidence" in dod.pending_items
            assert dod.active_verification_attempt_id == "verification-attempt-1"
            assert dod.active_verification_attempt_number == 1

            # The most recent timeline entry and its first observation.
            latest_entry = summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_planned"
            assert latest_entry.policy_outcome == "planned"
            observation = latest_entry.verification_observations[0]
            assert observation.status == "planned"
            assert observation.attempt_id == "verification-attempt-1"
            assert observation.attempt_number == 1
      
        4930
        
        4931
        
        4932
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
            temp_dir: Path,
        ) -> None:
            """A lone `mkdir -p` bash call must not put the definition of done into planned verification.

            An implementation plan file listing the chapters directory and an index
            page is written first and attached to the definition of done; the batch
            then contains only the directory-creation command.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises when awaited, so any use of confidence scoring fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises when awaited, so any use of verification fails the test.
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, store)

            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            mkdir_call = ToolCall(
                id="mkdir-1",
                name="bash",
                arguments={"command": f"mkdir -p {chapters}"},
            )
            scripted_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
            executor = FakeExecutor([scripted_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
            dod.implementation_plan = str(implementation_plan)
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                # Record every emitted event in order.
                events.append(event)

            await runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert dod.last_verification_result is None
            assert "Collect verification evidence" not in dod.pending_items
            planned_entries = [
                entry
                for entry in summary.workflow_timeline
                if entry.reason_code == "verification_planned"
            ]
            assert not planned_entries
      
        5010
        
        5011
        
        5012
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A new `write` after a passed verification flips that verification to stale.

            The definition of done is pre-seeded with a passed attempt (command,
            evidence, completed item). After the batch runs, the test checks that the
            evidence is cleared, a second attempt is active, and the last timeline
            entry describes the first attempt as stale.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises when awaited, so any use of confidence scoring fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises when awaited, so any use of verification fails the test.
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, store)

            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
            )
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output="wrote file",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])
            summary = TurnSummary(final_response="")

            # Seed the definition of done as if attempt 1 had already passed.
            verification_command = "uv run pytest -q"
            dod = create_definition_of_done("Update README and verify it still works.")
            dod.verification_commands = [verification_command]
            dod.last_verification_result = "passed"
            dod.verification_attempt_counter = 1
            dod.active_verification_attempt_id = "verification-attempt-1"
            dod.active_verification_attempt_number = 1
            passed_evidence = VerificationEvidence(
                command=verification_command,
                passed=True,
                stdout="401 passed",
                kind="test",
            )
            dod.evidence = [passed_evidence]
            dod.completed_items.append("Collect verification evidence")
            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                # Record every emitted event in order.
                events.append(event)

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # State recorded on the definition of done itself.
            assert dod.last_verification_result == "stale"
            assert dod.evidence == []
            assert "Collect verification evidence" in dod.pending_items
            assert "Collect verification evidence" not in dod.completed_items
            assert dod.active_verification_attempt_id == "verification-attempt-2"
            assert dod.active_verification_attempt_number == 2

            # The most recent timeline entry and its first observation.
            latest_entry = summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_stale"
            assert latest_entry.policy_outcome == "stale"
            observation = latest_entry.verification_observations[0]
            assert observation.status == "stale"
            assert observation.attempt_id == "verification-attempt-1"
            assert observation.attempt_number == 1
            assert observation.supersedes_attempt_id == "verification-attempt-2"
            assert observation.command == verification_command
      
        5106
        
        5107
        
        5108
        def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
            """The active-repair nudge mentions both the repair target and the missing file.

            The context carries a "Repair focus" assistant message about a broken
            chapter link in the guide index; the runner is then handed a blocked
            active-repair-scope message and the queued steering text is inspected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises when awaited, so any use of confidence scoring fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises when awaited, so any use of verification fails the test.
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "index.html"
            repair_focus = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '01-getting-started.html'}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
                ),
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[repair_focus],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )

            # Capture every steering message the runner queues.
            steering_messages: list[str] = []
            context.queue_steering_message_callback = steering_messages.append
            store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, store)

            runner._queue_blocked_active_repair_nudge(
                "[Blocked - active repair scope: verification already identified the repair target.]"
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert str(temp_dir / "guide" / "chapters" / "01-getting-started.html") in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        5154
        
        5155
        
        5156
        def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
            temp_dir: Path,
        ) -> None:
            """The active-repair mutation nudge names the repair target and the stylesheet path.

            The context carries a "Repair focus" assistant message about a broken
            `../styles.css` reference; the runner is then handed a blocked
            mutation-scope message and the queued steering text is inspected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises when awaited, so any use of confidence scoring fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises when awaited, so any use of verification fails the test.
                raise AssertionError("Verification should not run in this scenario")

            repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html"
            stylesheet = temp_dir / "guide" / "styles.css"
            repair_focus = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
                ),
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[repair_focus],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )

            # Capture every steering message the runner queues.
            steering_messages: list[str] = []
            context.queue_steering_message_callback = steering_messages.append
            store = DefinitionOfDoneStore(temp_dir)
            runner = ToolBatchRunner(context, store)

            runner._queue_blocked_active_repair_mutation_nudge(
                "[Blocked - active repair mutation scope: verification already identified the repair target.]"
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert str(stylesheet) in nudge
            assert "before widening the change set" in nudge
      
        5205
        
        5206
        
        5207
        def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """The late-reference-drift nudge mentions the one planned file not yet on disk.

            The implementation plan lists four guide files; three of them are
            created before the nudge is requested, leaving
            `03-first-website.html` absent.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises when awaited, so any use of confidence scoring fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises when awaited, so any use of verification fails the test.
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )

            # Capture every steering message the runner queues.
            steering_messages: list[str] = []
            context.queue_steering_message_callback = steering_messages.append
            store = DefinitionOfDoneStore(temp_dir)
            dod = create_definition_of_done("Create a multi-file guide from a reference")

            plan_path = temp_dir / "implementation.md"
            plan_text = (
                "# File Changes\n"
                "- `guide/index.html`\n"
                "- `guide/chapters/01-getting-started.html`\n"
                "- `guide/chapters/02-installation.html`\n"
                "- `guide/chapters/03-first-website.html`\n"
            )
            plan_path.write_text(plan_text)
            dod.implementation_plan = str(plan_path)

            # Create every planned file except the third chapter.
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True, exist_ok=True)
            existing_artifacts = (
                (guide_dir / "index.html", "index"),
                (chapters_dir / "01-getting-started.html", "one"),
                (chapters_dir / "02-installation.html", "two"),
            )
            for artifact_path, artifact_text in existing_artifacts:
                artifact_path.write_text(artifact_text)

            runner = ToolBatchRunner(context, store)
            runner._queue_blocked_late_reference_drift_nudge(
                "[Blocked - late reference drift: several planned artifacts already exist.]",
                dod=dod,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "03-first-website.html" in nudge
            assert "older reference materials" in nudge
      
        5259
        
        5260
        
        5261
        def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """Check the nudge queued for a blocked "completed artifact set scope" call.

            Every path listed in the implementation plan is created on disk before the
            nudge is requested, and a single pending verification todo is synced into
            the definition of done. The queued steering message must state that the
            planned artifacts exist, quote the pending todo, and tell the agent not to
            reopen earlier reference materials.
            """

            # Neither callback may be invoked by the code path under test.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # Lay out the guide tree so that every planned artifact already exists.
            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact.write_text(body)

            # The plan lists both directories and all three files; the trailing empty
            # entry yields the final newline.
            planned_paths = (guide_root, chapters, index_path, chapter_one, chapter_two)
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{planned}`" for planned in planned_paths)
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file guide from a reference")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            pending_todo = "Verify all guide files are linked and complete"
            todos = [
                {
                    "content": pending_todo,
                    "active_form": f"Working on: {pending_todo}",
                    "status": "pending",
                }
            ]
            sync_todos_to_definition_of_done(dod, todos, project_root=temp_dir)

            runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
                dod=dod,
            )

            assert queued
            nudge = queued[0]
            for fragment in (
                "All explicitly planned artifacts already exist.",
                pending_todo,
                "Do not reopen earlier reference materials.",
            ):
                assert fragment in nudge
      
        5341
        
        5342
        
        5343
        def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
            temp_dir: Path,
        ) -> None:
            """Check the nudge queued for a blocked HTML write with undeclared targets.

            The blocked message carries a "Closest declared local targets" hint. The
            queued steering message must mention the file the write was aimed at, quote
            the closest declared target in backticks, and contain "same file now".
            """

            # Neither callback may be invoked by the code path under test.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            # Capture steering messages instead of delivering them anywhere.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            blocked_target = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
            blocked_write = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": blocked_target},
            )
            blocked_text = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
                "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
                "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
            )

            runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_text)

            assert queued
            nudge = queued[0]
            for fragment in (
                blocked_target,
                "`chapters/02-installation.html`",
                "same file now",
            ):
                assert fragment in nudge
      
        5391
        
        5392
        
        5393
        @pytest.mark.asyncio
        async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
            temp_dir: Path,
        ) -> None:
            """Check the nudge queued when a `write` call is blocked for an empty path.

            The plan lists three files; only the index and the first chapter are
            written, so the second chapter is the one still missing. After the batch
            runs, the queued steering message must call out the invalid `file_path`
            and direct the agent to write the missing chapter, and the recovery
            context must record the blocked message as the most recent attempt's error.
            """

            # Neither callback may be invoked by the code path under test.
            async def assess_confidence(
                tool_name: str, tool_args: dict, context: str
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str, tool_args: dict, result: str, expected: str = ""
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            # chapter_two is deliberately left unwritten.
            index_path.write_text("<html></html>\n")
            chapter_one.write_text("<h1>Intro</h1>\n")

            # The trailing empty entry yields the plan file's final newline.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines += [f"- `{planned}`" for planned in (index_path, chapter_one, chapter_two)]
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture steering messages instead of delivering them anywhere.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            # A write call with an empty file_path, plus the canned BLOCKED outcome
            # the fake executor is constructed with.
            tool_call = ToolCall(
                id="write-2",
                name="write",
                arguments={"file_path": "", "content": "<html></html>\n"},
            )
            blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
            blocked_result = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=blocked_message,
                result_content=blocked_message,
                is_error=True,
            )
            blocked_outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.BLOCKED,
                message=blocked_result,
                event_content=blocked_message,
                is_error=True,
                result_output=blocked_message,
            )
            executor = FakeExecutor([blocked_outcome])

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.extend([str(index_path), str(chapter_one)])
            dod.pending_items.append("Creating Chapter 2: Installation and Setup")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued
            expected_fragments = (
                "did not provide a valid `file_path`",
                "Resume by creating `02-installation.html` now.",
                f"Prefer one `write` call for `{chapter_two}` instead of more rereads.",
            )
            for fragment in expected_fragments:
                assert fragment in queued[0]
            assert context.recovery_context is not None
            assert context.recovery_context.attempts[-1].error == blocked_message