Python · 164917 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.permissions import (
20 PermissionMode,
21 build_permission_policy,
22 load_permission_rules,
23 )
24 from loader.runtime.reasoning_types import (
25 ActionVerification,
26 ConfidenceAssessment,
27 ConfidenceLevel,
28 )
29 from loader.runtime.recovery import RecoveryContext
30 from loader.runtime.tool_batches import (
31 ToolBatchRunner,
32 )
33 from loader.runtime.tool_batches import (
34 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
35 )
36 from loader.runtime.workflow import sync_todos_to_definition_of_done
37 from loader.tools.base import ToolResult as RegistryToolResult
38 from loader.tools.base import create_default_registry
39 from tests.helpers.runtime_harness import ScriptedBackend
40
41
class FakeSession:
    """In-memory session double: collects messages and workflow-timeline entries."""

    def __init__(self, messages: list[Message]) -> None:
        # Defensive copy so callers can keep reusing their seed list safely.
        self.messages = [*messages]
        self.workflow_timeline: list = []

    def append(self, message: Message) -> None:
        """Record a conversation message."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Record a workflow timeline entry (shape is opaque to the fake)."""
        self.workflow_timeline.append(entry)
52
53
class FakeCodeFilter:
    """No-op stand-in for the safeguards code filter."""

    def reset(self) -> None:
        """Nothing to reset on the fake; implicitly returns None."""
57
58
class FakeSafeguards:
    """Pass-through safeguards stub with a configurable loop-detection result.

    Content filters return their input unchanged, steering is always off, and
    only ``detect_loop`` replays the tuple the test configured.
    """

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        # Opaque placeholders; the runner only needs the attributes to exist.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()
        self._detect_loop_result = detect_loop_result

    def filter_stream_chunk(self, content: str) -> str:
        """Pass streamed content through unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Pass completed content through unchanged."""
        return content

    def should_steer(self) -> bool:
        """Steering is never requested by the fake."""
        return False

    def get_steering_message(self) -> str | None:
        """No steering message is ever available."""
        return None

    def record_response(self, content: str) -> None:
        """Accept and discard the response."""
        return None

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Text-loop detection is always negative in tests."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Replay whatever loop-detection outcome the test configured."""
        return self._detect_loop_result
86
87
class FakeExecutor:
    """Replays pre-queued ToolExecutionOutcome objects and records every call.

    Outcomes are consumed in FIFO order; running out of them is a test bug and
    raises immediately.
    """

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        self._outcomes = list(outcomes)
        self.calls: list[ToolCall] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the call and hand back the next queued outcome."""
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
98
99
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a RuntimeContext wired with test fakes for ToolBatchRunner tests.

    The registry and permission policy are real (rooted at ``temp_dir``), while
    the session, backend, safeguards, and reasoning hooks are stand-ins supplied
    by the caller. Flags mirror the reasoning-config switches individual tests
    need to toggle.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)
    rules_status = load_permission_rules(temp_dir)
    # Reasoning knobs the tests care about; everything else stays disabled.
    reasoning_config = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=SimpleNamespace(
            force_react=False,
            max_recovery_attempts=2,
            auto_recover=auto_recover,
            reasoning=reasoning_config,
        ),
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=build_permission_policy(
            active_mode=PermissionMode.WORKSPACE_WRITE,
            workspace_root=temp_dir,
            tool_requirements=tool_registry.get_tool_requirements(),
            rules=rules_status.rules,
        ),
        permission_config_status=rules_status,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=SimpleNamespace(
            assess_confidence=assess_confidence,
            verify_action=verify_action,
        ),
        recovery_context=recovery_context,
    )
155
156
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a fully-populated ToolExecutionOutcome for a FakeExecutor queue.

    The same ``output`` string is used for the session message, the event
    payload, and the registry result so assertions can match any of them.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata or {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
183
184
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """A LOW-confidence assessment must skip execution and warn the model.

    The runner should pass recent conversation history to the confidence
    assessor, skip the tool call entirely, append a low-confidence warning as
    a user message, and emit a ``confidence`` event.
    """
    captured: dict[str, str] = {}

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Capture the assessment context so we can assert it contains history.
        captured["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    context = build_context(
        temp_dir=temp_dir,
        messages=[
            Message(role=Role.USER, content="Please inspect the project."),
            Message(role=Role.ASSISTANT, content="I will read the file next."),
        ],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    # The queued outcome must never be consumed: the confidence gate skips it.
    executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="unused", is_error=False)])
    result = await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in captured["context"]
    # The warning is injected as a trailing USER message for the next turn.
    assert context.session.messages[-1].role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in context.session.messages[-1].content
    event_types = [event.type for event in events]
    assert "confidence" in event_types
244
245
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """A failed tool call should populate a recovery context and emit a recovery event."""

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    # The queued outcome reports an error, so the runner must start recovery.
    executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="command failed", is_error=True)])
    summary = TurnSummary(final_response="")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is not None
    assert summary.tool_result_messages
    # The tool-result message must land in both the summary and the session.
    assert context.session.messages[-1] == summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in events)
289
290
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """Registry-result metadata should be forwarded on the ``tool_result`` event."""

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    # Metadata describing a background job, as the bash tool would report it.
    metadata = {
        "job_id": "bash-1",
        "status": "running",
        "background": True,
    }
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Started bash job bash-1",
                is_error=False,
                metadata=metadata,
            )
        ]
    )
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The registry result's metadata rides along on the emitted event unchanged.
    tool_result = next(event for event in events if event.type == "tool_result")
    assert tool_result.tool_metadata == metadata
349
350
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """Verification should run via the context's reasoning service on success.

    Even when verification reports discrepancies, the successful step is
    recorded on the pre-existing recovery context, a ``verification`` event is
    emitted, and the raw tool-result message still reaches the session.
    """
    verification_calls: list[str] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        verification_calls.append(result)
        # Report a failed verification that requests a correction.
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    existing_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="file contents", is_error=False)])
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verification_calls == ["file contents"]
    # The pre-existing recovery context is preserved and updated in place.
    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [
        ("read", {"file_path": "README.md"})
    ]
    assert context.session.messages[-1].role == Role.TOOL
    assert context.session.messages[-1].content == "file contents"
    assert any(event.type == "verification" for event in events)
414
415
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """A successful read-only diagnostic must not clear an active recovery context."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Recovery is mid-flight for a failed read of a chapter file.
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt(
        "read",
        {"file_path": "chapters/04-data-types.html"},
        "File not found",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # A diagnostic (non-mutating) command run while recovering.
    tool_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "ls chapters"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="01-introduction.html", is_error=False)]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The diagnostic succeeded, but recovery stays active with the step logged.
    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [
        ("bash", {"command": "ls chapters"})
    ]
482
483
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """A successful mutating tool call (patch) should end the recovery episode."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Recovery is mid-flight for a failed read of a chapter file.
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt(
        "read",
        {"file_path": "chapters/04-data-types.html"},
        "File not found",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # A mutating call: patch rewrites a single line in index.html.
    tool_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={
            "file_path": "index.html",
            "hunks": [{"old_start": 1, "old_lines": 1, "new_start": 1, "new_lines": 1, "lines": ["-a", "+b"]}],
        },
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="Patched index.html", is_error=False)]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # A successful mutation means the original failure was worked around.
    assert context.recovery_context is None
550
551
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """A skipped duplicate read should queue one persistent steering nudge.

    With a declared-but-missing output artifact (``04-variables.html`` from the
    implementation plan), the nudge must tell the model to reuse the earlier
    observation and to resume by writing the missing file.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Conversation history: a glob observation, a completed chapter read, and
    # an assistant turn that has just read index.html once already.
    messages = [
        Message(
            role=Role.TOOL,
            content=(
                "Observation [glob]: Result: "
                f"{temp_dir}/chapters/01-introduction.html\n"
                f"{temp_dir}/chapters/02-setup.html\n"
                f"{temp_dir}/chapters/03-basics.html"
            ),
            tool_results=[],
        ),
        Message(
            role=Role.ASSISTANT,
            content="I already inspected the first chapter title.",
            tool_calls=[
                ToolCall(
                    id="read-ch1",
                    name="read",
                    arguments={"file_path": str(temp_dir / 'chapters' / '01-introduction.html')},
                )
            ],
        ),
        Message.tool_result_message(
            tool_call_id="read-ch1",
            display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
            result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
        ),
        Message(
            role=Role.ASSISTANT,
            content="I should update the index now.",
            tool_calls=[
                ToolCall(
                    id="read-index",
                    name="read",
                    arguments={"file_path": str(temp_dir / 'index.html')},
                )
            ],
        ),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Materialize the workspace the history refers to. Note 04-variables.html
    # is deliberately absent: it is the plan's missing artifact.
    (temp_dir / "chapters").mkdir()
    (temp_dir / "index.html").write_text("<ul></ul>\n")
    (temp_dir / "chapters" / "01-introduction.html").write_text("<h1>Intro</h1>\n")
    (temp_dir / "chapters" / "02-setup.html").write_text("<h1>Setup</h1>\n")
    (temp_dir / "chapters" / "03-basics.html").write_text("<h1>Basics</h1>\n")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{temp_dir / 'index.html'}`",
                f"- `{temp_dir / 'chapters' / '01-introduction.html'}`",
                f"- `{temp_dir / 'chapters' / '02-setup.html'}`",
                f"- `{temp_dir / 'chapters' / '03-basics.html'}`",
                f"- `{temp_dir / 'chapters' / '04-variables.html'}`",
            ]
        )
    )
    context.session.current_task = (
        f"Update {temp_dir / 'index.html'} with the right chapter links."
    )
    # Capture both steering channels so we can assert which one is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(temp_dir / "index.html")},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{temp_dir / 'index.html'} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    # The executor reports the call as a DUPLICATE rather than executing it.
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Exactly one persistent nudge, combining the duplicate warning with a
    # concrete next step toward the missing artifact; nothing ephemeral.
    assert len(persistent_messages) == 1
    assert "Reuse the earlier observation instead of repeating it." in persistent_messages[0]
    assert "A declared output artifact is still missing." in persistent_messages[0]
    assert "Resume by creating `04-variables.html` now." in persistent_messages[0]
    assert (
        f"Prefer one `write` call for `{temp_dir / 'chapters' / '04-variables.html'}` instead of more rereads."
        in persistent_messages[0]
    )
    assert ephemeral_messages == []
700
701
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """A stale TodoWrite must not revert a file-creation todo already completed.

    The batch first writes chapter 03 (completing its todo), then replays a
    TodoWrite that still lists both todos as pending; the DoD must keep chapter
    03 completed while leaving chapter 04 pending.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    # Seed the DoD with two pending file-creation todos.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 03-first-website.html",
                "active_form": "Creating 03-first-website.html",
                "status": "pending",
            },
            {
                "content": "Create 04-configuration-basics.html",
                "active_form": "Creating 04-configuration-basics.html",
                "status": "pending",
            },
        ],
    )

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)
    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    # TodoWrite whose payload has not caught up with the write above.
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Create 03-first-website.html",
                    "active_form": "Creating 03-first-website.html",
                    "status": "pending",
                },
                {
                    "content": "Create 04-configuration-basics.html",
                    "active_form": "Creating 04-configuration-basics.html",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_path}",
                is_error=False,
            ),
            tool_outcome(
                tool_call=stale_todo_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Create 03-first-website.html",
                            "active_form": "Creating 03-first-website.html",
                            "status": "pending",
                        },
                        {
                            "content": "Create 04-configuration-basics.html",
                            "active_form": "Creating 04-configuration-basics.html",
                            "status": "pending",
                        },
                    ]
                },
            ),
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Chapter 03 stays completed despite the stale payload; 04 stays pending.
    assert "Create 03-first-website.html" in dod.completed_items
    assert "Create 03-first-website.html" not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
819
820
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """A glob over existing chapter files should pass through without nudges.

    When the glob result matches reality, no steering messages are queued and
    the single tool-result message carries no injected inventory preamble.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Real files on disk so the glob observation is accurate.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    (temp_dir / "index.html").write_text("<ul></ul>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = (
        f"Update {temp_dir / 'index.html'} so the chapter links match the sibling files."
    )
    # Capture both steering channels so we can assert neither is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters), "pattern": "*.html"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="\n".join(
                    [
                        str(chapters / "01-introduction.html"),
                        str(chapters / "02-setup.html"),
                    ]
                ),
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert len(summary.tool_result_messages) == 1
    assert "Verified chapter inventory:" not in summary.tool_result_messages[0].content
905
906
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """An edit that brings the TOC in sync should produce no previews or nudges.

    index.html already contains the corrected block on disk, so the successful
    edit validates cleanly: no semantic-verification preview is appended to the
    tool-result messages and no steering messages are queued.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Real chapter files whose headings match the new TOC entries.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    index_path = temp_dir / "index.html"
    old_block = (
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
        ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
        "</ul>\n"
    )
    new_block = (
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    # Disk already holds the corrected block, as it would after the edit ran.
    index_path.write_text(new_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )
    # Capture both steering channels so we can assert neither is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": old_block,
            "new_string": new_block,
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully edited {index_path}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done(
            "Update index.html so every chapter link and title matches the real HTML files in chapters/."
        ),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert all(
        "Semantic verification preview:" not in message.content
        for message in summary.tool_result_messages
    )
    assert persistent_messages == []
    assert ephemeral_messages == []
1007
1008
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """A read of a reference index.html must not trigger the HTML TOC handoff.

    The task prompt only asks to *look at* an existing guide, so reading its
    index should queue no steering messages (persistent or ephemeral) and
    attach no semantic-verification preview to the tool result.
    """

    # Reasoning hooks must stay unused here; raising makes any
    # unexpected invocation fail the test loudly.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fixture: two chapter files plus an index whose TOC links/titles
    # already match them exactly — nothing for a handoff to correct.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(
        "<h2>Table of Contents</h2>\n"
        '<ul class="chapter-list">\n'
        '  <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        '  <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )

    # Discovery-flavored prompt: the fortran guide is reference material
    # for a brand-new nginx guide, not an edit target.
    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    # Capture both steering channels so we can assert they stay empty.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=index_path.read_text(),
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done(prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # No steering of either kind, and no verification preview injected
    # into the tool result messages.
    assert persistent_messages == []
    assert ephemeral_messages == []
    assert all(
        "Semantic verification preview:" not in message.content
        for message in summary.tool_result_messages
    )
1101
1102
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """After a discovery read completes the first todo, the runner hands off.

    Reading the reference chapter should mark the "examine" todo complete,
    queue a persistent steering message pointing at the next pending todo
    (directory creation) with a concrete "Resume by creating" nudge drawn
    from the implementation plan, and never echo the reference file name.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Reference material exists on disk; the planned nginx artifacts do not.
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Three pending todos: one discovery step, then two creation steps.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                "status": "pending",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Working on: Create the nginx directory structure",
                "status": "pending",
            },
            {
                "content": "Create the nginx index.html file",
                "active_form": "Working on: Create the nginx index.html file",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The discovery todo is marked complete and the persistent handoff
    # targets the next pending todo with a concrete resume artifact.
    assert (
        "Examine the existing Fortran guide structure to understand the cadence and format"
        in dod.completed_items
    )
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in message
        for message in persistent_messages
    )
    assert any(
        "Resume by creating `chapters/` now." in message
        for message in persistent_messages
    )
    # The reference file must not leak into steering; ephemeral stays empty.
    assert all("01-introduction.html" not in message for message in persistent_messages)
    assert ephemeral_messages == []
1221
1222
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """A duplicate reference read yields one handoff to the next pending todo.

    With the discovery todo already completed, a DUPLICATE read outcome should
    queue exactly one persistent message that tells the agent to reuse the
    earlier observation and continue with the next pending todo — not an
    "Update `…`" mutation nudge.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference = temp_dir / "fortran" / "index.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")

    # Seed history with the earlier read observation the duplicate refers to.
    messages = [
        Message(
            role=Role.TOOL,
            content=(
                "Observation [read]: Result: "
                "<h1>Fortran Beginner's Guide</h1>\n"
            ),
        )
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done(prompt)
    # Discovery is already done; two creation todos remain pending.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Working on: Create the nginx directory structure",
                "status": "pending",
            },
            {
                "content": "Create the nginx index.html file",
                "active_form": "Working on: Create the nginx index.html file",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference)},
    )
    # Simulate the executor's duplicate-action short-circuit for the read.
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{reference} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Exactly one persistent handoff: reuse the observation, move to the
    # next pending todo; no "Update `…`" mutation nudge, no ephemeral noise.
    assert len(persistent_messages) == 1
    assert "Reuse the earlier observation instead of repeating it." in persistent_messages[0]
    assert (
        "Continue with the next pending item: `Create the nginx directory structure`"
        in persistent_messages[0]
    )
    assert "Update `" not in persistent_messages[0]
    assert ephemeral_messages == []
1345
1346
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """The resume nudge names the first planned artifact still missing on disk.

    With some planned files already created (and tracked in touched_files),
    a successful reference read should confirm discovery progress and steer
    toward creating the missing `index.html` — a concrete artifact — rather
    than re-announcing the generic "Create each chapter file" todo.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Partially built target: chapter one exists, index.html does not.
    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    chapter_one = chapters / "01-introduction.html"
    chapter_one.write_text("<html></html>\n")
    index_path = guide_root / "index.html"

    reference = temp_dir / "Loader" / "guides" / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True, exist_ok=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    # The plan lists both existing and still-missing artifacts.
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapters / '02-installation.html'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.append(str(chapter_one))
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure to understand the format and cadence",
                "active_form": "Working on: Examine the existing Fortran guide structure to understand the format and cadence",
                "status": "pending",
            },
            {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Working on: Create each chapter file with appropriate content",
                "status": "pending",
            },
            {
                "content": "Ensure all files follow the same structure and style as the Fortran guide",
                "active_form": "Working on: Ensure all files follow the same structure and style as the Fortran guide",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference)},
    )
    read_output = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.EXECUTED,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=read_output,
                    result_content=read_output,
                ),
                event_content=read_output,
                is_error=False,
                result_output=read_output,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Progress is confirmed and the nudge names the concrete missing file,
    # suppressing the generic next-pending-item phrasing.
    assert persistent_messages
    assert any(
        "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
        in message
        for message in persistent_messages
    )
    assert any("Resume by creating `index.html` now." in message for message in persistent_messages)
    assert not any(
        "Continue with the next pending item: `Create each chapter file with appropriate content`"
        in message
        for message in persistent_messages
    )
    assert ephemeral_messages == []
1481
1482
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """Once all planned artifacts exist, unplanned expansion todos are skipped.

    Every file the implementation plan lists already exists on disk. A
    duplicate read should steer toward the verification todo and ignore the
    pending item that would create a chapter the plan never mentioned
    (07-performance-tuning.html).
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fully built target: index plus both planned chapters exist.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # First item is unplanned scope expansion; second is legitimate verification.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Verify all guide files are linked and complete",
        "Complete the requested work",
    ]

    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    # Simulate the executor's duplicate-action short-circuit for the read.
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Single handoff naming the verification todo; the unplanned chapter
    # never appears in the steering message.
    assert len(persistent_messages) == 1
    assert "Verify all guide files are linked and complete" in persistent_messages[0]
    assert "Create 07-performance-tuning.html" not in persistent_messages[0]
    assert ephemeral_messages == []
1597
1598
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """With the plan fully realized, a duplicate read pushes toward verification.

    All planned files exist and only unplanned/generic pending items remain,
    so the duplicate read should produce a single persistent message stating
    that all planned artifacts exist and directing the agent to verification
    or final confirmation — not to the unplanned chapter.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fully built target: index plus both planned chapters exist.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    # Only an unplanned expansion and a generic wrap-up item remain pending.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]

    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    # Simulate the executor's duplicate-action short-circuit for the read.
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Single verification handoff; the unplanned chapter is never mentioned.
    assert len(persistent_messages) == 1
    assert "All explicitly planned artifacts already exist." in persistent_messages[0]
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in persistent_messages[0]
    )
    assert "Create 07-performance-tuning.html" not in persistent_messages[0]
    assert ephemeral_messages == []
1717
1718
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Stale "create X" todos for files that already exist are not re-surfaced.

    The pending list still contains creation items for chapters that are
    already on disk. After a duplicate read, the verification handoff must
    not repeat those stale creation todos.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fully built target: index plus both planned chapters exist.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    # Stale creation todos (both files already exist) plus a generic item.
    dod.pending_items = [
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
        "Complete the requested work",
    ]

    tool_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    # Simulate the executor's duplicate-action short-circuit for the read.
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Verification handoff only; neither stale creation todo is repeated.
    assert len(persistent_messages) == 1
    assert "All explicitly planned artifacts already exist." in persistent_messages[0]
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in persistent_messages[0]
    )
    assert "Create 01-getting-started.html" not in persistent_messages[0]
    assert "Creating 02-installation.html" not in persistent_messages[0]
    assert ephemeral_messages == []
1839
1840
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """After enough observation, the handoff urges an actual mutation.

    When the only remaining todo after the discovery read is a creation
    step, the persistent steering should both name that next pending item
    and explicitly tell the agent to stop gathering reference material.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    # One discovery todo followed directly by a creation (mutation) todo.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                "status": "pending",
            },
            {
                "content": "Create the nginx index.html file",
                "active_form": "Working on: Create the nginx index.html file",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff names the creation todo AND pushes the agent to mutate now.
    assert any(
        "Continue with the next pending item: `Create the nginx index.html file`"
        in message
        for message in persistent_messages
    )
    assert any(
        "stop gathering more reference material and perform the change now" in message
        for message in persistent_messages
    )
    assert ephemeral_messages == []
1933
1934
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """The discovery-completion handoff goes on the persistent channel.

    Completing the "examine" todo via a reference read should queue the
    next-pending-item message as persistent steering (not ephemeral), so
    it survives into subsequent turns.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "First, examine the existing fortran guide structure and content",
                "active_form": "Working on: First, examine the existing fortran guide structure and content",
                "status": "pending",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Working on: Create the nginx directory structure",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The handoff landed on the persistent channel; ephemeral stays empty.
    assert persistent_messages
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in message
        for message in persistent_messages
    )
    assert ephemeral_messages == []
2024
2025
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_prefers_pending_index_after_mkdir(
    temp_dir: Path,
) -> None:
    """After a successful `mkdir` satisfies the first todo, the runner should
    queue a persistent nudge targeting the next planned artifact
    (`index.html`) by name, without the generic missing-artifact wording and
    without using the ephemeral steering channel."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    implementation_plan = temp_dir / "implementation.md"
    # Plan declares a chapters/ directory and an index.html artifact;
    # neither exists on disk yet.
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels so the test can assert which one is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "pending",
            },
            {
                "content": "Develop the main index.html file with proper structure",
                "active_form": "Developing the main index.html file with proper structure",
                "status": "pending",
            },
        ],
    )

    # A bash mkdir creates the chapters directory, completing the first todo.
    tool_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Nudge must be persistent, name the concrete next artifact, and skip
    # the generic fallback phrasings.
    assert persistent_messages
    message = persistent_messages[-1]
    assert "Next step: create `index.html`." in message
    assert (
        f"Prefer one `write(file_path=..., content=...)` call for `{(nginx_root / 'index.html').resolve(strict=False)}` now."
        in message
    )
    assert "One declared output artifact is still missing." not in message
    assert "Do not reread reference material or spend the next turn on bookkeeping." in message
    assert "Resume by creating the next output file under `chapters/` now." not in message
    assert ephemeral_messages == []
2133
2134
@pytest.mark.asyncio
async def test_tool_batch_runner_first_file_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """Writing the first planned file should queue a persistent handoff that
    names the next missing chapter artifact; nothing may be sent on the
    ephemeral steering channel."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"

    implementation_plan = temp_dir / "implementation.md"
    # Plan declares index.html (written by the tool call below) and a first
    # chapter file that stays missing, so the handoff should point there.
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapters / '01-introduction.html'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels so the test can assert which one is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the main index.html file with proper structure",
                "active_form": "Creating the main index.html file with proper structure",
                "status": "pending",
            },
            {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Creating each chapter file with appropriate content",
                "status": "pending",
            },
        ],
    )

    # Successful write of the first planned output file.
    tool_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff must acknowledge progress, name the next chapter file, and
    # stay on the persistent channel only.
    assert persistent_messages
    message = persistent_messages[-1]
    assert "Confirmed progress:" in message
    assert "Resume by creating `01-introduction.html` now." in message
    assert (
        f"Prefer one `write` call for `{(chapters / '01-introduction.html').resolve(strict=False)}` "
        "instead of more rereads."
        in message
    )
    assert "Do not move to verification, final confirmation, or TodoWrite-only bookkeeping" in message
    assert ephemeral_messages == []
2249
2250
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """A duplicate read of an already-seen file should nudge toward the still
    missing planned chapter rather than the pending review-style todo."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # This scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # This scenario must never reach verification.
        raise AssertionError("Verification should not run for this scenario")

    # Lay out a guide tree where one planned chapter is still absent.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    landing_page = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    landing_page.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")
    first_chapter.write_text("<h1>One</h1>\n")

    # The implementation plan also names 06-ssl-configuration.html, which
    # was never written.
    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{landing_page}`",
        f"- `{first_chapter}`",
        f"- `{chapter_dir / '06-ssl-configuration.html'}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": item,
                "active_form": f"Working on: {item}",
                "status": "pending",
            }
            for item in (
                "Ensure all files are properly linked and formatted consistently",
                "Create the final chapter (06-ssl-configuration.html)",
            )
        ],
    )
    # Sanity-check the helper directly: the missing chapter must outrank
    # the review-style first pending todo.
    assert tool_batches_should_prioritize_missing_artifact(
        dod=dod,
        next_pending=dod.pending_items[0],
        missing_artifact=(chapter_dir / "06-ssl-configuration.html", False),
        project_root=temp_dir,
    )

    duplicate_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(landing_page)},
    )
    batch_runner._queue_duplicate_observation_nudge(duplicate_read, dod=dod)  # type: ignore[attr-defined]

    assert captured
    latest = captured[-1]
    assert "06-ssl-configuration.html" in latest
    assert "Do not switch into review or consistency-check mode" in latest
    assert (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
        not in latest
    )
2343
2344
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Once every artifact listed in the implementation plan exists on disk,
    the runner should announce that the artifact phase is complete and steer
    toward the pending consistency/verification todo."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run for this scenario")

    # All planned files already exist before the final write lands.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the guide files",
                "active_form": "Working on: Create the guide files",
                "status": "completed",
            },
            {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            },
        ],
    )
    # The final planned artifact is (re)written by this batch.
    tool_call = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(chapter_two),
            "content": "<h1>Two</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_two}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff must announce completion and point at the pending review todo.
    assert any(
        "All explicitly planned artifacts now exist." in message
        for message in queued_messages
    )
    assert any(
        "Ensure all files are properly linked and formatted consistently" in message
        for message in queued_messages
    )
    assert any(
        "Move to verification once no specific mismatch remains." in message
        for message in queued_messages
    )
2463
2464
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """After a mutating write, with two planned chapters still missing, the
    handoff should name the next missing artifact and forbid verification or
    bookkeeping-only turns."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    # index.html exists; chapter_one and chapter_two are declared but absent.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the main index.html file with proper structure",
                "active_form": "Working on: Create the main index.html file with proper structure",
                "status": "pending",
            },
            {
                "content": "Create each chapter file in sequence, following the established pattern",
                "active_form": "Working on: Create each chapter file in sequence, following the established pattern",
                "status": "pending",
            },
            {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output=f"Successfully wrote {index_path}", is_error=False)]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff must name the first missing chapter and keep the model in
    # artifact-producing mode.
    assert queued_messages
    message = queued_messages[-1]
    assert "Resume by creating `01-getting-started.html` now." in message
    assert "refresh `TodoWrite`" in message
    assert "Do not move to verification, final confirmation, or TodoWrite-only bookkeeping" in message
    assert "Do not spend another turn on working notes or rediscovery alone." in message
2571
2572
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """With a seven-chapter plan of which only five chapters exist, writing
    chapter 05 must steer toward chapter 06 and must NOT claim all planned
    artifacts exist."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    # Seven planned chapters; only the first five end up on disk
    # (chapters 06 and 07 remain missing).
    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Must point at the next missing chapter, never at early completion.
    assert any(
        "Resume by creating `06-performance-tuning.html` now." in message
        for message in queued_messages
    )
    assert not any(
        "All explicitly planned artifacts now exist." in message
        for message in queued_messages
    )
2697
2698
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """With substantial progress recorded (touched files + completed items),
    the missing-artifact nudge should use the compact wording and drop the
    TodoWrite-refresh suggestion."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    # Five planned chapters; the last one (05) never gets written.
    chapter_paths = [
        chapters / "01-introduction.html",
        chapters / "02-installation.html",
        chapters / "03-configuration.html",
        chapters / "04-basic-usage.html",
        chapters / "05-advanced-features.html",
    ]
    for path in (index_path, *chapter_paths[:4]):
        path.write_text("<html></html>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Seed the DoD with prior progress so the compact nudge path is taken.
    dod.touched_files.extend(str(path) for path in (index_path, *chapter_paths[:4]))
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Creating each chapter file with appropriate content",
                "status": "pending",
            }
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(chapter_paths[3]),
            "content": "<html>updated</html>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[3]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Compact nudge: names the missing artifact, omits TodoWrite refresh.
    assert queued_messages
    message = queued_messages[-1]
    assert "Resume by creating `05-advanced-features.html` now." in message
    assert "No TodoWrite, no verification, no rereads until that artifact exists." in message
    assert "refresh `TodoWrite`" not in message
2817
2818
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch, while a declared artifact is still missing,
    should queue a message that acknowledges the todo update but requeues the
    exact next file-creation step."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    # chapter_two is declared in the plan but never written.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    chapter_one.write_text("<h1>One</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ],
    )
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    # Batch contains only a TodoWrite call — no file mutation.
    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "completed",
                },
                {
                    "content": "Create 02-installation.html",
                    "active_form": "Creating 02-installation.html",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Create 01-getting-started.html",
                            "active_form": "Creating 01-getting-started.html",
                            "status": "completed",
                        },
                        {
                            "content": "Create 02-installation.html",
                            "active_form": "Creating 02-installation.html",
                            "status": "pending",
                        },
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Nudge must re-anchor on the missing artifact, not the bookkeeping.
    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Resume by creating `02-installation.html` now." in message
    assert "refresh `TodoWrite`" in message
    assert "Do not spend the next turn on TodoWrite alone" in message
2958
2959
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch, after every planned artifact exists, should
    push a verification handoff that names the verify todo — and must not
    echo the stale reference-reading todo back to the model."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    # Every planned file already exists on disk.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )

    # Batch contains only a TodoWrite call — no file mutation.
    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                    "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                    "status": "pending",
                },
                {
                    "content": "Verify all guide files are linked and complete",
                    "active_form": "Working on: Verify all guide files are linked and complete",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                            "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                            "status": "pending",
                        },
                        {
                            "content": "Verify all guide files are linked and complete",
                            "active_form": "Working on: Verify all guide files are linked and complete",
                            "status": "pending",
                        },
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff must steer to verification and suppress the stale
    # reference-reading todo.
    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist." in message
    assert "Verify all guide files are linked and complete" in message
    assert "Move to verification once no specific mismatch remains." in message
    assert "reopen reference materials" in message
    assert "Fortran guide structure" not in message
3102
3103
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch must requeue the next concrete mutation.

    Scenario: the declared output roots (guide dir, chapters dir, index.html)
    already exist, but the chapter file linked from index.html does not. After
    a batch containing only TodoWrite, the runner should queue a steering
    message that names `01-introduction.html` as the next file to create
    instead of allowing further bookkeeping or rereads.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reflection hooks must not fire for a pure TodoWrite batch.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # On-disk state: roots and index exist; the linked chapter file is missing.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text(
        "\n".join(
            [
                "<!DOCTYPE html>",
                "<html>",
                "<body>",
                '<a href="chapters/01-introduction.html">Introduction</a>',
                "</body>",
                "</html>",
                "",
            ]
        )
    )

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.append(str(index_path))
    # Two items completed, chapter writing still pending.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "completed",
            },
            {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )

    # The batch is a TodoWrite call whose payload mirrors the synced todos.
    tool_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Examine the existing Fortran guide structure",
                    "active_form": "Examining the existing Fortran guide structure",
                    "status": "completed",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Creating the nginx directory structure",
                    "status": "completed",
                },
                {
                    "content": "Write the introduction chapter",
                    "active_form": "Writing the introduction chapter",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Examine the existing Fortran guide structure",
                            "active_form": "Examining the existing Fortran guide structure",
                            "status": "completed",
                        },
                        {
                            "content": "Create the nginx directory structure",
                            "active_form": "Creating the nginx directory structure",
                            "status": "completed",
                        },
                        {
                            "content": "Write the introduction chapter",
                            "active_form": "Writing the introduction chapter",
                            "status": "pending",
                        },
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The steering nudge must name the concrete missing file and forbid
    # spending another turn on bookkeeping alone.
    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `01-introduction.html` now." in message
    assert "It is the next missing declared output under `chapters/`." in message
    assert "Prefer one `write` call for `" in message
    assert "01-introduction.html` instead of more rereads." in message
    assert "Do not spend the next turn on TodoWrite alone" in message
3269
3270
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """Child targets declared only via index links should still be named.

    The plan declares directories and index.html; the index links two chapter
    files that do not exist. The steering nudge after TodoWrite must name the
    first missing linked file (`introduction.html`).
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    # Index declares two child chapter files; neither exists on disk.
    index_path.write_text(
        "\n".join(
            [
                "<html>",
                '<a href="chapters/introduction.html">Introduction</a>',
                '<a href="chapters/installation.html">Installation</a>',
                "</html>",
            ]
        )
        + "\n"
    )

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_path))

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # NOTE: camelCase `activeForm` here (vs snake_case in metadata) is
    # deliberate test data exercising both accepted key spellings.
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Write the introduction chapter",
                    "activeForm": "Writing the introduction chapter",
                    "status": "pending",
                }
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Write the introduction chapter",
                            "active_form": "Writing the introduction chapter",
                            "status": "pending",
                        }
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `introduction.html` now." in message
    assert "It is the next missing declared output under `chapters/`." in message
    assert "Prefer one `write` call for `" in message
    assert "introduction.html` instead of more rereads." in message
    assert "Do not spend the next turn on TodoWrite alone" in message
3398
3399
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """With chapter 1 done, the nudge must point at the chapter-2 file.

    Index links two chapters; only `01-introduction.html` exists. The steering
    message must name `02-installation.html` and include the full resolved
    path in the suggested `write` call.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text(
        "\n".join(
            [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
        )
        + "\n"
    )
    # Only chapter 1 is materialized; chapter 2 remains the gap.
    chapter_one.write_text("<html></html>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Creating Chapter 2: Installation and Setup",
        "Complete the requested work",
    ]
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Creating Chapter 2: Installation and Setup",
                    "activeForm": "Creating Chapter 2: Installation and Setup",
                    "status": "pending",
                }
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Creating Chapter 2: Installation and Setup",
                            "active_form": "Creating Chapter 2: Installation and Setup",
                            "status": "pending",
                        }
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Creating Chapter 2: Installation and Setup`." in message
    assert "Resume by creating `02-installation.html` now." in message
    # The nudge should carry the absolute, resolved target path.
    assert (
        f"Prefer one `write` call for `{(chapters / '02-installation.html').resolve(strict=False)}` "
        "instead of more rereads."
        in message
    )
    assert "Make your next response the concrete mutation tool call itself" in message
3531
3532
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """The nudge may borrow a filename pattern from an inspected sibling tree.

    The index declares no chapter links, but the conversation history shows a
    `read` of `fortran/chapters/01-introduction.html`. The steering message
    should propose the same `01-introduction.html` name and say it mirrors the
    observed pattern.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide whose chapter naming pattern was already inspected.
    reference_chapters = temp_dir / "fortran" / "chapters"
    reference_chapters.mkdir(parents=True)
    (reference_chapters / "01-introduction.html").write_text("<h1>Introduction</h1>\n")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_path))

    queued_messages: list[str] = []
    # History includes a prior read of the reference chapter file, which is
    # what lets the runner infer the sibling filename pattern.
    context = build_context(
        temp_dir=temp_dir,
        messages=[
            Message(
                role=Role.ASSISTANT,
                content="",
                tool_calls=[
                    ToolCall(
                        id="read-ref-1",
                        name="read",
                        arguments={"file_path": str(reference_chapters / "01-introduction.html")},
                    )
                ],
            )
        ],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Write the introduction chapter",
                    "activeForm": "Writing the introduction chapter",
                    "status": "pending",
                }
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Write the introduction chapter",
                            "active_form": "Writing the introduction chapter",
                            "status": "pending",
                        }
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `01-introduction.html` now." in message
    assert (
        "It mirrors the observed filename pattern from another `chapters/` directory "
        "you already inspected."
        in message
    )
    assert "01-introduction.html` instead of more rereads." in message
3668
3669
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """A notepad working-note batch must also trigger the resume nudge.

    The plan declares `02-installation.html` but only chapter 1 exists.
    Recording a working note should queue a steering message naming the
    missing file and demanding the mutation as the very next response.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    # chapter_two is declared in the plan below but never written to disk.
    chapter_one.write_text("<h1>One</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    # The batch is a bookkeeping note, not a mutation.
    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in message
    assert "Resume by creating `02-installation.html` now." in message
    assert "Make your next response the concrete mutation tool call itself" in message
    assert "refresh `TodoWrite`" in message
    assert "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation" in message
3784
3785
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """A working note must defer to a discovery-first pending todo.

    Nothing declared in the plan exists on disk and the first pending item is
    an examination step, so the steering nudge should push one concrete
    evidence-gathering call rather than jump straight to creating
    `index.html`.
    """

    async def fail_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Declare outputs in the plan without materializing any of them.
    nginx_root = temp_dir / "guides" / "nginx"
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root / 'index.html'}`\n"
        f"- `{nginx_root / 'chapters'}`\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_confidence,
        verify_action=fail_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items += [
        "First, examine the existing fortran guide structure and content to understand the format",
        "Create the nginx directory structure",
        "Develop the main index.html file for the nginx guide",
    ]

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    outcomes = [
        tool_outcome(
            tool_call=note_call,
            output="Working note recorded",
            is_error=False,
        )
    ]

    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=FakeExecutor(outcomes),  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The nudge targets the discovery step, not file creation.
    assert steering
    nudge = steering[-1]
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
        in nudge
    )
    assert "one concrete evidence-gathering tool call" in nudge
    assert "Resume by creating `index.html` now." not in nudge
3878
3879
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """A concrete output gap outranks a stale discovery-first pending item.

    Index links three chapters but only the first exists, so even though the
    first pending item is an examination step, the nudge should name
    `02-installation.html` instead of re-running discovery.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_path = guide_root / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    # Three declared links; only the first chapter file will exist.
    index_path.write_text(
        "\n".join(
            [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-configuration.html">Configuration</a>',
            ]
        )
    )
    first_chapter.write_text("<h1>Introduction</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root / 'index.html'}`",
                f"- `{chapters_dir}/`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Discovery item is stale: the structure has clearly been examined already.
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create chapter files following the established pattern",
        ]
    )
    dod.touched_files.extend([str(index_path), str(first_chapter)])

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in message
    assert "Resume by creating `02-installation.html` now." in message
    assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in message
3984 assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in message
3985
3986
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """A shallow directory glob must not trigger any steering hand-off.

    The glob result contains only directory paths — no file content has been
    read yet — so the runner should stay silent and let discovery continue.
    """

    async def fail_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference tree holds only directories, no chapter files.
    fortran_root = temp_dir / "Loader" / "guides" / "fortran"
    chapter_dir = fortran_root / "chapters"
    chapter_dir.mkdir(parents=True)

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root / 'index.html'}`\n"
        f"- `{nginx_root / 'chapters'}`\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_confidence,
        verify_action=fail_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items += [
        "First, examine the existing fortran guide structure and content",
        "Create the nginx directory structure",
        "Develop the main index.html file for nginx guide",
    ]

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(fortran_root)},
    )
    outcomes = [
        tool_outcome(
            tool_call=glob_call,
            output=f"{fortran_root}\n{chapter_dir}",
            is_error=False,
        )
    ]

    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=FakeExecutor(outcomes),  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # A listing alone is not enough evidence to steer on.
    assert steering == []
4076
4077
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """A blocked no-op edit on an already-correct TOC must not queue a nudge.

    The index's table of contents already matches the chapter files, so the
    identical old/new edit is blocked by the executor and the runner should
    hand off quietly (no steering message).
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )
    # Chapter files whose titles already agree with the TOC below.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    current_block = (
        "<h2>Table of Contents</h2>\n"
        '    <ul class="chapter-list">\n'
        '        <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        '        <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "    </ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(current_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # old_string == new_string: the executor blocks this as a no-op.
    tool_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": current_block,
            "new_string": current_block,
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=(
                    "[Blocked - old_string and new_string are identical - no change "
                    "would occur] Suggestion: Provide different old and new strings"
                ),
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages == []
4171
4172
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """A blocked no-op edit nudge must keep steering at the active repair target."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
    missing_chapter = temp_dir / "guide" / "chapters" / "05-advanced-topics.html"
    # Prior assistant message that establishes the active repair scope.
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
        ),
    )
    ctx = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    batch_runner._queue_blocked_html_edit_nudge(
        noop_edit,
        "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings",
    )

    assert captured
    nudge = captured[0]
    assert str(repair_target) in nudge
    assert "no on-disk change" in nudge
    assert "replace the surrounding block" in nudge
    assert "Do not reopen unrelated reference materials" in nudge
4231
4232
async def _noop_emit(event: AgentEvent) -> None:
    """Event sink for tests that do not inspect emitted agent events."""
    return None
4235
4236
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A fresh mutation should open verification attempt 1 on the definition of done."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    seen_events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        seen_events.append(event)

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1
    # The final timeline entry must record the planned verification attempt.
    last_entry = summary.workflow_timeline[-1]
    observation = last_entry.verification_observations[0]
    assert last_entry.reason_code == "verification_planned"
    assert last_entry.policy_outcome == "planned"
    assert observation.status == "planned"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
4308
4309
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """Setup-only `mkdir` commands must not schedule a verification attempt."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    mkdir_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=mkdir_call, output="", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    seen_events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        seen_events.append(event)

    await batch_runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    # No timeline entry may claim that verification was planned.
    planned_entries = [
        entry
        for entry in summary.workflow_timeline
        if entry.reason_code == "verification_planned"
    ]
    assert not planned_entries
4388
4389
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A new mutation invalidates previously collected passing verification evidence."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    # Seed the DoD as if verification attempt 1 already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    dod.evidence = [
        VerificationEvidence(
            command="uv run pytest -q",
            passed=True,
            stdout="401 passed",
            kind="test",
        )
    ]
    dod.completed_items.append("Collect verification evidence")
    seen_events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        seen_events.append(event)

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The earlier pass is stale, its evidence dropped, and attempt 2 is opened.
    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2
    last_entry = summary.workflow_timeline[-1]
    observation = last_entry.verification_observations[0]
    assert last_entry.reason_code == "verification_stale"
    assert last_entry.policy_outcome == "stale"
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == "uv run pytest -q"
4484
4485
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """The active-repair nudge should cite both the repair target and the missing file."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "index.html"
    missing_chapter = temp_dir / "guide" / "chapters" / "01-getting-started.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
        ),
    )
    ctx = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    batch_runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert captured
    nudge = captured[0]
    assert str(repair_target) in nudge
    assert str(missing_chapter) in nudge
    assert "Do not reopen unrelated reference materials" in nudge
4532
4533
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """The mutation-scope nudge should enumerate both allowed repair paths."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html"
    stylesheet = temp_dir / "guide" / "styles.css"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
        ),
    )
    ctx = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    batch_runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert captured
    nudge = captured[0]
    assert str(repair_target) in nudge
    assert str(stylesheet) in nudge
    assert "before widening the change set" in nudge
4583
4584
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """The late-reference-drift nudge should single out the one still-missing artifact."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# File Changes\n"
        "- `guide/index.html`\n"
        "- `guide/chapters/01-getting-started.html`\n"
        "- `guide/chapters/02-installation.html`\n"
        "- `guide/chapters/03-first-website.html`\n"
    )
    dod.implementation_plan = str(plan_path)
    # Create every planned artifact except chapter 03.
    chapters_dir = temp_dir / "guide" / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    (temp_dir / "guide" / "index.html").write_text("index")
    (chapters_dir / "01-getting-started.html").write_text("one")
    (chapters_dir / "02-installation.html").write_text("two")
    batch_runner = ToolBatchRunner(ctx, store)

    batch_runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert captured
    nudge = captured[0]
    assert "03-first-website.html" in nudge
    assert "older reference materials" in nudge
4637
4638
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """When every planned artifact exists, the nudge should steer toward verification."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Materialize the full planned artifact set on disk.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("index")
    chapter_one.write_text("one")
    chapter_two.write_text("two")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    verification_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [verification_todo], project_root=temp_dir)

    batch_runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert captured
    nudge = captured[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Verify all guide files are linked and complete" in nudge
    assert "Do not reopen earlier reference materials." in nudge
4719
4720
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """The declared-target nudge should quote the closest declared sibling target."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    chapter_path = temp_dir / "guide" / "chapters" / "01-introduction.html"
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(chapter_path)},
    )
    batch_runner._queue_blocked_html_declared_target_nudge(
        blocked_write,
        (
            "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
            "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
            "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
            "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
            "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
        ),
    )

    assert captured
    nudge = captured[0]
    assert str(chapter_path) in nudge
    assert "`chapters/02-installation.html`" in nudge
    assert "same file now" in nudge
4769
4770
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """A blocked empty-`file_path` write should nudge toward the next planned artifact.

    Seeds an implementation plan with three files, creates the first two, then
    blocks a write with an empty path and checks the steering message names the
    concrete next artifact (chapter 2) and records the recovery attempt.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Confidence scoring must never be invoked on this path.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Action verification must never be invoked on this path.
        raise AssertionError("Verification should not run in this scenario")

    # Planned artifact set: index plus two chapters; only chapter 2 is missing.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # A write with an empty file_path, which the executor reports as BLOCKED.
    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.BLOCKED,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=blocked_message,
                    result_content=blocked_message,
                    is_error=True,
                ),
                event_content=blocked_message,
                is_error=True,
                result_output=blocked_message,
            )
        ]
    )
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The steering nudge must name the missing chapter and prefer a single write.
    assert queued
    assert "did not provide a valid `file_path`" in queued[0]
    assert "Resume by creating `02-installation.html` now." in queued[0]
    assert (
        f"Prefer one `write` call for `{chapter_two}` instead of more rereads."
        in queued[0]
    )
    assert context.recovery_context is not None
    assert context.recovery_context.attempts[-1].error == blocked_message