loader Public

Watch 0 Fork 0 Star 0
Python · 239230 bytes Raw Blame History
  
        1
        """Tests for tool-batch execution on RuntimeContext."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        from types import SimpleNamespace
      
        7
        
        8
        import pytest
      
        9
        
        10
        from loader.llm.base import Message, Role, ToolCall
      
        11
        from loader.runtime.context import RuntimeContext
      
        12
        from loader.runtime.dod import (
      
        13
            DefinitionOfDoneStore,
      
        14
            VerificationEvidence,
      
        15
            create_definition_of_done,
      
        16
        )
      
        17
        from loader.runtime.events import AgentEvent, TurnSummary
      
        18
        from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
      
        19
        from loader.runtime.path_display import display_runtime_path
      
        20
        from loader.runtime.permissions import (
      
        21
            PermissionMode,
      
        22
            build_permission_policy,
      
        23
            load_permission_rules,
      
        24
        )
      
        25
        from loader.runtime.reasoning_types import (
      
        26
            ActionVerification,
      
        27
            ConfidenceAssessment,
      
        28
            ConfidenceLevel,
      
        29
        )
      
        30
        from loader.runtime.recovery import RecoveryContext
      
        31
        from loader.runtime.tool_batches import (
      
        32
            ToolBatchRunner,
      
        33
        )
      
        34
        from loader.runtime.tool_batches import (
      
        35
            _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
      
        36
        )
      
        37
        from loader.runtime.workflow import sync_todos_to_definition_of_done
      
        38
        from loader.tools.base import ToolResult as RegistryToolResult
      
        39
        from loader.tools.base import create_default_registry
      
        40
        from tests.helpers.runtime_harness import ScriptedBackend
      
        41
        
        42
        
        43
        class FakeSession:
            """In-memory session double holding messages and a workflow timeline.

            The seed messages are copied, so appending through this object never
            mutates the list the caller passed in.
            """

            def __init__(self, messages: list[Message]) -> None:
                self.workflow_timeline = []
                self.messages = [*messages]

            def append(self, message: Message) -> None:
                """Add ``message`` to the end of the stored conversation."""
                self.messages.append(message)

            def append_workflow_timeline_entry(self, entry) -> None:
                """Add ``entry`` to the end of the workflow timeline."""
                self.workflow_timeline.append(entry)
      
        53
        
        54
        
        55
        class FakeCodeFilter:
            """Code-filter double whose only operation is a no-op ``reset``."""

            def reset(self) -> None:
                """Do nothing and return ``None``; the fake keeps no state."""
                return
      
        58
        
        59
        
        60
        class FakeSafeguards:
            """Safeguards double that passes content through and never steers.

            Only the answer returned by ``detect_loop`` is configurable, via the
            keyword-only ``detect_loop_result`` constructor argument.
            """

            def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
                self._detect_loop_result = detect_loop_result
                self.code_filter = FakeCodeFilter()
                # Opaque placeholders: plain objects with no behavior of their own.
                self.action_tracker = object()
                self.validator = object()

            def filter_stream_chunk(self, content: str) -> str:
                """Return the streamed chunk unchanged."""
                return content

            def filter_complete_content(self, content: str) -> str:
                """Return the completed content unchanged."""
                return content

            def should_steer(self) -> bool:
                """Always report that no steering is required."""
                return False

            def get_steering_message(self) -> str | None:
                """Always return ``None``; there is never a steering message."""
                return None

            def record_response(self, content: str) -> None:
                """Accept a response and discard it."""
                return None

            def detect_text_loop(self, content: str) -> tuple[bool, str]:
                """Always report that no text loop was found."""
                return (False, "")

            def detect_loop(self) -> tuple[bool, str]:
                """Return the loop-detection result supplied at construction time."""
                return self._detect_loop_result
      
        87
        
        88
        
        89
        class FakeExecutor:
            """Executor double that replays pre-queued outcomes in FIFO order.

            Every tool call it receives is recorded in ``calls``, including a call
            that arrives after the queue has been exhausted.
            """

            def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
                self.calls: list[ToolCall] = []
                # Private copy so popping never mutates the caller's list.
                self._outcomes = [*outcomes]

            async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
                """Record ``tool_call`` and return the next queued outcome.

                Extra keyword arguments are accepted and ignored.

                Raises:
                    AssertionError: if no outcome is left in the queue.
                """
                self.calls.append(tool_call)
                if self._outcomes:
                    return self._outcomes.pop(0)
                raise AssertionError("No fake tool outcome queued")
      
        99
        
        100
        
        101
        def build_context(
            *,
            temp_dir: Path,
            messages: list[Message],
            safeguards: FakeSafeguards,
            assess_confidence,
            verify_action,
            recovery_context: RecoveryContext | None = None,
            confidence_scoring: bool = False,
            verification: bool = False,
            auto_recover: bool = True,
            min_confidence_for_action: int = 3,
        ) -> RuntimeContext:
            """Build a ``RuntimeContext`` rooted at ``temp_dir`` and wired to test doubles.

            The registry, permission rules and permission policy are created from
            ``temp_dir`` with the workspace-write permission mode. The session is a
            ``FakeSession`` seeded with ``messages`` and the backend is a
            ``ScriptedBackend``. ``assess_confidence`` and ``verify_action`` become
            the context's reasoning services, and the remaining keyword arguments are
            forwarded into the config namespaces.
            """
            registry = create_default_registry(temp_dir)
            registry.configure_workspace_root(temp_dir)
            rule_status = load_permission_rules(temp_dir)
            policy = build_permission_policy(
                active_mode=PermissionMode.WORKSPACE_WRITE,
                workspace_root=temp_dir,
                tool_requirements=registry.get_tool_requirements(),
                rules=rule_status.rules,
            )

            # Plain attribute bags stand in for the real config and service objects.
            reasoning_settings = SimpleNamespace(
                rollback=False,
                show_rollback_plan=False,
                completion_check=True,
                max_continuation_prompts=5,
                self_critique=False,
                confidence_scoring=confidence_scoring,
                min_confidence_for_action=min_confidence_for_action,
                verification=verification,
            )
            runtime_settings = SimpleNamespace(
                force_react=False,
                max_recovery_attempts=2,
                auto_recover=auto_recover,
                reasoning=reasoning_settings,
            )
            reasoning_services = SimpleNamespace(
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )

            return RuntimeContext(
                project_root=temp_dir,
                backend=ScriptedBackend(),
                registry=registry,
                session=FakeSession(messages),  # type: ignore[arg-type]
                config=runtime_settings,
                capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
                project_context=None,
                permission_policy=policy,
                permission_config_status=rule_status,
                workflow_mode="execute",
                safeguards=safeguards,
                reasoning=reasoning_services,
                recovery_context=recovery_context,
            )
      
        156
        
        157
        
        158
        def tool_outcome(
            *,
            tool_call: ToolCall,
            output: str,
            is_error: bool,
            state: ToolExecutionState = ToolExecutionState.EXECUTED,
            metadata: dict[str, object] | None = None,
        ) -> ToolExecutionOutcome:
            """Build a ``ToolExecutionOutcome`` for ``tool_call`` from one output string.

            ``output`` is reused for the tool-result message (display and result
            content), the event content, the result output and the registry result.
            A falsy ``metadata`` is replaced with a fresh empty dict.
            """
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=output,
                result_content=output,
                is_error=is_error,
            )
            registry_result = RegistryToolResult(
                output=output,
                is_error=is_error,
                metadata=metadata or {},
            )
            return ToolExecutionOutcome(
                tool_call=tool_call,
                state=state,
                message=result_message,
                event_content=output,
                is_error=is_error,
                result_output=output,
                registry_result=registry_result,
            )
      
        184
        
        185
        
        186
        @pytest.mark.asyncio
        async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
            """A LOW confidence assessment gates the tool call instead of running it.

            Checks that no action is taken and the executor is never called, that the
            assessor was given context containing the user's message, and that a
            low-confidence warning lands in the session as a user message together
            with a "confidence" event.
            """
            captured: dict[str, str] = {}
            events: list[AgentEvent] = []

            async def _low_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                # Keep the context string so the assertions below can inspect it.
                captured["context"] = context
                return ConfidenceAssessment(
                    action=f"{tool_name} with {tool_args}",
                    tool_name=tool_name,
                    tool_args=tool_args,
                    level=ConfidenceLevel.LOW,
                    reasoning="Need to inspect the target first.",
                    risks=["Unknown target file"],
                )

            async def _forbidden_verify(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for skipped actions")

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            history = [
                Message(role=Role.USER, content="Please inspect the project."),
                Message(role=Role.ASSISTANT, content="I will read the file next."),
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=_low_confidence,
                verify_action=_forbidden_verify,
                confidence_scoring=True,
                min_confidence_for_action=3,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            # An outcome is queued, but the gate is expected to skip execution entirely.
            executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="unused", is_error=False)])

            result = await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert result.actions_taken == []
            assert executor.calls == []
            assert "Please inspect the project." in captured["context"]
            final_message = context.session.messages[-1]
            assert final_message.role == Role.USER
            assert "[LOW CONFIDENCE WARNING]" in final_message.content
            assert any(event.type == "confidence" for event in events)
      
        245
        
        246
        
        247
        @pytest.mark.asyncio
        async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
            """A failing tool call with auto-recover enabled starts recovery tracking.

            Checks that the context gains a recovery context, that the tool result is
            recorded on the summary and is the session's last message, and that a
            "recovery" event is emitted.
            """
            events: list[AgentEvent] = []

            async def _forbidden_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _forbidden_verify(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for failed actions")

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_forbidden_confidence,
                verify_action=_forbidden_verify,
                auto_recover=True,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
            failing_outcome = tool_outcome(tool_call=tool_call, output="command failed", is_error=True)
            executor = FakeExecutor([failing_outcome])
            summary = TurnSummary(final_response="")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=summary,
                dod=create_definition_of_done("Run tests"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is not None
            assert summary.tool_result_messages
            assert context.session.messages[-1] == summary.tool_result_messages[-1]
            recovery_events = [event for event in events if event.type == "recovery"]
            assert recovery_events
      
        290
        
        291
        
        292
        @pytest.mark.asyncio
        async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
            """Registry-result metadata is carried onto the emitted "tool_result" event."""
            events: list[AgentEvent] = []

            async def _forbidden_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _forbidden_verify(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_forbidden_confidence,
                verify_action=_forbidden_verify,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            tool_call = ToolCall(
                id="bash-1",
                name="bash",
                arguments={"command": "python -m http.server 8000", "background": True},
            )
            metadata = {"job_id": "bash-1", "status": "running", "background": True}
            background_outcome = tool_outcome(
                tool_call=tool_call,
                output="Started bash job bash-1",
                is_error=False,
                metadata=metadata,
            )
            executor = FakeExecutor([background_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Launch a preview server"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Take the first tool_result event; ``next`` raises if none was emitted.
            tool_result = next(event for event in events if event.type == "tool_result")
            assert tool_result.tool_metadata == metadata
      
        350
        
        351
        
        352
        @pytest.mark.asyncio
        async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
            """With verification enabled, the context's ``verify_action`` service is used.

            Checks that the verifier sees the tool output exactly once, that the
            pre-existing recovery context is kept and records the successful read,
            that the session's last message is the tool message carrying the output,
            and that a "verification" event is emitted.
            """
            verification_calls: list[str] = []
            events: list[AgentEvent] = []

            async def _forbidden_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _failing_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Record what was verified, then report an unverified result.
                verification_calls.append(result)
                return ActionVerification(
                    tool_name=tool_name,
                    tool_args=tool_args,
                    expected_outcome="Success",
                    actual_result=result,
                    verified=False,
                    discrepancies=["File contents did not match"],
                    needs_correction=True,
                    correction_suggestion="Read the file before editing again.",
                )

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            existing_recovery = RecoveryContext(
                original_tool="edit",
                original_args={"file_path": "README.md"},
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_forbidden_confidence,
                verify_action=_failing_verification,
                recovery_context=existing_recovery,
                verification=True,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
            read_outcome = tool_outcome(tool_call=tool_call, output="file contents", is_error=False)
            executor = FakeExecutor([read_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done("Read the docs"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert verification_calls == ["file contents"]
            assert context.recovery_context is existing_recovery
            assert existing_recovery.successful_steps == [("read", {"file_path": "README.md"})]
            final_message = context.session.messages[-1]
            assert final_message.role == Role.TOOL
            assert final_message.content == "file contents"
            assert any(event.type == "verification" for event in events)
      
        415
        
        416
        
        417
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
            temp_dir: Path,
        ) -> None:
            """A successful ``bash`` listing keeps the active recovery context in place.

            The recovery context starts with one failed ``read`` attempt. After the
            ``ls chapters`` call succeeds, the same context object must still be
            attached and must list that bash call as its only successful step.
            """

            async def _forbidden_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _forbidden_verify(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            missing_file_args = {"file_path": "chapters/04-data-types.html"}
            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args={"file_path": "chapters/04-data-types.html"},
            )
            existing_recovery.add_attempt("read", missing_file_args, "File not found")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_forbidden_confidence,
                verify_action=_forbidden_verify,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            tool_call = ToolCall(id="bash-1", name="bash", arguments={"command": "ls chapters"})
            listing_outcome = tool_outcome(
                tool_call=tool_call,
                output="01-introduction.html",
                is_error=False,
            )
            executor = FakeExecutor([listing_outcome])
            summary = TurnSummary(final_response="")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is existing_recovery
            assert existing_recovery.successful_steps == [("bash", {"command": "ls chapters"})]
      
        483
        
        484
        
        485
        @pytest.mark.asyncio
        async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful ``patch`` call clears the active recovery context.

            The recovery context starts with one failed ``read`` attempt. After the
            patch succeeds, the runtime context must no longer hold a recovery
            context.
            """

            async def _forbidden_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _forbidden_verify(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            missing_file_args = {"file_path": "chapters/04-data-types.html"}
            existing_recovery = RecoveryContext(
                original_tool="read",
                original_args={"file_path": "chapters/04-data-types.html"},
            )
            existing_recovery.add_attempt("read", missing_file_args, "File not found")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_forbidden_confidence,
                verify_action=_forbidden_verify,
                recovery_context=existing_recovery,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            single_line_hunk = {
                "old_start": 1,
                "old_lines": 1,
                "new_start": 1,
                "new_lines": 1,
                "lines": ["-a", "+b"],
            }
            tool_call = ToolCall(
                id="patch-1",
                name="patch",
                arguments={"file_path": "index.html", "hunks": [single_line_hunk]},
            )
            patch_outcome = tool_outcome(
                tool_call=tool_call,
                output="Patched index.html",
                is_error=False,
            )
            executor = FakeExecutor([patch_outcome])
            summary = TurnSummary(final_response="")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert context.recovery_context is None
      
        551
        
        552
        
        553
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_duplicate_observation_nudge(
            temp_dir: Path,
        ) -> None:
            """Check the steering nudge queued after a duplicate ``read`` outcome.

            The fake executor is seeded with a single outcome whose state is
            ``ToolExecutionState.DUPLICATE`` for a reread of ``index.html``.  The
            implementation plan lists ``chapters/04-variables.html``, which this test
            never creates on disk.  Exactly one persistent steering message is
            expected (naming the missing chapter) and no ephemeral message.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if the runner ever tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # Pure path arithmetic only; nothing touches the filesystem yet.
            chapters_dir = temp_dir / "chapters"
            index_path = temp_dir / "index.html"
            intro_path = chapters_dir / "01-introduction.html"
            setup_path = chapters_dir / "02-setup.html"
            basics_path = chapters_dir / "03-basics.html"
            variables_path = chapters_dir / "04-variables.html"

            # Prior conversation: a glob observation, a read of chapter 1 with its
            # result, and an earlier read request for index.html.
            glob_observation = Message(
                role=Role.TOOL,
                content=(
                    "Observation [glob]: Result: "
                    f"{temp_dir}/chapters/01-introduction.html\n"
                    f"{temp_dir}/chapters/02-setup.html\n"
                    f"{temp_dir}/chapters/03-basics.html"
                ),
                tool_results=[],
            )
            chapter_read_request = Message(
                role=Role.ASSISTANT,
                content="I already inspected the first chapter title.",
                tool_calls=[
                    ToolCall(
                        id="read-ch1",
                        name="read",
                        arguments={"file_path": str(intro_path)},
                    )
                ],
            )
            chapter_read_result = Message.tool_result_message(
                tool_call_id="read-ch1",
                display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
                result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
            )
            index_read_request = Message(
                role=Role.ASSISTANT,
                content="I should update the index now.",
                tool_calls=[
                    ToolCall(
                        id="read-index",
                        name="read",
                        arguments={"file_path": str(index_path)},
                    )
                ],
            )
            history = [
                glob_observation,
                chapter_read_request,
                chapter_read_result,
                index_read_request,
            ]

            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Create the on-disk fixture after the context exists (same order as
            # before); 04-variables.html is deliberately left missing.
            chapters_dir.mkdir()
            for fixture_path, fixture_body in (
                (index_path, "<ul></ul>\n"),
                (intro_path, "<h1>Intro</h1>\n"),
                (setup_path, "<h1>Setup</h1>\n"),
                (basics_path, "<h1>Basics</h1>\n"),
            ):
                fixture_path.write_text(fixture_body)

            plan_path = temp_dir / "implementation.md"
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(
                f"- `{planned}`"
                for planned in (
                    index_path,
                    intro_path,
                    setup_path,
                    basics_path,
                    variables_path,
                )
            )
            plan_path.write_text("\n".join(plan_lines))

            context.session.current_task = (
                f"Update {index_path} with the right chapter links."
            )

            # Capture whatever the runner queues through either steering channel.
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            reread_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            skipped_text = (
                "[Skipped - duplicate action: Already read "
                f"{index_path} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skipped_result = Message.tool_result_message(
                tool_call_id=reread_call.id,
                display_content=skipped_text,
                result_content=skipped_text,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=reread_call,
                state=ToolExecutionState.DUPLICATE,
                message=skipped_result,
                event_content=skipped_text,
                is_error=False,
                result_output=skipped_text,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Fix the chapter links")
            dod.implementation_plan = str(plan_path)
            dod.pending_items.append("Create the remaining chapter files")

            await runner.execute_batch(
                tool_calls=[reread_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_nudges) == 1
            nudge = persistent_nudges[0]
            assert "Reuse the earlier observation instead of repeating it." in nudge
            assert "A declared output artifact is still missing." in nudge
            assert "Resume by creating `04-variables.html` now." in nudge
            write_hint = (
                "Prefer one `write` call for "
                f"`{display_runtime_path(variables_path)}` instead of more rereads."
            )
            assert write_hint in nudge
            assert ephemeral_nudges == []
      
        702
        
        703
        
        704
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html_output_active(
            temp_dir: Path,
        ) -> None:
            """Check the nudge for a duplicate read when the index links a missing chapter.

            ``guide/index.html`` links to ``chapters/02-installation.html``, which is
            not created; only ``01-introduction.html`` exists.  The implementation
            plan lists the index file and the chapters directory.  After a
            ``DUPLICATE`` read outcome for the index, exactly one persistent steering
            message is expected: it must mention the pending item and the missing
            chapter, and must not claim that all planned artifacts already exist.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if the runner ever tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # On-disk fixture: an index with two links but only the first chapter file.
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_path = guide_dir / "index.html"
            first_chapter = chapters_dir / "01-introduction.html"
            index_path.write_text(
                '<a href="chapters/01-introduction.html">Intro</a>\n'
                '<a href="chapters/02-installation.html">Install</a>\n'
            )
            first_chapter.write_text("<h1>Intro</h1>\n")

            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapters_dir}/` (directory for chapter files)",
            ]
            plan_path.write_text("\n".join(plan_lines))

            # Prior conversation: the assistant already requested a read of the index.
            earlier_index_read = Message(
                role=Role.ASSISTANT,
                content="I should keep building the guide.",
                tool_calls=[
                    ToolCall(
                        id="read-index",
                        name="read",
                        arguments={"file_path": str(index_path)},
                    )
                ],
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[earlier_index_read],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = f"Build the guide rooted at {index_path}."

            # Capture whatever the runner queues through either steering channel.
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            reread_call = ToolCall(
                id="read-dup-rooted",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            skipped_text = (
                "[Skipped - duplicate action: Already read "
                f"{index_path} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            skipped_result = Message.tool_result_message(
                tool_call_id=reread_call.id,
                display_content=skipped_text,
                result_content=skipped_text,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=reread_call,
                state=ToolExecutionState.DUPLICATE,
                message=skipped_result,
                event_content=skipped_text,
                is_error=False,
                result_output=skipped_text,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create a multi-file HTML guide with chapters.")
            dod.implementation_plan = str(plan_path)
            dod.touched_files = [str(index_path), str(first_chapter)]
            dod.completed_items = ["Create chapter files with appropriate content"]
            dod.pending_items.append("Create the remaining chapter files")

            await runner.execute_batch(
                tool_calls=[reread_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_nudges) == 1
            nudge = persistent_nudges[0]
            assert "Create the remaining chapter files" in nudge
            assert "Resume by creating `02-installation.html` now." in nudge
            assert "All explicitly planned artifacts already exist on disk." not in nudge
            assert ephemeral_nudges == []
      
        827
        
        828
        
        829
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
            temp_dir: Path,
        ) -> None:
            """Check that a stale ``TodoWrite`` does not move a finished todo back to pending.

            The batch contains a successful ``write`` of ``03-first-website.html``
            followed by a ``TodoWrite`` whose payload still lists that chapter as
            pending.  Afterwards the chapter-3 todo must be in ``completed_items``
            and absent from ``pending_items``, while the chapter-4 todo stays pending.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if the runner ever tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            def pending_todos() -> list[dict[str, str]]:
                # Build a fresh list on every call so the DoD sync, the tool-call
                # arguments and the outcome metadata never share objects.
                return [
                    {
                        "content": "Create 03-first-website.html",
                        "active_form": "Creating 03-first-website.html",
                        "status": "pending",
                    },
                    {
                        "content": "Create 04-configuration-basics.html",
                        "active_form": "Creating 04-configuration-basics.html",
                        "status": "pending",
                    },
                ]

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            # Seed the definition of done with both chapters still pending.
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            sync_todos_to_definition_of_done(dod, pending_todos())

            # Only the parent directory is created here; the write itself is faked
            # through the executor outcome below.
            chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
            chapter_path.parent.mkdir(parents=True)

            write_call = ToolCall(
                id="write-ch3",
                name="write",
                arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
            )
            stale_todo_call = ToolCall(
                id="todo-stale",
                name="TodoWrite",
                arguments={"todos": pending_todos()},
            )

            write_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_path}",
                is_error=False,
            )
            stale_todo_outcome = tool_outcome(
                tool_call=stale_todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": pending_todos()},
            )
            executor = FakeExecutor([write_outcome, stale_todo_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[write_call, stale_todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert "Create 03-first-website.html" in dod.completed_items
            assert "Create 03-first-website.html" not in dod.pending_items
            assert "Create 04-configuration-basics.html" in dod.pending_items
      
        946
        
        947
        
        948
        @pytest.mark.asyncio
        async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
            temp_dir: Path,
        ) -> None:
            """Check what a successful ``glob`` over the chapters directory leaves behind.

            Two chapter files and an ``index.html`` exist on disk, and the fake
            executor returns the two chapter paths as the glob output.  The test
            asserts that no persistent or ephemeral steering message is queued, that
            exactly one tool result message is recorded on the summary, and that its
            content does not contain ``"Verified chapter inventory:"``.

            NOTE(review): the test name says "proactively_queues", yet the assertions
            require that nothing is queued -- the name looks stale; confirm.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Raises if the runner ever tries to verify the action.
                raise AssertionError("Verification should not run for this scenario")

            # On-disk fixture: two chapters plus an index with an empty list.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            intro_path = chapters_dir / "01-introduction.html"
            setup_path = chapters_dir / "02-setup.html"
            index_path = temp_dir / "index.html"
            intro_path.write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            setup_path.write_text(
                "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
            )
            index_path.write_text("<ul></ul>\n")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            context.session.current_task = (
                f"Update {index_path} so the chapter links match the sibling files."
            )

            # Capture whatever the runner queues through either steering channel.
            persistent_nudges: list[str] = []
            ephemeral_nudges: list[str] = []
            context.queue_steering_message_callback = persistent_nudges.append
            context.queue_ephemeral_steering_message_callback = ephemeral_nudges.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"path": str(chapters_dir), "pattern": "*.html"},
            )
            glob_listing = "\n".join(map(str, (intro_path, setup_path)))
            glob_outcome = tool_outcome(
                tool_call=glob_call,
                output=glob_listing,
                is_error=False,
            )
            executor = FakeExecutor([glob_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done("Fix the chapter links"),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_nudges == []
            assert ephemeral_nudges == []
            recorded_results = summary.tool_result_messages
            assert len(recorded_results) == 1
            assert "Verified chapter inventory:" not in recorded_results[0].content
      
        1032
        
        1033
        
        1034
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
            temp_dir: Path,
        ) -> None:
            """A successful edit of an already-correct index.html queues no steering.

            The index file is written with the corrected chapter list before the
            faked ``edit`` outcome is processed. After the batch runs, no tool
            result may contain "Semantic verification preview:" and neither the
            persistent nor the ephemeral steering queue may receive a message.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # The same task text drives both the session and the definition of done.
            task = (
                "Update index.html so every chapter link and title matches the real HTML files in chapters/."
            )

            # Real chapter files that the index is expected to line up with.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            chapter_sources = {
                "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
                "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
            }
            for filename, markup in chapter_sources.items():
                (chapters_dir / filename).write_text(markup)

            stale_list = (
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
                '    <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
                "</ul>\n"
            )
            current_list = (
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                "</ul>\n"
            )
            # The file on disk already holds the post-edit content.
            index_file = temp_dir / "index.html"
            index_file.write_text(current_list)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = task

            # Capture whatever the runner queues as steering.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(index_file),
                    "old_string": stale_list,
                    "new_string": current_list,
                },
            )
            edit_outcome = tool_outcome(
                tool_call=edit_call,
                output=f"Successfully edited {index_file}",
                is_error=False,
            )
            executor = FakeExecutor([edit_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done(task),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert not any(
                "Semantic verification preview:" in recorded.content
                for recorded in summary.tool_result_messages
            )
            assert queued_persistent == []
            assert queued_ephemeral == []
      
        1134
        
        1135
        
        1136
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
            temp_dir: Path,
        ) -> None:
            """Reading a reference index.html queues no steering and adds no preview.

            The task prompt asks for a new nginx guide modelled on an existing
            one, and the only tool call is a ``read`` of an index.html containing
            a table of contents. After the batch runs, both steering queues must
            be empty and no tool result may contain
            "Semantic verification preview:".
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # Chapter files referenced by the table of contents below.
            chapters_dir = temp_dir / "chapters"
            chapters_dir.mkdir()
            chapter_sources = {
                "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
                "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
            }
            for filename, markup in chapter_sources.items():
                (chapters_dir / filename).write_text(markup)

            index_file = temp_dir / "index.html"
            index_file.write_text(
                "<h2>Table of Contents</h2>\n"
                '<ul class="chapter-list">\n'
                '    <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
                '    <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
                "</ul>\n"
            )

            prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = prompt  # type: ignore[attr-defined]

            # Capture whatever the runner queues as steering.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            read_call = ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_file)},
            )
            # The faked read returns exactly what is on disk.
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=index_file.read_text(),
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=create_definition_of_done(prompt),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_persistent == []
            assert queued_ephemeral == []
            assert not any(
                "Semantic verification preview:" in recorded.content
                for recorded in summary.tool_result_messages
            )
      
        1228
        
        1229
        
        1230
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
            temp_dir: Path,
        ) -> None:
            """A successful reference read completes the discovery todo and steers onward.

            Three todos start out pending and the implementation plan lists the
            nginx ``chapters/`` directory before ``index.html``. After a ``read``
            of the Fortran reference chapter, the first todo must appear in
            ``dod.completed_items``, a persistent steering message must point at
            the directory-structure todo and say to create ``chapters/``, no
            persistent message may mention ``01-introduction.html``, and the
            ephemeral queue must stay empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference chapter that the tool call reads.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            # Planned outputs; neither path is created by this test.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture whatever the runner queues as steering.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_file)
            todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Working on: Create the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert (
                "Examine the existing Fortran guide structure to understand the cadence and format"
                in dod.completed_items
            )
            assert any(
                "Continue with the next pending item: `Create the nginx directory structure`"
                in steering
                for steering in queued_persistent
            )
            assert any(
                "Resume by creating `chapters/` now." in steering
                for steering in queued_persistent
            )
            assert not any("01-introduction.html" in steering for steering in queued_persistent)
            assert queued_ephemeral == []
      
        1348
        
        1349
        
        1350
        @pytest.mark.asyncio
        async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
            temp_dir: Path,
        ) -> None:
            """Directory creation is steered first even when the plan lists index.html first.

            The implementation plan lists the nginx ``index.html`` before the
            ``chapters/`` directory. After a successful reference ``read``, a
            persistent steering message must still point at the
            directory-structure todo and say to create ``chapters/``, no
            persistent message may say "Next step: create `index.html`.", and the
            ephemeral queue must stay empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference chapter that the tool call reads.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text(reference_html)

            # Planned outputs, with the file deliberately listed ahead of its directory.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root / 'index.html'}`",
                f"- `{chapters_dir}/`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )

            # Capture whatever the runner queues as steering.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(plan_file)
            todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Working on: Create the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos, project_root=temp_dir)

            read_call = ToolCall(
                id="read-reference-index-first",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            executor = FakeExecutor([read_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_persistent
            assert any(
                "Continue with the next pending item: `Create the nginx directory structure`"
                in steering
                for steering in queued_persistent
            )
            assert any(
                "Resume by creating `chapters/` now." in steering
                for steering in queued_persistent
            )
            assert not any(
                "Next step: create `index.html`." in steering
                for steering in queued_persistent
            )
            assert queued_ephemeral == []
      
        1469
        
        1470
        
        1471
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
            temp_dir: Path,
        ) -> None:
            """A duplicate reference read yields one steering message naming the next todo.

            The history already contains a ``read`` observation, the discovery
            todo is marked completed, and the executor reports the new ``read``
            as ``ToolExecutionState.DUPLICATE``. Exactly one persistent steering
            message must be queued. It must tell the model to reuse the earlier
            observation and to continue with the directory-structure todo, and
            must not contain "Update `". The ephemeral queue must stay empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            reference_file = temp_dir / "fortran" / "index.html"
            reference_file.parent.mkdir(parents=True)
            reference_file.write_text("<h1>Fortran Beginner's Guide</h1>\n")

            # Prior turn: the same file content was already observed via a read.
            history = [
                Message(
                    role=Role.TOOL,
                    content=(
                        "Observation [read]: Result: "
                        "<h1>Fortran Beginner's Guide</h1>\n"
                    ),
                )
            ]
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            prompt = (
                "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
                "for the structure and cadence of the guide. We are going to make an all "
                "new equally thorough guide on how to use the nginx tool."
            )
            runtime_context.session.current_task = prompt

            # Capture whatever the runner queues as steering.
            queued_persistent: list[str] = []
            queued_ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = queued_persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done(prompt)
            todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "completed",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Working on: Create the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todos)

            read_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
            skip_notice = (
                "[Skipped - duplicate action: Already read "
                f"{reference_file} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            # Build the duplicate outcome by hand rather than through tool_outcome().
            skip_result = Message.tool_result_message(
                tool_call_id=read_call.id,
                display_content=skip_notice,
                result_content=skip_notice,
            )
            duplicate_outcome = ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=skip_result,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
            executor = FakeExecutor([duplicate_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(queued_persistent) == 1
            steering = queued_persistent[0]
            assert "Reuse the earlier observation instead of repeating it." in steering
            assert (
                "Continue with the next pending item: `Create the nginx directory structure`"
                in steering
            )
            assert "Update `" not in steering
            assert queued_ephemeral == []
      
        1593
        
        1594
        
        1595
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check steering after an executed read of a reference chapter.

            The implementation plan lists ``index.html`` and ``02-installation.html``
            for the nginx guide, but only ``01-introduction.html`` is written to disk.
            After the ``read`` of the Fortran reference chapter is reported as
            executed, the persistent steering messages must confirm the "Examine ..."
            todo and say to resume by creating ``index.html``; none of them may say to
            continue with the "Create each chapter file ..." todo, and no ephemeral
            steering message may be queued.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # nginx guide under construction: the first chapter exists, index.html does not.
            guides_dir = temp_dir / "Loader" / "guides"
            nginx_root = guides_dir / "nginx"
            nginx_chapters = nginx_root / "chapters"
            nginx_chapters.mkdir(parents=True)
            first_chapter = nginx_chapters / "01-introduction.html"
            first_chapter.write_text("<html></html>\n")
            missing_index = nginx_root / "index.html"

            # Reference chapter that the tool call reads.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = guides_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True, exist_ok=True)
            reference.write_text(reference_html)

            # Plan file: directories get a trailing slash, files do not.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{directory}/`" for directory in (nginx_root, nginx_chapters))
            plan_lines.extend(
                f"- `{planned_file}`"
                for planned_file in (
                    missing_index,
                    first_chapter,
                    nginx_chapters / "02-installation.html",
                )
            )
            plan_lines.append("")
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.touched_files.append(str(first_chapter))
            examine_todo = (
                "Examine the existing Fortran guide structure to understand the format and cadence"
            )
            create_todo = "Create each chapter file with appropriate content"
            todo_contents = (
                examine_todo,
                create_todo,
                "Ensure all files follow the same structure and style as the Fortran guide",
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": content,
                        "active_form": f"Working on: {content}",
                        "status": "pending",
                    }
                    for content in todo_contents
                ],
            )

            tool_call = ToolCall(
                id="read-reference-chapter",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_output = f"Observation [read]: Result: {reference_html}"
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=read_output,
                result_content=read_output,
            )
            outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.EXECUTED,
                message=result_message,
                event_content=read_output,
                is_error=False,
                result_output=read_output,
            )
            executor = FakeExecutor([outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages
            confirmed_progress = f"Confirmed progress: `{examine_todo}`"
            assert any(confirmed_progress in message for message in persistent_messages)
            assert any(
                "Resume by creating `index.html` now." in message
                for message in persistent_messages
            )
            next_pending = f"Continue with the next pending item: `{create_todo}`"
            assert all(next_pending not in message for message in persistent_messages)
            assert ephemeral_messages == []
      
        1729
        
        1730
        
        1731
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
            temp_dir: Path,
        ) -> None:
            """Check steering for a duplicate read once every planned file exists.

            All files named in the implementation plan are written to disk, while the
            pending items still contain ``Create 07-performance-tuning.html``, which
            the plan does not list. After a ``read`` reported as a duplicate, exactly
            one persistent steering message is expected: it must mention the
            "Verify all guide files ..." item and must not mention the chapter-07
            item. No ephemeral steering message may be queued.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Every artifact the plan names is created up front.
            nginx_root = temp_dir / "guides" / "nginx"
            nginx_chapters = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            nginx_chapters.mkdir()
            index_file = nginx_root / "index.html"
            first_chapter = nginx_chapters / "01-getting-started.html"
            second_chapter = nginx_chapters / "02-installation.html"
            index_file.write_text("<html></html>\n")
            first_chapter.write_text("<h1>One</h1>\n")
            second_chapter.write_text("<h1>Two</h1>\n")

            # Plan file: directories get a trailing slash, files do not.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{directory}/`" for directory in (nginx_root, nginx_chapters))
            plan_lines.extend(
                f"- `{planned_file}`"
                for planned_file in (index_file, first_chapter, second_chapter)
            )
            plan_lines.append("")
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            unplanned_item = "Create 07-performance-tuning.html"
            verify_item = "Verify all guide files are linked and complete"
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.pending_items = [unplanned_item, verify_item, "Complete the requested work"]

            tool_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            duplicate_message = (
                "[Skipped - duplicate action: Already read "
                f"{first_chapter} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=duplicate_message,
                result_content=duplicate_message,
            )
            outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=result_message,
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            steering = persistent_messages[0]
            assert verify_item in steering
            assert unplanned_item not in steering
            assert ephemeral_messages == []
      
        1845
        
        1846
        
        1847
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """Check the handoff message for a duplicate read after the plan is built.

            All files named in the implementation plan are written to disk and the
            definition of done carries one verification command. The pending items
            hold only ``Create 07-performance-tuning.html`` (not in the plan) and
            "Complete the requested work". After a ``read`` reported as a duplicate,
            exactly one persistent steering message is expected: it must state that
            all explicitly planned artifacts exist and direct the next step to
            verification or final confirmation, without mentioning the chapter-07
            item. No ephemeral steering message may be queued.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Every artifact the plan names is created up front.
            nginx_root = temp_dir / "guides" / "nginx"
            nginx_chapters = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            nginx_chapters.mkdir()
            index_file = nginx_root / "index.html"
            first_chapter = nginx_chapters / "01-getting-started.html"
            second_chapter = nginx_chapters / "02-installation.html"
            index_file.write_text("<html></html>\n")
            first_chapter.write_text("<h1>One</h1>\n")
            second_chapter.write_text("<h1>Two</h1>\n")

            # Plan file: directories get a trailing slash, files do not.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{directory}/`" for directory in (nginx_root, nginx_chapters))
            plan_lines.extend(
                f"- `{planned_file}`"
                for planned_file in (index_file, first_chapter, second_chapter)
            )
            plan_lines.append("")
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            unplanned_item = "Create 07-performance-tuning.html"
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {nginx_root}"]
            dod.pending_items = [unplanned_item, "Complete the requested work"]

            tool_call = ToolCall(
                id="read-dup",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            duplicate_message = (
                "[Skipped - duplicate action: Already read "
                f"{first_chapter} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=duplicate_message,
                result_content=duplicate_message,
            )
            outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=result_message,
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            steering = persistent_messages[0]
            assert "All explicitly planned artifacts already exist on disk." in steering
            assert (
                "Move to verification or final confirmation using the files already on disk."
                in steering
            )
            assert unplanned_item not in steering
            assert ephemeral_messages == []
      
        1965
        
        1966
        
        1967
        @pytest.mark.asyncio
        async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
            temp_dir: Path,
        ) -> None:
            """Check that creation todos for files already on disk are not surfaced.

            All files named in the implementation plan are written to disk and the
            definition of done carries one verification command, yet the pending
            items still say ``Create 01-getting-started.html`` and
            ``Creating 02-installation.html``. After a ``read`` reported as a
            duplicate, exactly one persistent steering message is expected: it must
            state that all explicitly planned artifacts exist and direct the next
            step to verification or final confirmation, without repeating either
            creation item. No ephemeral steering message may be queued.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Every artifact the plan names is created up front.
            nginx_root = temp_dir / "guides" / "nginx"
            nginx_chapters = nginx_root / "chapters"
            nginx_root.mkdir(parents=True)
            nginx_chapters.mkdir()
            index_file = nginx_root / "index.html"
            first_chapter = nginx_chapters / "01-getting-started.html"
            second_chapter = nginx_chapters / "02-installation.html"
            index_file.write_text("<html></html>\n")
            first_chapter.write_text("<h1>One</h1>\n")
            second_chapter.write_text("<h1>Two</h1>\n")

            # Plan file: directories get a trailing slash, files do not.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{directory}/`" for directory in (nginx_root, nginx_chapters))
            plan_lines.extend(
                f"- `{planned_file}`"
                for planned_file in (index_file, first_chapter, second_chapter)
            )
            plan_lines.append("")
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_messages.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # Pending items that refer to files which already exist on disk.
            stale_items = ["Create 01-getting-started.html", "Creating 02-installation.html"]
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            dod.verification_commands = [f"ls -la {nginx_root}"]
            dod.pending_items = [*stale_items, "Complete the requested work"]

            tool_call = ToolCall(
                id="read-dup-built-stale",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
            duplicate_message = (
                "[Skipped - duplicate action: Already read "
                f"{first_chapter} recently without any intervening changes; "
                "reuse the earlier read result instead of rereading]"
            )
            result_message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=duplicate_message,
                result_content=duplicate_message,
            )
            outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=result_message,
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
            executor = FakeExecutor([outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent_messages) == 1
            steering = persistent_messages[0]
            assert "All explicitly planned artifacts already exist on disk." in steering
            assert (
                "Move to verification or final confirmation using the files already on disk."
                in steering
            )
            for stale_item in stale_items:
                assert stale_item not in steering
            assert ephemeral_messages == []
      
        2087
        
        2088
        
        2089
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_read_after_plan_complete_pushes_review_handoff(
            temp_dir: Path,
        ) -> None:
            """A read after every planned artifact exists queues one ephemeral review handoff.

            Setup: the guide root, the chapters directory and all three planned HTML
            files are created on disk, and the definition of done carries two pending
            todos (a file-creation item and a consistency-review item).

            Expectation: a successful ``read`` of the first chapter queues no
            persistent steering message and exactly one ephemeral one, which names
            the review todo, omits the creation todo, and points to verification.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # Materialise the finished guide so every planned path already exists.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for page, markup in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                page.write_text(markup)

            # The implementation plan lists both directories and the three files.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so they can be inspected separately.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            pending_todos = [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "pending",
                },
                {
                    "content": "Ensure all files are properly linked and formatted consistently",
                    "active_form": "Reviewing guide consistency and linkage",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            read_call = ToolCall(
                id="read-built-review",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=chapter_one.read_text(),
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent == []
            assert len(ephemeral) == 1
            handoff = ephemeral[0]
            assert "All explicitly planned artifacts already exist." in handoff
            assert "Ensure all files are properly linked and formatted consistently" in handoff
            # The creation todo must not resurface as the next step.
            assert "Create 01-getting-started.html" not in handoff
            assert "do not keep broad-rereading the output set" in handoff
            assert "If no specific mismatch remains, move to verification now." in handoff
      
        2200
        
        2201
        
        2202
        @pytest.mark.asyncio
        async def test_tool_batch_runner_successful_read_after_plan_complete_switches_to_verify(
            temp_dir: Path,
        ) -> None:
            """A read after the plan is complete, with no todos, steers to verification.

            Setup: all planned directories and files exist on disk and the definition
            of done has a verification command but no synced todo items.

            Expectation: a successful ``read`` queues exactly one persistent steering
            message announcing verification, queues no ephemeral message, and leaves
            ``context.workflow_mode`` equal to ``"verify"``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # Materialise the finished guide so every planned path already exists.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for page, markup in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                page.write_text(markup)

            # The implementation plan lists both directories and the three files.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so they can be inspected separately.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # No todos are synced here; only the plan and a verification command are set.
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]

            read_call = ToolCall(
                id="read-built-verify",
                name="read",
                arguments={"file_path": str(chapter_one)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=chapter_one.read_text(),
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert len(persistent) == 1
            nudge = persistent[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "Verification should run next." in nudge
            assert "stop broad rereads" in nudge
            assert ephemeral == []
            assert runtime_context.workflow_mode == "verify"
      
        2296
        
        2297
        
        2298
        @pytest.mark.asyncio
        async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
            temp_dir: Path,
        ) -> None:
            """After a reference read, persistent steering names the next creation todo.

            Setup: two pending todos (an "examine the existing guide" item followed by
            a "create index.html" item) and a successful ``read`` of a reference
            chapter.

            Expectation: some persistent message points at the creation todo, some
            persistent message says to stop gathering reference material, and no
            ephemeral message is queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference material that the tool call reads.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so they can be inspected separately.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            pending_todos = [
                {
                    "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                    "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx index.html file",
                    "active_form": "Working on: Create the nginx index.html file",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Each fragment may appear in any of the queued persistent messages.
            assert any(
                "Continue with the next pending item: `Create the nginx index.html file`" in queued
                for queued in persistent
            )
            assert any(
                "stop gathering more reference material and perform the change now" in queued
                for queued in persistent
            )
            assert ephemeral == []
      
        2390
        
        2391
        
        2392
        @pytest.mark.asyncio
        async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
            temp_dir: Path,
        ) -> None:
            """After a reference read, the handoff to the next todo uses the persistent channel.

            Setup: two pending todos (an "examine the existing guide" item followed by
            a "create the directory structure" item) and a successful ``read`` of a
            reference chapter.

            Expectation: at least one persistent message is queued, one of them names
            the directory-structure todo as the next pending item, and no ephemeral
            message is queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # Existing reference material that the tool call reads.
            reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text(reference_html)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so they can be inspected separately.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            pending_todos = [
                {
                    "content": "First, examine the existing fortran guide structure and content",
                    "active_form": "Working on: First, examine the existing fortran guide structure and content",
                    "status": "pending",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Working on: Create the nginx directory structure",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            read_call = ToolCall(
                id="read-reference",
                name="read",
                arguments={"file_path": str(reference)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output=reference_html,
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent
            assert any(
                "Continue with the next pending item: `Create the nginx directory structure`" in queued
                for queued in persistent
            )
            assert ephemeral == []
      
        2481
        
        2482
        
        2483
        @pytest.mark.asyncio
        async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
            temp_dir: Path,
        ) -> None:
            """After a setup ``mkdir``, the persistent nudge names the next file to create.

            Setup: the plan lists a chapters directory and ``index.html``, and the test
            itself creates neither on disk. Two todos are pending (directory structure,
            then the main index file). The batch holds one successful
            ``bash mkdir -p`` call, served by the fake executor.

            Expectation: the last persistent message says directory setup is complete,
            names ``index.html`` as the next step and asks for an initial version of
            it, and no ephemeral message is queued.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: fails the test if the runner asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: fails the test if the runner asks for action verification.
                raise AssertionError("Verification should not run for this scenario")

            # Planned paths only; nothing under nginx_root is created by this test.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = nginx_root / "chapters"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            # Capture both steering channels so they can be inspected separately.
            persistent: list[str] = []
            ephemeral: list[str] = []
            runtime_context.queue_steering_message_callback = persistent.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            pending_todos = [
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Creating the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Develop the main index.html file with proper structure",
                    "active_form": "Developing the main index.html file with proper structure",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            mkdir_call = ToolCall(
                id="mkdir-nginx",
                name="bash",
                arguments={"command": f"mkdir -p {chapters}"},
            )
            mkdir_outcome = tool_outcome(
                tool_call=mkdir_call,
                output="",
                is_error=False,
            )
            fake_executor = FakeExecutor([mkdir_outcome])

            turn_summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=dod,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent
            # Only the most recent persistent message is inspected.
            latest = persistent[-1]
            assert "Directory setup is complete." in latest
            assert "Next step: create `index.html`." in latest
            assert "Write a compact but real initial version of that file now" in latest
            assert ephemeral == []
      
        2585
        
        2586
        
        2587
        @pytest.mark.asyncio
      
        2588
        async def test_tool_batch_runner_first_chapter_handoff_stays_persistent_until_substantive_output_exists(
      
        2589
            temp_dir: Path,
      
        2590
        ) -> None:
      
        2591
            async def assess_confidence(
      
        2592
                tool_name: str,
      
        2593
                tool_args: dict,
      
        2594
                context: str,
      
        2595
            ) -> ConfidenceAssessment:
      
        2596
                raise AssertionError("Confidence scoring should be disabled in this scenario")
      
        2597
        
        2598
            async def verify_action(
      
        2599
                tool_name: str,
      
        2600
                tool_args: dict,
      
        2601
                result: str,
      
        2602
                expected: str = "",
      
        2603
            ) -> ActionVerification:
      
        2604
                raise AssertionError("Verification should not run for this scenario")
      
        2605
        
        2606
            nginx_root = temp_dir / "guides" / "nginx"
      
        2607
            chapters = nginx_root / "chapters"
      
        2608
            chapters.mkdir(parents=True)
      
        2609
            index_path = nginx_root / "index.html"
      
        2610
        
        2611
            implementation_plan = temp_dir / "implementation.md"
      
        2612
            implementation_plan.write_text(
      
        2613
                "\n".join(
      
        2614
                    [
      
        2615
                        "# Implementation Plan",
      
        2616
                        "",
      
        2617
                        "## File Changes",
      
        2618
                        f"- `{chapters}/`",
      
        2619
                        f"- `{index_path}`",
      
        2620
                        f"- `{chapters / '01-introduction.html'}`",
      
        2621
                        "",
      
        2622
                    ]
      
        2623
                )
      
        2624
            )
      
        2625
        
        2626
            context = build_context(
      
        2627
                temp_dir=temp_dir,
      
        2628
                messages=[],
      
        2629
                safeguards=FakeSafeguards(),
      
        2630
                assess_confidence=assess_confidence,
      
        2631
                verify_action=verify_action,
      
        2632
                auto_recover=False,
      
        2633
            )
      
        2634
            persistent_messages: list[str] = []
      
        2635
            ephemeral_messages: list[str] = []
      
        2636
            context.queue_steering_message_callback = persistent_messages.append
      
        2637
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        2638
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        2639
            dod = create_definition_of_done("Create a multi-file nginx guide.")
      
        2640
            dod.implementation_plan = str(implementation_plan)
      
        2641
            sync_todos_to_definition_of_done(
      
        2642
                dod,
      
        2643
                [
      
        2644
                    {
      
        2645
                        "content": "Create the main index.html file with proper structure",
      
        2646
                        "active_form": "Creating the main index.html file with proper structure",
      
        2647
                        "status": "pending",
      
        2648
                    },
      
        2649
                    {
      
        2650
                        "content": "Create each chapter file with appropriate content",
      
        2651
                        "active_form": "Creating each chapter file with appropriate content",
      
        2652
                        "status": "pending",
      
        2653
                    },
      
        2654
                ],
      
        2655
            )
      
        2656
        
        2657
            tool_call = ToolCall(
      
        2658
                id="write-index",
      
        2659
                name="write",
      
        2660
                arguments={
      
        2661
                    "file_path": str(index_path),
      
        2662
                    "content": "<html></html>\n",
      
        2663
                },
      
        2664
            )
      
        2665
            executor = FakeExecutor(
      
        2666
                [
      
        2667
                    tool_outcome(
      
        2668
                        tool_call=tool_call,
      
        2669
                        output=f"Successfully wrote 14 bytes to {index_path}",
      
        2670
                        is_error=False,
      
        2671
                    )
      
        2672
                ]
      
        2673
            )
      
        2674
        
        2675
            summary = TurnSummary(final_response="")
      
        2676
            await runner.execute_batch(
      
        2677
                tool_calls=[tool_call],
      
        2678
                tool_source="assistant",
      
        2679
                pending_tool_calls_seen=set(),
      
        2680
                emit=_noop_emit,
      
        2681
                summary=summary,
      
        2682
                dod=dod,
      
        2683
                executor=executor,  # type: ignore[arg-type]
      
        2684
                on_confirmation=None,
      
        2685
                on_user_question=None,
      
        2686
                emit_confirmation=None,
      
        2687
                consecutive_errors=0,
      
        2688
            )
      
        2689
        
        2690
            assert persistent_messages
      
        2691
            assert ephemeral_messages == []
      
        2692
            message = persistent_messages[-1]
      
        2693
            assert "Confirmed progress:" in message
      
        2694
            assert "Next step: create `01-introduction.html`." in message
      
        2695
            assert (
      
        2696
                f"Prefer one `write(file_path=..., content=...)` call for `{(chapters / '01-introduction.html').resolve(strict=False)}` now."
      
        2697
                in message
      
        2698
            )
      
        2699
            assert "Write a compact but real initial version of that file now" not in message
      
        2700
            assert "Do not reread reference material or spend the next turn on bookkeeping." in message
      
        2701
        
        2702
        
        2703
        @pytest.mark.asyncio
        async def test_tool_batch_runner_directory_handoff_uses_home_relative_path(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """Check the handoff queued after a ``mkdir -p`` bash call.

            ``HOME`` is pointed at ``temp_dir`` and the planned guide lives beneath
            it, so the queued steering message is expected to name the next file as
            ``~/Loader/guides/nginx/index.html``.
            """
            monkeypatch.setenv("HOME", str(temp_dir.resolve(strict=False)))

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is invoked."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                """Fail the test if action verification is invoked."""
                raise AssertionError("Verification should not run for this scenario")

            guide_dir = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            index_file = guide_dir / "index.html"

            # The plan lists the chapters directory and the index file only.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            pending_todos = [
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Creating the nginx directory structure",
                    "status": "pending",
                },
                {
                    "content": "Develop the main index.html file with proper structure",
                    "active_form": "Developing the main index.html file with proper structure",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(done_spec, pending_todos)

            mkdir_call = ToolCall(
                id="mkdir-nginx-home",
                name="bash",
                arguments={"command": f"mkdir -p {chapters_dir}"},
            )
            mkdir_outcome = tool_outcome(
                tool_call=mkdir_call,
                output="",
                is_error=False,
            )
            fake_executor = FakeExecutor([mkdir_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            handoff = steering_log[-1]
            assert "Next step: create `index.html`." in handoff
            assert "`~/Loader/guides/nginx/index.html`" in handoff
            assert "Write a compact but real initial version of that file now" in handoff
      
        2807
        
        2808
        
        2809
        @pytest.mark.asyncio
        async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check the steering queued when the index file is read back.

            The index already exists on disk and is recorded in the definition of
            done's touched files; the plan additionally lists a first chapter that
            has not been created. The persistent steering message is expected to
            point at ``01-introduction.html`` and no ephemeral message is expected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is invoked."""
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                """Fail the test if action verification is invoked."""
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"

            # The index links to two chapters, neither of which exists on disk.
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_lines) + "\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_dir}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{chapters_dir / '01-introduction.html'}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            persistent_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            done_spec.touched_files.append(str(index_file))
            done_spec.completed_items.append("Develop the main index.html file for the nginx guide")
            done_spec.pending_items.append("Create chapter files for the nginx guide")

            read_call = ToolCall(
                id="read-index-self-audit",
                name="read",
                arguments={"file_path": str(index_file)},
            )
            read_outcome = tool_outcome(
                tool_call=read_call,
                output="1\t<html>\n",
                is_error=False,
            )
            fake_executor = FakeExecutor([read_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[read_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_log
            redirect = persistent_log[-1]
            assert "You already have the current contents of `index.html` from the successful write." in redirect
            assert "Resume by creating `01-introduction.html` now." in redirect
            assert "Do not spend another turn rereading the file you just wrote or on TodoWrite alone." in redirect
            assert ephemeral_log == []
      
        2915
        
        2916
        
        2917
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preserves_first_file_handoff_after_recovery_prompt(
            temp_dir: Path,
        ) -> None:
            """Check the handoff after writing the index when history holds a recovery prompt.

            The context is seeded with a single user message beginning with
            ``[EMPTY ASSISTANT RESPONSE]``. After a successful write of the index
            file, the persistent steering message is expected to name
            ``01-introduction.html`` as the next step, without the "compact but real
            initial version" wording, and no ephemeral message is expected.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is invoked."""
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                """Fail the test if action verification is invoked."""
                raise AssertionError("Verification should not run for this scenario")

            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{chapters_dir / '01-introduction.html'}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            # Conversation history consists solely of the empty-response recovery prompt.
            recovery_prompt = Message(
                role=Role.USER,
                content=(
                    "[EMPTY ASSISTANT RESPONSE]\n"
                    "Respond with that concrete mutation tool call now. Do not return an empty response."
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[recovery_prompt],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            persistent_log: list[str] = []
            ephemeral_log: list[str] = []
            runtime_context.queue_steering_message_callback = persistent_log.append
            runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            pending_todos = [
                {
                    "content": "Create the main index.html file with proper structure",
                    "active_form": "Creating the main index.html file with proper structure",
                    "status": "pending",
                },
                {
                    "content": "Create each chapter file with appropriate content",
                    "active_form": "Creating each chapter file with appropriate content",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(done_spec, pending_todos)

            write_call = ToolCall(
                id="write-index-recovered",
                name="write",
                arguments={
                    "file_path": str(index_file),
                    "content": "<html></html>\n",
                },
            )
            write_outcome = tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_file}",
                is_error=False,
            )
            fake_executor = FakeExecutor([write_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_log
            assert ephemeral_log == []
            handoff = persistent_log[-1]
            assert "Next step: create `01-introduction.html`." in handoff
            assert "Write a compact but real initial version of that file now" not in handoff
      
        3033
        
        3034
        
        3035
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
            temp_dir: Path,
        ) -> None:
            """Check the steering queued after a ``TodoWrite`` call with an aggregate chapter todo.

            The only pending todo is the generic "Create chapter files ..." item,
            while the existing index links to ``chapters/01-introduction.html``. The
            queued message is expected to name ``01-introduction.html`` as the next
            step rather than echoing the aggregate todo text.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                """Fail the test if confidence scoring is invoked."""
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                """Fail the test if action verification is invoked."""
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guides" / "nginx"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = guide_dir / "index.html"

            # The index links to two chapters, neither of which exists on disk.
            index_lines = [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
            index_file.write_text("\n".join(index_lines) + "\n")

            # The plan names the directories and the index, but no individual chapter file.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_dir}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_log: list[str] = []
            runtime_context.queue_steering_message_callback = steering_log.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            done_spec = create_definition_of_done("Create a multi-file nginx guide.")
            done_spec.implementation_plan = str(plan_file)
            done_spec.touched_files.append(str(index_file))

            todos = [
                {
                    "content": "Develop the main index.html file with proper structure",
                    "active_form": "Developing the main index.html file with proper structure",
                    "status": "completed",
                },
                {
                    "content": "Create chapter files with content and structure",
                    "active_form": "Creating chapter files with content and structure",
                    "status": "pending",
                },
            ]
            # Hand the sync step its own copies so the tool-call payload below stays
            # a separate set of dict objects.
            sync_todos_to_definition_of_done(
                done_spec,
                [dict(entry) for entry in todos],
            )

            todo_call = ToolCall(
                id="todo-aggregate",
                name="TodoWrite",
                arguments={"todos": todos},
            )
            todo_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todos},
            )
            fake_executor = FakeExecutor([todo_outcome])

            turn_summary = TurnSummary(final_response="")
            await batch_runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=turn_summary,
                dod=done_spec,
                executor=fake_executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert steering_log
            nudge = steering_log[-1]
            assert "Todo tracking is updated." in nudge
            assert "Next step: create `01-introduction.html`." in nudge
            aggregate_wording = (
                "Continue with the next pending item: `Create chapter files with content and structure`."
            )
            assert aggregate_wording not in nudge
      
        3165
        
        3166
        
        3167
        @pytest.mark.asyncio
        async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
            temp_dir: Path,
        ) -> None:
            """A duplicate-read nudge names the missing chapter, not the review todo.

            The implementation plan lists a chapter that is absent from disk while
            the first pending todo is a linking/formatting review. The steering
            message queued for a repeated ``read`` must mention the missing file and
            must not tell the model to continue with the review item.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # On-disk layout: the index and chapter one exist, the SSL chapter does not.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = nginx_dir / "index.html"
            first_chapter = chapters_dir / "01-getting-started.html"
            ssl_chapter = chapters_dir / "06-ssl-configuration.html"
            first_chapter.write_text("<h1>One</h1>\n")
            index_file.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_file}`",
                f"- `{first_chapter}`",
                f"- `{ssl_chapter}`",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering queues so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            pending_todos = [
                {
                    "content": text,
                    "active_form": f"Working on: {text}",
                    "status": "pending",
                }
                for text in (
                    "Ensure all files are properly linked and formatted consistently",
                    "Create the final chapter (06-ssl-configuration.html)",
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            # Precondition: the helper itself agrees the missing artifact wins.
            assert tool_batches_should_prioritize_missing_artifact(
                dod=dod,
                next_pending=dod.pending_items[0],
                missing_artifact=(ssl_chapter, False),
                project_root=temp_dir,
            )

            repeated_read = ToolCall(
                id="dup-read",
                name="read",
                arguments={"file_path": str(index_file)},
            )
            runner._queue_duplicate_observation_nudge(repeated_read, dod=dod)  # type: ignore[attr-defined]

            assert persistent_messages
            nudge = persistent_messages[-1]
            for expected_fragment in (
                "06-ssl-configuration.html",
                "Do not switch into review or consistency-check mode",
            ):
                assert expected_fragment in nudge
            review_prompt = (
                "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
            )
            assert review_prompt not in nudge
      
        3261
        
        3262
        
        3263
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """Once every planned file exists, a write hands off toward verification.

            All paths listed in the implementation plan are present on disk before
            the batch runs. After a successful ``write`` the persistent steering
            queue must say the planned artifacts exist, mention the remaining
            review todo, and point toward verification.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            nginx_dir = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_dir / "chapters"
            chapters_dir.mkdir(parents=True)
            index_file = nginx_dir / "index.html"
            first_chapter = chapters_dir / "01-getting-started.html"
            second_chapter = chapters_dir / "02-installation.html"
            second_chapter_html = "<h1>Two</h1>\n"
            index_file.write_text('<a href="chapters/01-getting-started.html">One</a>\n')
            first_chapter.write_text("<h1>One</h1>\n")
            second_chapter.write_text(second_chapter_html)

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
                f"- `{first_chapter}`",
                f"- `{second_chapter}`",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering queues so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            todo_specs = (
                ("Create the guide files", "completed"),
                ("Ensure all files are properly linked and formatted consistently", "pending"),
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": text,
                        "active_form": f"Working on: {text}",
                        "status": status,
                    }
                    for text, status in todo_specs
                ],
            )

            final_write = ToolCall(
                id="write-final",
                name="write",
                arguments={
                    "file_path": str(second_chapter),
                    "content": second_chapter_html,
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=final_write,
                output=f"Successfully wrote {second_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[final_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Each fragment must appear in at least one persistent steering message.
            for expected_fragment in (
                "All explicitly planned artifacts now exist on disk.",
                "Ensure all files are properly linked and formatted consistently",
                "Move to verification once no specific mismatch remains.",
            ):
                assert any(expected_fragment in queued for queued in persistent_messages)
      
        3383
        
        3384
        
        3385
        @pytest.mark.asyncio
        async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """After writing the index, the handoff names the first missing chapter.

            Only ``index.html`` exists on disk; both planned chapter files are
            absent. The steering message queued after the write must be persistent
            (the ephemeral queue stays empty) and must direct the next step at
            ``01-getting-started.html``.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            nginx_dir = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapters_dir.mkdir()
            index_html = "<html></html>\n"
            index_file = nginx_dir / "index.html"
            index_file.write_text(index_html)
            # Planned chapters; neither is created on disk in this scenario.
            first_chapter = chapters_dir / "01-getting-started.html"
            second_chapter = chapters_dir / "02-installation.html"

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_dir}/`",
                f"- `{index_file}`",
                f"- `{first_chapter}`",
                f"- `{second_chapter}`",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering queues so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            pending_todos = [
                {
                    "content": text,
                    "active_form": f"Working on: {text}",
                    "status": "pending",
                }
                for text in (
                    "Create the main index.html file with proper structure",
                    "Create each chapter file in sequence, following the established pattern",
                    "Ensure all files are properly linked and formatted consistently",
                )
            ]
            sync_todos_to_definition_of_done(dod, pending_todos)

            index_write = ToolCall(
                id="write-index",
                name="write",
                arguments={"file_path": str(index_file), "content": index_html},
            )
            scripted_outcome = tool_outcome(
                tool_call=index_write,
                output=f"Successfully wrote {index_file}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[index_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages
            assert ephemeral_messages == []
            handoff = persistent_messages[-1]
            assert "Next step: create `01-getting-started.html`." in handoff
            for unwanted_fragment in (
                "Write a compact but real initial version of that file now",
                "refresh `TodoWrite`",
            ):
                assert unwanted_fragment not in handoff
            assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
      
        3494
        
        3495
        
        3496
        @pytest.mark.asyncio
        async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
            temp_dir: Path,
        ) -> None:
            """A partly finished seven-chapter plan must not be reported as complete.

            Chapters 01-05 and the index exist on disk while chapters 06 and 07 are
            still missing. After the write of chapter 05, the ephemeral steering
            queue must point at ``06-performance-tuning.html`` and must not contain
            the "all planned artifacts exist" message.
            """

            async def _confidence_must_not_run(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def _verification_must_not_run(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            nginx_dir = temp_dir / "guides" / "nginx"
            chapters_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapters_dir.mkdir()
            index_file = nginx_dir / "index.html"
            index_file.write_text("<html></html>\n")

            chapter_names = (
                "01-getting-started.html",
                "02-installation.html",
                "03-first-website.html",
                "04-configuration-basics.html",
                "05-advanced-configurations.html",
                "06-performance-tuning.html",
                "07-security-best-practices.html",
            )
            chapter_files = [chapters_dir / name for name in chapter_names]
            # The first four chapters get a stem-derived heading; the fifth gets the
            # same content the scripted write below carries. Six and seven stay absent.
            fifth_chapter = chapter_files[4]
            fifth_chapter_html = "<h1>Advanced configurations</h1>\n"
            for existing_chapter in chapter_files[:4]:
                existing_chapter.write_text(f"<h1>{existing_chapter.stem}</h1>\n")
            fifth_chapter.write_text(fifth_chapter_html)

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_dir}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_file}`",
            ]
            plan_lines.extend(f"- `{chapter_file}`" for chapter_file in chapter_files)
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_confidence_must_not_run,
                verify_action=_verification_must_not_run,
                auto_recover=False,
            )
            # Capture both steering queues so the assertions can inspect them.
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a thorough nginx guide.")
            dod.implementation_plan = str(plan_file)
            todo_specs = (
                ("Create the nginx guide artifacts", "Creating nginx guide artifacts"),
                (
                    "Verify all guide files are linked and complete",
                    "Verifying guide linkage and completeness",
                ),
            )
            sync_todos_to_definition_of_done(
                dod,
                [
                    {
                        "content": content,
                        "active_form": active_form,
                        "status": "pending",
                    }
                    for content, active_form in todo_specs
                ],
            )

            chapter_write = ToolCall(
                id="write-chapter-05",
                name="write",
                arguments={
                    "file_path": str(fifth_chapter),
                    "content": fifth_chapter_html,
                },
            )
            scripted_outcome = tool_outcome(
                tool_call=chapter_write,
                output=f"Successfully wrote {fifth_chapter}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[chapter_write],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            next_step_prompt = "Next step: create `06-performance-tuning.html`."
            completion_claim = "All explicitly planned artifacts now exist on disk."
            assert any(next_step_prompt in queued for queued in ephemeral_messages)
            assert all(completion_claim not in queued for queued in ephemeral_messages)
      
        3622
        
        3623
        
        3624
        @pytest.mark.asyncio
      
        3625
        async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
      
        3626
            temp_dir: Path,
      
        3627
        ) -> None:
      
        3628
            async def assess_confidence(
      
        3629
                tool_name: str,
      
        3630
                tool_args: dict,
      
        3631
                context: str,
      
        3632
            ) -> ConfidenceAssessment:
      
        3633
                raise AssertionError("Confidence scoring should not run in this scenario")
      
        3634
        
        3635
            async def verify_action(
      
        3636
                tool_name: str,
      
        3637
                tool_args: dict,
      
        3638
                result: str,
      
        3639
                expected: str = "",
      
        3640
            ) -> ActionVerification:
      
        3641
                raise AssertionError("Verification should not run in this scenario")
      
        3642
        
        3643
            guide_root = temp_dir / "guides" / "nginx"
      
        3644
            chapters = guide_root / "chapters"
      
        3645
            guide_root.mkdir(parents=True)
      
        3646
            chapters.mkdir()
      
        3647
            index_path = guide_root / "index.html"
      
        3648
            chapter_paths = [
      
        3649
                chapters / "01-introduction.html",
      
        3650
                chapters / "02-installation.html",
      
        3651
                chapters / "03-configuration.html",
      
        3652
                chapters / "04-basic-usage.html",
      
        3653
                chapters / "05-advanced-features.html",
      
        3654
            ]
      
        3655
            for path in (index_path, *chapter_paths[:4]):
      
        3656
                path.write_text("<html></html>\n")
      
        3657
        
        3658
            implementation_plan = temp_dir / "implementation.md"
      
        3659
            implementation_plan.write_text(
      
        3660
                "\n".join(
      
        3661
                    [
      
        3662
                        "# Implementation Plan",
      
        3663
                        "",
      
        3664
                        "## File Changes",
      
        3665
                        f"- `{guide_root}/`",
      
        3666
                        f"- `{chapters}/`",
      
        3667
                        f"- `{index_path}`",
      
        3668
                        *[f"- `{path}`" for path in chapter_paths],
      
        3669
                        "",
      
        3670
                    ]
      
        3671
                )
      
        3672
            )
      
        3673
        
        3674
            context = build_context(
      
        3675
                temp_dir=temp_dir,
      
        3676
                messages=[],
      
        3677
                safeguards=FakeSafeguards(),
      
        3678
                assess_confidence=assess_confidence,
      
        3679
                verify_action=verify_action,
      
        3680
                auto_recover=False,
      
        3681
            )
      
        3682
            persistent_messages: list[str] = []
      
        3683
            ephemeral_messages: list[str] = []
      
        3684
            context.queue_steering_message_callback = persistent_messages.append
      
        3685
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
      
        3686
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
      
        3687
            dod = create_definition_of_done("Create a thorough nginx guide.")
      
        3688
            dod.implementation_plan = str(implementation_plan)
      
        3689
            dod.touched_files.extend(str(path) for path in (index_path, *chapter_paths[:4]))
      
        3690
            dod.completed_items.extend(
      
        3691
                [
      
        3692
                    "Create the nginx directory structure",
      
        3693
                    "Create the main index.html file with proper structure",
      
        3694
                ]
      
        3695
            )
      
        3696
            sync_todos_to_definition_of_done(
      
        3697
                dod,
      
        3698
                [
      
        3699
                    {
      
        3700
                        "content": "Create each chapter file with appropriate content",
      
        3701
                        "active_form": "Creating each chapter file with appropriate content",
      
        3702
                        "status": "pending",
      
        3703
                    }
      
        3704
                ],
      
        3705
            )
      
        3706
            tool_call = ToolCall(
      
        3707
                id="write-chapter-04",
      
        3708
                name="write",
      
        3709
                arguments={
      
        3710
                    "file_path": str(chapter_paths[3]),
      
        3711
                    "content": "<html>updated</html>\n",
      
        3712
                },
      
        3713
            )
      
        3714
            executor = FakeExecutor(
      
        3715
                [
      
        3716
                    tool_outcome(
      
        3717
                        tool_call=tool_call,
      
        3718
                        output=f"Successfully wrote {chapter_paths[3]}",
      
        3719
                        is_error=False,
      
        3720
                    )
      
        3721
                ]
      
        3722
            )
      
        3723
        
        3724
            summary = TurnSummary(final_response="")
      
        3725
            await runner.execute_batch(
      
        3726
                tool_calls=[tool_call],
      
        3727
                tool_source="assistant",
      
        3728
                pending_tool_calls_seen=set(),
      
        3729
                emit=_noop_emit,
      
        3730
                summary=summary,
      
        3731
                dod=dod,
      
        3732
                executor=executor,  # type: ignore[arg-type]
      
        3733
                on_confirmation=None,
      
        3734
                on_user_question=None,
      
        3735
                emit_confirmation=None,
      
        3736
                consecutive_errors=0,
      
        3737
            )
      
        3738
        
        3739
            assert ephemeral_messages
      
        3740
            message = ephemeral_messages[-1]
      
        3741
            assert "Next step: create `05-advanced-features.html`." in message
      
        3742
            assert "Do not reread reference material or spend the next turn on bookkeeping." in message
      
        3743
            assert "refresh `TodoWrite`" not in message
      
        3744
        
        3745
        
        3746
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite-only batch must queue a resume step for the missing planned file.

            The implementation plan lists ``index.html`` plus two chapters, but only
            ``index.html`` and ``01-getting-started.html`` are written to disk. After a
            successful ``TodoWrite`` call, the test expects a persistent steering message
            naming ``02-installation.html`` as the next step and no ephemeral message.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            def todo_snapshot() -> list[dict[str, str]]:
                # Build a fresh list of fresh dicts on every call so the DoD sync,
                # the tool-call arguments and the outcome metadata never share objects.
                return [
                    {
                        "content": "Create 01-getting-started.html",
                        "active_form": "Creating 01-getting-started.html",
                        "status": "completed",
                    },
                    {
                        "content": "Create 02-installation.html",
                        "active_form": "Creating 02-installation.html",
                        "status": "pending",
                    },
                ]

            # On-disk layout: chapter two is deliberately left unwritten.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            chapter_one.write_text("<h1>One</h1>\n")

            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                *[f"- `{path}`" for path in (index_path, chapter_one, chapter_two)],
                "",
            )
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            persistent_messages: list[str] = []
            ephemeral_messages: list[str] = []
            context.queue_steering_message_callback = persistent_messages.append
            context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            sync_todos_to_definition_of_done(dod, todo_snapshot())
            dod.touched_files.extend(str(path) for path in (index_path, chapter_one))

            tool_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": todo_snapshot()},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todo_snapshot()},
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert persistent_messages
            message = persistent_messages[-1]
            for expected_fragment in (
                "Todo tracking is updated. Next step: create `02-installation.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
                "Make your next response the concrete mutation tool call itself.",
            ):
                assert expected_fragment in message
            # The nudge must go through the persistent queue only.
            assert not ephemeral_messages
      
        3887
        
        3888
        
        3889
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite-only batch with every planned file on disk must point at verification.

            All three files named in the implementation plan are written before the
            batch runs. The todo list holds a pending "examine the Fortran guide" item
            and a pending "verify" item. The queued steering message is expected to
            mention the verify item, omit the Fortran item, and the context's
            ``workflow_mode`` is expected to be ``"execute"`` afterwards.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            def todo_snapshot() -> list[dict[str, str]]:
                # Fresh objects per call: the sync step, the tool-call arguments and
                # the outcome metadata each get their own independent copy.
                return [
                    {
                        "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                        "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                        "status": "pending",
                    },
                    {
                        "content": "Verify all guide files are linked and complete",
                        "active_form": "Working on: Verify all guide files are linked and complete",
                        "status": "pending",
                    },
                ]

            # On-disk layout: every file the plan mentions already exists.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for target, html in (
                (index_path, "<html></html>\n"),
                (chapter_one, "<h1>One</h1>\n"),
                (chapter_two, "<h1>Two</h1>\n"),
            ):
                target.write_text(html)

            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                *[f"- `{path}`" for path in (index_path, chapter_one, chapter_two)],
                "",
            )
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(
                dod,
                todo_snapshot(),
                project_root=temp_dir,
            )

            tool_call = ToolCall(
                id="todo-only",
                name="TodoWrite",
                arguments={"todos": todo_snapshot()},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todo_snapshot()},
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            for expected_fragment in (
                "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
                "Verify all guide files are linked and complete",
                "Move to verification once no specific mismatch remains.",
                "reopen reference materials",
            ):
                assert expected_fragment in message
            # The reference-reading todo must not be echoed back into the nudge.
            assert "Fortran guide structure" not in message
            assert context.workflow_mode == "execute"
      
        4032
        
        4033
        
        4034
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite-only batch with all planned files present must hand off to verify.

            Every file named in the implementation plan is written, and ``index.html``
            links to both chapters and to ``../index.html``. No file is created at that
            last target here; presumably it is the "missing link" the test name refers
            to. With a single ``in_progress`` todo, the queued steering message is
            expected to announce that verification runs next, and the context's
            ``workflow_mode`` is expected to be ``"verify"`` afterwards.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            def todo_snapshot() -> list[dict[str, str]]:
                # Fresh objects per call so the sync step, the tool-call arguments and
                # the outcome metadata never alias one another.
                return [
                    {
                        "content": "Create chapter files following the established pattern",
                        "active_form": "Creating chapter files",
                        "status": "in_progress",
                    }
                ]

            # On-disk layout: index with three anchors, plus two stub chapters.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            index_lines = (
                '<a href="chapters/01-introduction.html">Intro</a>',
                '<a href="chapters/02-installation.html">Install</a>',
                '<a href="../index.html">Back</a>',
                "",
            )
            index_path.write_text("\n".join(index_lines))
            for chapter_path in (chapter_one, chapter_two):
                chapter_path.write_text("<html></html>\n")

            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                *[f"- `{path}`" for path in (index_path, chapter_one, chapter_two)],
                "",
            )
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]
            sync_todos_to_definition_of_done(
                dod,
                todo_snapshot(),
                project_root=temp_dir,
            )

            tool_call = ToolCall(
                id="todo-post-build",
                name="TodoWrite",
                arguments={"todos": todo_snapshot()},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todo_snapshot()},
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            for expected_fragment in (
                "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
                "Verification should run next.",
            ):
                assert expected_fragment in message
            # The "repair or verify" wording must not appear in this hand-off.
            assert "Repair or verify the current files instead of expanding the artifact set." not in message
            assert context.workflow_mode == "verify"
      
        4169
        
        4170
        
        4171
        @pytest.mark.asyncio
        async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff(
            temp_dir: Path,
        ) -> None:
            """A TodoWrite verify handoff stops the batch before the trailing audit read.

            All planned artifacts already exist on disk. The batch holds a TodoWrite
            call followed by a ``read`` of the index file; the test expects only the
            TodoWrite call to reach the executor, the workflow mode to become
            ``verify`` and the last steering message to announce verification.
            """

            # Both reasoning hooks fail loudly: this scenario must never reach them.
            async def forbid_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def forbid_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Lay out the finished guide tree: an index plus two chapter files.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            for artifact in (index_path, chapter_one, chapter_two):
                artifact.write_text("<html></html>\n")

            # The plan names both directories (trailing slash) and all three files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{directory}/`" for directory in (guide_root, chapters))
            plan_lines.extend(
                f"- `{artifact}`" for artifact in (index_path, chapter_one, chapter_two)
            )
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=forbid_confidence,
                verify_action=forbid_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]

            todo_call = ToolCall(
                id="todo-post-build-preempt",
                name="TodoWrite",
                arguments={"todos": []},
            )
            audit_read = ToolCall(
                id="read-after-todo",
                name="read",
                arguments={"file_path": str(index_path)},
            )
            todo_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": []},
            )
            read_outcome = tool_outcome(
                tool_call=audit_read,
                output=index_path.read_text(),
                is_error=False,
            )
            executor = FakeExecutor([todo_outcome, read_outcome])

            summary = TurnSummary(final_response="")
            result = await runner.execute_batch(
                tool_calls=[todo_call, audit_read],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert result.continue_after_batch is True
            assert result.halted is False
            # Only the TodoWrite call may have been executed; the read is preempted.
            executed_ids = [call.id for call in executor.calls]
            assert executed_ids == ["todo-post-build-preempt"]
            assert len(summary.tool_result_messages) == 1
            assert context.workflow_mode == "verify"
            assert queued_messages
            assert "Verification should run next." in queued_messages[-1]
      
        4281
        
        4282
        
        4283
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_complete_directory_plan_does_not_reinfer_first_child(
            temp_dir: Path,
        ) -> None:
            """A completed directory plan hands off to verify without naming a child file.

            The plan lists the index file and the chapters directory, all chapter
            files linked from the index exist, and the history contains a read of a
            same-named chapter in a reference guide. After TodoWrite the steering
            message must announce verification and must not mention
            ``01-introduction.html`` or "chapter files".
            """

            # Both reasoning hooks fail loudly: this scenario must never reach them.
            async def forbid_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def forbid_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Reference guide chapter that the assistant read earlier in the session.
            reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
            reference.parent.mkdir(parents=True)
            reference.write_text("<h1>Introduction</h1>\n")

            # Target guide: an index linking three chapters, all present on disk.
            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            index_links = [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-basic-configuration.html">Configuration</a>',
                "",
            ]
            index_path.write_text("\n".join(index_links))
            for chapter_name in (
                "01-introduction.html",
                "02-installation.html",
                "03-basic-configuration.html",
            ):
                (chapters / chapter_name).write_text("<html></html>\n")

            # The plan names only the index file and the chapters directory.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapters}/`",
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            reference_read = ToolCall(
                id="read-reference-child",
                name="read",
                arguments={"file_path": str(reference)},
            )
            history = [
                Message(
                    role=Role.ASSISTANT,
                    content="I examined the reference guide structure.",
                    tool_calls=[reference_read],
                )
            ]
            context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=forbid_confidence,
                verify_action=forbid_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create an equally thorough nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]

            todo_call = ToolCall(
                id="todo-complete-directory-plan",
                name="TodoWrite",
                arguments={"todos": []},
            )
            todo_outcome = tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": []},
            )
            executor = FakeExecutor([todo_outcome])

            summary = TurnSummary(final_response="")
            result = await runner.execute_batch(
                tool_calls=[todo_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert result.continue_after_batch is True
            assert queued_messages
            steering = queued_messages[-1]
            assert "Verification should run next." in steering
            # The reference chapter name must not leak into the guidance.
            assert "01-introduction.html" not in steering
            assert "chapter files" not in steering.lower()
            assert context.workflow_mode == "verify"
      
        4408
        
        4409
        
        4410
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
            temp_dir: Path,
        ) -> None:
            """A todo for an unplanned chapter is ignored once all planned files exist.

            The plan lists an index and two chapters, all present on disk. The
            TodoWrite payload adds a pending ``08-troubleshooting.html`` item that the
            plan never mentions. The steering message is expected to report that the
            planned artifacts exist, announce verification, and not mention the extra
            chapter; the workflow mode must end up as ``verify``.
            """

            # Both reasoning hooks fail loudly: this scenario must never reach them.
            async def forbid_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run for this scenario")

            async def forbid_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            # Finished guide tree: index with chapter links plus two chapter files.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            index_links = [
                '<a href="chapters/01-introduction.html">Intro</a>',
                '<a href="chapters/02-installation.html">Install</a>',
                '<a href="../index.html">Back</a>',
                "",
            ]
            index_path.write_text("\n".join(index_links))
            for chapter_file in (chapter_one, chapter_two):
                chapter_file.write_text("<html></html>\n")

            # The plan names both directories (trailing slash) and all three files.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{directory}/`" for directory in (guide_root, chapters))
            plan_lines.extend(
                f"- `{artifact}`" for artifact in (index_path, chapter_one, chapter_two)
            )
            plan_lines.append("")
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=forbid_confidence,
                verify_action=forbid_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]

            # (content, active form, status) rows; the last one is outside the plan.
            todo_rows = (
                ("Create index.html for nginx guide", "Creating index.html", "in_progress"),
                (
                    "Create chapter 01-introduction.html",
                    "Creating chapter 01-introduction.html",
                    "completed",
                ),
                (
                    "Create chapter 02-installation.html",
                    "Creating chapter 02-installation.html",
                    "completed",
                ),
                (
                    "Create chapter 08-troubleshooting.html",
                    "Creating chapter 08-troubleshooting.html",
                    "pending",
                ),
            )
            # The call arguments use the camelCase key, the outcome metadata snake_case.
            requested_todos = [
                {"content": content, "activeForm": active, "status": status}
                for content, active, status in todo_rows
            ]
            recorded_todos = [
                {"content": content, "active_form": active, "status": status}
                for content, active, status in todo_rows
            ]

            tool_call = ToolCall(
                id="todo-post-build-expansion",
                name="TodoWrite",
                arguments={"todos": requested_todos},
            )
            todo_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": recorded_todos},
            )
            executor = FakeExecutor([todo_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            steering = queued_messages[-1]
            assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in steering
            assert "Verification should run next." in steering
            assert "Repair or verify the current files instead of expanding the artifact set." not in steering
            assert "08-troubleshooting.html" not in steering
            assert context.workflow_mode == "verify"
      
        4565
        
        4566
        
        4567
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
            temp_dir: Path,
        ) -> None:
            """TodoWrite with an unfinished guide steers toward the next file to create.

            The output directories and the index exist, but the index links to
            ``chapters/01-introduction.html`` which has not been written and a todo
            for the introduction chapter is still pending. The steering message is
            expected to name that file as the next step and ask for a concrete
            mutation tool call.
            """

            # Both reasoning hooks fail loudly: this scenario must never reach them.
            async def forbid_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def forbid_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # Output roots exist; only the index file has been written so far.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"
            index_markup = [
                "<!DOCTYPE html>",
                "<html>",
                "<body>",
                '<a href="chapters/01-introduction.html">Introduction</a>',
                "</body>",
                "</html>",
                "",
            ]
            index_path.write_text("\n".join(index_markup))

            # The plan names both directories (trailing slash) and the index file.
            plan_lines = ["# Implementation Plan", "", "## File Changes"]
            plan_lines.extend(f"- `{directory}/`" for directory in (guide_root, chapters))
            plan_lines.extend([f"- `{index_path}`", ""])
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=forbid_confidence,
                verify_action=forbid_verification,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            def fresh_todos() -> list[dict[str, str]]:
                # New dict objects on every call so no consumer shares state with another.
                return [
                    {"content": content, "active_form": active, "status": status}
                    for content, active, status in (
                        (
                            "Examine the existing Fortran guide structure",
                            "Examining the existing Fortran guide structure",
                            "completed",
                        ),
                        (
                            "Create the nginx directory structure",
                            "Creating the nginx directory structure",
                            "completed",
                        ),
                        (
                            "Write the introduction chapter",
                            "Writing the introduction chapter",
                            "pending",
                        ),
                    )
                ]

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.append(str(index_path))
            sync_todos_to_definition_of_done(
                dod,
                fresh_todos(),
                project_root=temp_dir,
            )

            tool_call = ToolCall(
                id="todo-next-mutation",
                name="TodoWrite",
                arguments={"todos": fresh_todos()},
            )
            todo_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": fresh_todos()},
            )
            executor = FakeExecutor([todo_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            steering = queued_messages[-1]
            assert "Todo tracking is updated. Next step: create `01-introduction.html`." in steering
            assert "Prefer one `write(file_path=..., content=...)` call" in steering
            assert "Make your next response the concrete mutation tool call itself." in steering
      
        4728
        
        4729
        
        4730
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
            temp_dir: Path,
        ) -> None:
            """The steering note names the pending ``index.html`` rather than a chapter file.

            Setup: the ``chapters/`` directory exists but is empty, ``index.html``
            has not been written, and the todo list has the index task pending
            ahead of the first-chapter task.  After a successful ``TodoWrite``
            batch, the last queued steering message must point at ``index.html``
            (with its resolved path) and must not mention ``01-introduction.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook at all is a test failure.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook at all is a test failure.
                raise AssertionError("Verification should not run in this scenario")

            def make_guide_todos() -> list[dict[str, str]]:
                # Returns brand-new dicts on every call so the DoD sync below and
                # the TodoWrite payload never share mutable objects.
                rows = (
                    (
                        "Examine the existing Fortran guide structure to understand the format and depth",
                        "Examining the existing Fortran guide structure",
                        "completed",
                    ),
                    (
                        "Create the new nginx guide directory structure",
                        "Creating the new nginx guide directory structure",
                        "completed",
                    ),
                    (
                        "Create a new index.html for the nginx guide",
                        "Creating a new index.html for the nginx guide",
                        "pending",
                    ),
                    (
                        "Create the first chapter for the nginx guide",
                        "Creating the first chapter for the nginx guide",
                        "pending",
                    ),
                )
                return [
                    {"content": content, "active_form": active_form, "status": status}
                    for content, active_form, status in rows
                ]

            # Filesystem fixture: only the (empty) chapters directory exists.
            guide_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)
            index_path = guide_root / "index.html"

            implementation_plan = temp_dir / "implementation.md"
            plan_text = (
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
            )
            implementation_plan.write_text(plan_text)

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            sync_todos_to_definition_of_done(
                dod,
                make_guide_todos(),
                project_root=temp_dir,
            )

            queued_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The same list object backs both the call arguments and the reported metadata.
            todos = make_guide_todos()
            tool_call = ToolCall(
                id="todo-index-before-chapter",
                name="TodoWrite",
                arguments={"todos": todos},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todos},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            expected_fragments = (
                "Todo tracking is updated. Next step: create `index.html`.",
                f"Prefer one `write(file_path=..., content=...)` call for `{index_path.resolve(strict=False)}`",
            )
            for fragment in expected_fragments:
                assert fragment in message
            assert "01-introduction.html" not in message
      
        4866
        
        4867
        
        4868
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
            temp_dir: Path,
        ) -> None:
            """The steering note names the missing chapter file that ``index.html`` links to.

            Setup: ``index.html`` exists and links to ``chapters/introduction.html``
            and ``chapters/installation.html``; neither chapter file exists.  The
            pending todo is "Write the introduction chapter".  After a successful
            ``TodoWrite`` batch, the last queued steering message must name
            ``introduction.html`` and ask for a concrete ``write`` call.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook at all is a test failure.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook at all is a test failure.
                raise AssertionError("Verification should not run in this scenario")

            # Filesystem fixture: guide root, empty chapters dir, and an index with two links.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_html = (
                "<html>\n"
                '<a href="chapters/introduction.html">Introduction</a>\n'
                '<a href="chapters/installation.html">Installation</a>\n'
                "</html>\n"
            )
            index_path.write_text(index_html)

            implementation_plan = temp_dir / "implementation.md"
            plan_text = (
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
            )
            implementation_plan.write_text(plan_text)

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items = [
                "Write the introduction chapter",
                "Complete the requested work",
            ]
            dod.touched_files.append(str(index_path))

            queued_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The request payload uses the camelCase key; the reported metadata uses snake_case.
            requested_todo = {
                "content": "Write the introduction chapter",
                "activeForm": "Writing the introduction chapter",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            }
            tool_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            expected_fragments = (
                "Todo tracking is updated. Next step: create `introduction.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
                "Make your next response the concrete mutation tool call itself.",
            )
            for fragment in expected_fragments:
                assert fragment in message
      
        4991
        
        4992
        
        4993
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
            temp_dir: Path,
        ) -> None:
            """With the planned artifacts present, the note still names the next linked file.

            Setup: ``index.html`` and ``chapters/01-introduction.html`` both exist
            and are recorded as touched; the index also links to
            ``chapters/02-installation.html``, which does not exist.  The pending
            todo is "Creating Chapter 2: Installation and Setup".  After a
            successful ``TodoWrite`` batch, the last queued steering message must
            name ``02-installation.html`` and ask for a concrete ``write`` call.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook at all is a test failure.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook at all is a test failure.
                raise AssertionError("Verification should not run in this scenario")

            # Filesystem fixture: index plus chapter one exist; chapter two is only linked.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            index_html = (
                "<html>\n"
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>\n'
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>\n'
                "</html>\n"
            )
            index_path.write_text(index_html)
            chapter_one.write_text("<html></html>\n")

            implementation_plan = temp_dir / "implementation.md"
            plan_text = (
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
            )
            implementation_plan.write_text(plan_text)

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items = [
                "Creating Chapter 2: Installation and Setup",
                "Complete the requested work",
            ]
            dod.touched_files.extend([str(index_path), str(chapter_one)])

            queued_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The request payload uses the camelCase key; the reported metadata uses snake_case.
            requested_todo = {
                "content": "Creating Chapter 2: Installation and Setup",
                "activeForm": "Creating Chapter 2: Installation and Setup",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Creating Chapter 2: Installation and Setup",
                "active_form": "Creating Chapter 2: Installation and Setup",
                "status": "pending",
            }
            tool_call = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            expected_fragments = (
                "Todo tracking is updated. Next step: create `02-installation.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
                "Make your next response the concrete mutation tool call itself",
            )
            for fragment in expected_fragments:
                assert fragment in message
      
        5118
        
        5119
        
        5120
        @pytest.mark.asyncio
        async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
            temp_dir: Path,
        ) -> None:
            """The next file name follows a chapter file the assistant read earlier.

            Setup: a reference guide contains ``fortran/chapters/01-introduction.html``
            and the conversation history holds a ``read`` tool call for that file.
            The nginx guide has an ``index.html`` with no links and an empty
            ``chapters/`` directory.  The pending todo is "Write the introduction
            chapter".  After a successful ``TodoWrite`` batch, the last queued
            steering message must name ``01-introduction.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Reaching this hook at all is a test failure.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Reaching this hook at all is a test failure.
                raise AssertionError("Verification should not run in this scenario")

            # Reference guide whose chapter naming the history entry below points at.
            reference_chapters = temp_dir / "fortran" / "chapters"
            reference_chapters.mkdir(parents=True)
            reference_intro = reference_chapters / "01-introduction.html"
            reference_intro.write_text("<h1>Introduction</h1>\n")

            # Target guide: index exists without links, chapters directory is empty.
            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()
            index_path = guide_root / "index.html"
            index_path.write_text("<html></html>\n")

            implementation_plan = temp_dir / "implementation.md"
            plan_text = (
                "# Implementation Plan\n"
                "\n"
                "## File Changes\n"
                f"- `{guide_root}/`\n"
                f"- `{chapters}/`\n"
                f"- `{index_path}`\n"
            )
            implementation_plan.write_text(plan_text)

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.pending_items = [
                "Write the introduction chapter",
                "Complete the requested work",
            ]
            dod.touched_files.append(str(index_path))

            # History: one assistant turn that read the reference chapter.
            earlier_read = ToolCall(
                id="read-ref-1",
                name="read",
                arguments={"file_path": str(reference_intro)},
            )
            history = [
                Message(
                    role=Role.ASSISTANT,
                    content="",
                    tool_calls=[earlier_read],
                )
            ]

            queued_messages: list[str] = []
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=history,
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The request payload uses the camelCase key; the reported metadata uses snake_case.
            requested_todo = {
                "content": "Write the introduction chapter",
                "activeForm": "Writing the introduction chapter",
                "status": "pending",
            }
            recorded_todo = {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            }
            tool_call = ToolCall(
                id="todo-observed-1",
                name="TodoWrite",
                arguments={"todos": [requested_todo]},
            )
            scripted_outcome = tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            message = queued_messages[-1]
            expected_fragments = (
                "Todo tracking is updated. Next step: create `01-introduction.html`.",
                "Prefer one `write(file_path=..., content=...)` call",
            )
            for fragment in expected_fragments:
                assert fragment in message
      
        5248
        
        5249
        
        5250
        @pytest.mark.asyncio
        async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
            temp_dir: Path,
        ) -> None:
            """A working note written while a planned chapter is absent re-queues the resume step.

            The implementation plan declares two chapter files but only the first one
            is written to disk. After a successful ``notepad_write_working`` call the
            last queued steering message must point back at creating
            ``02-installation.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: index and chapter one exist, chapter two is never written.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            nginx_dir.mkdir(parents=True)
            chapter_dir.mkdir()
            toc_file = nginx_dir / "index.html"
            first_chapter = chapter_dir / "01-getting-started.html"
            second_chapter = chapter_dir / "02-installation.html"
            toc_file.write_text("<html></html>\n")
            first_chapter.write_text("<h1>One</h1>\n")

            # The plan lists every declared output, including the missing chapter.
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_dir}/`",
                f"- `{chapter_dir}/`",
                f"- `{toc_file}`",
                f"- `{first_chapter}`",
                f"- `{second_chapter}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            todo_items = [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "completed",
                },
                {
                    "content": "Create 02-installation.html",
                    "active_form": "Creating 02-installation.html",
                    "status": "pending",
                },
            ]
            sync_todos_to_definition_of_done(dod, todo_items, project_root=temp_dir)
            dod.touched_files.extend([str(toc_file), str(first_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Creating the second chapter file: Installation"},
            )
            scripted_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            steering = queued_messages[-1]
            expected_fragments = (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
                "Make your next response the concrete mutation tool call itself",
                "refresh `TodoWrite`",
                "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
            )
            for fragment in expected_fragments:
                assert fragment in steering
      
        5364
        
        5365
        
        5366
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
            temp_dir: Path,
        ) -> None:
            """A working note keeps steering toward a discovery-first pending item.

            The first pending item is an "examine the existing guide" step and none
            of the planned outputs exist yet. The queued steering message must
            continue with that item and must not jump ahead to creating
            ``index.html``.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # Planned outputs are only named in the plan; nothing is created on disk.
            nginx_dir = temp_dir / "guides" / "nginx"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_dir / 'index.html'}`",
                f"- `{nginx_dir / 'chapters'}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            outstanding_items = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create the nginx directory structure",
                "Develop the main index.html file for the nginx guide",
            ]
            dod.pending_items.extend(outstanding_items)

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
            )
            scripted_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            steering = queued_messages[-1]
            continue_hint = (
                "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
            )
            assert continue_hint in steering
            assert "one concrete evidence-gathering tool call" in steering
            # The artifact-resume wording must not appear for a discovery-first step.
            assert "Resume by creating `index.html` now." not in steering
      
        5458
        
        5459
        
        5460
        @pytest.mark.asyncio
        async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
            temp_dir: Path,
        ) -> None:
            """A missing linked chapter outranks a stale discovery-first pending item.

            The index already links to three chapters but only the first exists on
            disk, while the pending list still starts with an "examine" step. The
            queued steering message must resume at ``02-installation.html`` instead
            of continuing with the discovery item.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # On-disk layout: index plus chapter one; chapters two and three are only linked.
            nginx_dir = temp_dir / "guides" / "nginx"
            chapter_dir = nginx_dir / "chapters"
            chapter_dir.mkdir(parents=True)
            toc_file = nginx_dir / "index.html"
            intro_chapter = chapter_dir / "01-introduction.html"
            toc_links = [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-configuration.html">Configuration</a>',
            ]
            toc_file.write_text("\n".join(toc_links))
            intro_chapter.write_text("<h1>Introduction</h1>\n")

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{toc_file}`",
                f"- `{chapter_dir}/`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            outstanding_items = [
                "First, examine the existing fortran guide structure and content to understand the format",
                "Create chapter files following the established pattern",
            ]
            dod.pending_items.extend(outstanding_items)
            dod.touched_files.extend([str(toc_file), str(intro_chapter)])

            note_call = ToolCall(
                id="working-note",
                name="notepad_write_working",
                arguments={"content": "Created index and first chapter; next is chapter 2"},
            )
            scripted_outcome = tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[note_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued_messages
            steering = queued_messages[-1]
            for fragment in (
                "Bookkeeping note is recorded. A declared output artifact is still missing.",
                "Resume by creating `02-installation.html` now.",
            ):
                assert fragment in steering
            # The stale discovery item must not be what the agent is steered toward.
            assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in steering
      
        5565
        
        5566
        
        5567
        @pytest.mark.asyncio
        async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
            temp_dir: Path,
        ) -> None:
            """A glob that only lists directories must not queue any steering message.

            The scripted glob result contains just the reference guide root and its
            ``chapters`` directory; with a discovery item still pending, the runner
            is expected to leave the steering queue empty.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            # Reference guide: directories only, no files to read.
            guides_dir = temp_dir / "Loader" / "guides"
            reference_root = guides_dir / "fortran"
            reference_chapters = reference_root / "chapters"
            reference_chapters.mkdir(parents=True)

            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guides_dir / 'nginx' / 'index.html'}`",
                f"- `{guides_dir / 'nginx' / 'chapters'}`",
                "",
            ]
            plan_file = temp_dir / "implementation.md"
            plan_file.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued_messages: list[str] = []
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_file)
            outstanding_items = [
                "First, examine the existing fortran guide structure and content",
                "Create the nginx directory structure",
                "Develop the main index.html file for nginx guide",
            ]
            dod.pending_items.extend(outstanding_items)

            glob_call = ToolCall(
                id="glob-1",
                name="glob",
                arguments={"pattern": "**", "path": str(reference_root)},
            )
            scripted_outcome = tool_outcome(
                tool_call=glob_call,
                output=f"{reference_root}\n{reference_chapters}",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])

            summary = TurnSummary(final_response="")
            await runner.execute_batch(
                tool_calls=[glob_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Nothing may have been queued after the directory-only listing.
            assert queued_messages == []
      
        5656
        
        5657
        
        5658
        @pytest.mark.asyncio
        async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
            temp_dir: Path,
        ) -> None:
            """A blocked identical-string TOC edit queues no steering message.

            The index's table of contents links to the two chapter files that exist
            on disk. The edit is scripted as blocked because ``old_string`` equals
            ``new_string``, and the steering queue must stay empty afterwards.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            task_prompt = (
                "Have a look at ~/Loader/guides/fortran/index.html, then "
                "~/Loader/guides/fortran/chapters. The table of contents links in "
                "index.html are inaccurate and the href’s are wrong. Let’s update the "
                "links and their link texts to be correct."
            )

            # Chapter files whose names and headings the index links to.
            chapter_dir = temp_dir / "chapters"
            chapter_dir.mkdir()
            intro_chapter = chapter_dir / "01-introduction.html"
            intro_chapter.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
            setup_chapter = chapter_dir / "02-setup.html"
            setup_chapter.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")

            toc_markup = "".join(
                (
                    "<h2>Table of Contents</h2>\n",
                    '        <ul class="chapter-list">\n',
                    '            <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
                    '            <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
                    "        </ul>\n",
                )
            )
            toc_file = temp_dir / "index.html"
            toc_file.write_text(toc_markup)

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            runtime_context.session.current_task = task_prompt  # type: ignore[attr-defined]
            queued_messages: list[str] = []
            runtime_context.queue_steering_message_callback = queued_messages.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            # The edit replaces the TOC block with itself, i.e. it changes nothing.
            edit_call = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(toc_file),
                    "old_string": toc_markup,
                    "new_string": toc_markup,
                },
            )
            blocked_output = (
                "[Blocked - old_string and new_string are identical - no change "
                "would occur] Suggestion: Provide different old and new strings"
            )
            scripted_outcome = tool_outcome(
                tool_call=edit_call,
                output=blocked_output,
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
            executor = FakeExecutor([scripted_outcome])

            await runner.execute_batch(
                tool_calls=[edit_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=create_definition_of_done(task_prompt),
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # No steering message may have been queued for the blocked no-op edit.
            assert queued_messages == []
      
        5751
        
        5752
        
        5753
        def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
            temp_dir: Path,
        ) -> None:
            """The blocked no-op edit nudge names the active repair target.

            With a "Repair focus" assistant message in the conversation, calling
            ``_queue_blocked_html_edit_nudge`` directly for an identical-string edit
            must queue a message that mentions the repair target path and the
            expected guidance phrases.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: raises if the runner ever asks for a confidence score.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: raises if the runner ever asks for action verification.
                raise AssertionError("Verification should not run in this scenario")

            chapter_dir = temp_dir / "guide" / "chapters"
            repair_target = chapter_dir / "04-basic-usage.html"
            missing_reference = chapter_dir / "05-advanced-topics.html"

            # Prior assistant turn that establishes which file is being repaired.
            repair_focus = Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
                    f"- Immediate next step: edit `{repair_target}`.\n"
                    f"- If the broken reference should remain, create `{missing_reference}`; otherwise remove or replace `05-advanced-topics.html`.\n"
                ),
            )
            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[repair_focus],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            queued: list[str] = []
            runtime_context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Repair a guide page.")

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments={
                    "file_path": str(repair_target),
                    "old_string": "same",
                    "new_string": "same",
                },
            )
            blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
            runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=dod)

            assert queued
            nudge = queued[0]
            expected_fragments = (
                str(repair_target),
                "no on-disk change",
                "replace the surrounding block",
                "Do not reopen unrelated reference materials",
            )
            for fragment in expected_fragments:
                assert fragment in nudge
      
        5813
        
        5814
        
        5815
        def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """A blocked no-op edit nudges toward verification once all planned files exist.

            Both files listed in the implementation plan are written to disk
            before the nudge is requested. The queued steering message is then
            expected to mention verification / final confirmation and to omit
            the "replace the surrounding block" advice.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            # Lay out the finished guide on disk: an index page plus one chapter.
            guide_root = temp_dir / "guide"
            chapters_dir = guide_root / "chapters"
            index_path = guide_root / "index.html"
            chapter_one = chapters_dir / "01-introduction.html"
            chapters_dir.mkdir(parents=True)
            for page in (index_path, chapter_one):
                page.write_text("<html></html>\n")

            # The plan names exactly the two files created above.
            plan_path = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                "",
            )
            plan_path.write_text("\n".join(plan_lines))

            repair_focus = (
                "Repair focus:\n"
                f"- Confirm the final guide state in `{index_path}`.\n"
                f"- Immediate next step: verify `{index_path}` if no concrete mismatch remains.\n"
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file guide.")
            dod.implementation_plan = str(plan_path)
            dod.touched_files.extend((str(index_path), str(chapter_one)))
            dod.verification_commands = [f"ls -la {guide_root}"]

            noop_edit = ToolCall(
                id="edit-1",
                name="edit",
                arguments=dict(
                    file_path=str(index_path),
                    old_string="same",
                    new_string="same",
                ),
            )
            blocked_result = (
                "[Blocked - old_string and new_string are identical - no change would occur] "
                "Suggestion: Provide different old and new strings"
            )
            runner._queue_blocked_html_edit_nudge(noop_edit, blocked_result, dod=dod)

            assert steering_messages
            nudge = steering_messages[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "Move to verification or final confirmation using the files already on disk." in nudge
            assert "replace the surrounding block" not in nudge
      
        5898
        
        5899
        
        5900
        async def _noop_emit(event: AgentEvent) -> None:
            """Accept an event and discard it; the coroutine always resolves to ``None``."""
            return
      
        5902
        
        5903
        
        5904
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A successful write against a freshly created definition of done plans verification.

            One ``write`` tool call is run through ``execute_batch`` with a fake
            executor. Afterwards the definition of done and the last workflow
            timeline entry are both expected to record a planned first
            verification attempt.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments=dict(file_path=str(readme_path), content="updated\n"),
            )
            # The fake executor replays this single scripted outcome.
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output="wrote file",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Update README and verify it still works.")
            captured_events: list[AgentEvent] = []

            async def _record_event(event: AgentEvent) -> None:
                captured_events.append(event)

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_record_event,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Definition-of-done state after the mutation.
            assert dod.last_verification_result == "planned"
            assert dod.verification_commands
            assert "Collect verification evidence" in dod.pending_items
            assert dod.active_verification_attempt_id == "verification-attempt-1"
            assert dod.active_verification_attempt_number == 1

            # The newest timeline entry mirrors the planned attempt.
            latest_entry = summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_planned"
            assert latest_entry.policy_outcome == "planned"
            observation = latest_entry.verification_observations[0]
            assert observation.status == "planned"
            assert observation.attempt_id == "verification-attempt-1"
            assert observation.attempt_number == 1
      
        5975
        
        5976
        
        5977
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
            temp_dir: Path,
        ) -> None:
            """A lone ``mkdir -p`` bash call must not move the run into planned verification.

            The implementation plan lists a chapters directory and an index page.
            After the batch runs, the test expects no verification result, no
            "Collect verification evidence" pending item, and no
            ``verification_planned`` timeline entry.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            plan_path = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters_dir}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            )
            plan_path.write_text("\n".join(plan_lines))

            mkdir_call = ToolCall(
                id="mkdir-1",
                name="bash",
                arguments=dict(command=f"mkdir -p {chapters_dir}"),
            )
            # The fake executor replays this single scripted outcome.
            scripted_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
            executor = FakeExecutor([scripted_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
            dod.implementation_plan = str(plan_path)
            captured_events: list[AgentEvent] = []

            async def _record_event(event: AgentEvent) -> None:
                captured_events.append(event)

            await runner.execute_batch(
                tool_calls=[mkdir_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_record_event,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert dod.last_verification_result is None
            assert "Collect verification evidence" not in dod.pending_items
            reason_codes = [entry.reason_code for entry in summary.workflow_timeline]
            assert "verification_planned" not in reason_codes
      
        6055
        
        6056
        
        6057
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
            temp_dir: Path,
        ) -> None:
            """Writing the index page must not plan verification while chapter work is pending.

            The definition of done starts with two pending build items. After the
            ``write`` of ``index.html`` the test expects no verification result,
            no "Collect verification evidence" item, the chapter item still
            pending, and no ``verification_planned`` timeline entry.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            # The directories exist up front; the index page is what the tool call targets.
            nginx_root = temp_dir / "Loader" / "guides" / "nginx"
            chapters_dir = nginx_root / "chapters"
            chapters_dir.mkdir(parents=True)
            index_path = nginx_root / "index.html"
            plan_path = temp_dir / "implementation.md"
            plan_lines = (
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_path}`",
                "",
            )
            plan_path.write_text("\n".join(plan_lines))

            write_call = ToolCall(
                id="write-index",
                name="write",
                arguments=dict(file_path=str(index_path), content="<html></html>\n"),
            )
            # The fake executor replays this single scripted outcome.
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output="wrote file",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])
            summary = TurnSummary(final_response="")
            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(plan_path)
            outstanding_work = (
                "Develop the main index.html file with proper structure",
                "Create first nginx chapter",
            )
            dod.pending_items.extend(outstanding_work)
            captured_events: list[AgentEvent] = []

            async def _record_event(event: AgentEvent) -> None:
                captured_events.append(event)

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_record_event,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert dod.last_verification_result is None
            assert "Collect verification evidence" not in dod.pending_items
            assert "Create first nginx chapter" in dod.pending_items
            reason_codes = [entry.reason_code for entry in summary.workflow_timeline]
            assert "verification_planned" not in reason_codes
      
        6145
        
        6146
        
        6147
        @pytest.mark.asyncio
        async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
            temp_dir: Path,
        ) -> None:
            """A new write after a passed verification marks that verification stale.

            The definition of done is seeded with a passed attempt #1 and one
            evidence record. After the ``write`` the test expects a ``"stale"``
            result, cleared evidence, the evidence item back in pending, attempt
            #2 active, and a ``verification_stale`` timeline entry describing
            attempt #1.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run for this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
            )
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            readme_path = temp_dir / "README.md"
            write_call = ToolCall(
                id="write-1",
                name="write",
                arguments=dict(file_path=str(readme_path), content="updated\n"),
            )
            # The fake executor replays this single scripted outcome.
            scripted_outcome = tool_outcome(
                tool_call=write_call,
                output="wrote file",
                is_error=False,
            )
            executor = FakeExecutor([scripted_outcome])
            summary = TurnSummary(final_response="")

            # Seed the definition of done as if attempt #1 had already passed.
            dod = create_definition_of_done("Update README and verify it still works.")
            dod.verification_commands = ["uv run pytest -q"]
            dod.last_verification_result = "passed"
            dod.verification_attempt_counter = 1
            dod.active_verification_attempt_id = "verification-attempt-1"
            dod.active_verification_attempt_number = 1
            passed_evidence = VerificationEvidence(
                command="uv run pytest -q",
                passed=True,
                stdout="401 passed",
                kind="test",
            )
            dod.evidence = [passed_evidence]
            dod.completed_items.append("Collect verification evidence")
            captured_events: list[AgentEvent] = []

            async def _record_event(event: AgentEvent) -> None:
                captured_events.append(event)

            await runner.execute_batch(
                tool_calls=[write_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_record_event,
                summary=summary,
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            # Definition-of-done state after the mutation.
            assert dod.last_verification_result == "stale"
            assert dod.evidence == []
            assert "Collect verification evidence" in dod.pending_items
            assert "Collect verification evidence" not in dod.completed_items
            assert dod.active_verification_attempt_id == "verification-attempt-2"
            assert dod.active_verification_attempt_number == 2

            # The newest timeline entry reports the stale attempt.
            latest_entry = summary.workflow_timeline[-1]
            assert latest_entry.reason_code == "verification_stale"
            assert latest_entry.policy_outcome == "stale"
            observation = latest_entry.verification_observations[0]
            assert observation.status == "stale"
            assert observation.attempt_id == "verification-attempt-1"
            assert observation.attempt_number == 1
            assert observation.supersedes_attempt_id == "verification-attempt-2"
            assert observation.command == "uv run pytest -q"
      
        6241
        
        6242
        
        6243
        def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
            """The active-repair nudge names the paths from the "Repair focus" message.

            The queued steering message is expected to contain both the repair
            target and the chapter path mentioned in the assistant message, plus
            the "Do not reopen unrelated reference materials" guidance.
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            repair_target = guide_dir / "index.html"
            missing_chapter = guide_dir / "chapters" / "01-getting-started.html"
            repair_focus = (
                "Repair focus:\n"
                f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
                f"- Immediate next step: edit `{repair_target}`.\n"
                f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            blocked_result = (
                "[Blocked - active repair scope: verification already identified the repair target.]"
            )
            runner._queue_blocked_active_repair_nudge(blocked_result)

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert str(missing_chapter) in nudge
            assert "Do not reopen unrelated reference materials" in nudge
      
        6289
        
        6290
        
        6291
        def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
            temp_dir: Path,
        ) -> None:
            """The active-repair mutation nudge names the paths from the "Repair focus" message.

            The queued steering message is expected to contain the repair target,
            the stylesheet path mentioned in the assistant message, and the
            phrase "before widening the change set".
            """

            async def _reject_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def _reject_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                raise AssertionError("Verification should not run in this scenario")

            guide_dir = temp_dir / "guide"
            repair_target = guide_dir / "chapters" / "05-advanced-configurations.html"
            stylesheet = guide_dir / "styles.css"
            repair_focus = (
                "Repair focus:\n"
                f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
                f"- Immediate next step: edit `{repair_target}`.\n"
                f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
            )
            context = build_context(
                temp_dir=temp_dir,
                messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
                safeguards=FakeSafeguards(),
                assess_confidence=_reject_confidence,
                verify_action=_reject_verification,
            )
            # Capture steering messages instead of delivering them anywhere.
            steering_messages: list[str] = []
            context.queue_steering_message_callback = steering_messages.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            blocked_result = (
                "[Blocked - active repair mutation scope: verification already identified the repair target.]"
            )
            runner._queue_blocked_active_repair_mutation_nudge(blocked_result)

            assert steering_messages
            nudge = steering_messages[0]
            assert str(repair_target) in nudge
            assert str(stylesheet) in nudge
            assert "before widening the change set" in nudge
      
        6340
        
        6341
        
        6342
        def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
            temp_dir: Path,
        ) -> None:
            """Check the late-reference-drift nudge mentions the planned file that is missing.

            The implementation plan lists four guide files, but only three are written
            to disk; the queued message must mention the fourth one.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
            )
            # Capture every steering message the runner queues.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append

            dod_store = DefinitionOfDoneStore(temp_dir)
            dod = create_definition_of_done("Create a multi-file guide from a reference")

            plan_text = (
                "# File Changes\n"
                "- `guide/index.html`\n"
                "- `guide/chapters/01-getting-started.html`\n"
                "- `guide/chapters/02-installation.html`\n"
                "- `guide/chapters/03-first-website.html`\n"
            )
            plan_path = temp_dir / "implementation.md"
            plan_path.write_text(plan_text)
            dod.implementation_plan = str(plan_path)

            # Create every planned file except `03-first-website.html`.
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            chapters_dir.mkdir(parents=True, exist_ok=True)
            existing_files = (
                (guide_dir / "index.html", "index"),
                (chapters_dir / "01-getting-started.html", "one"),
                (chapters_dir / "02-installation.html", "two"),
            )
            for file_path, file_body in existing_files:
                file_path.write_text(file_body)

            runner = ToolBatchRunner(context, dod_store)

            runner._queue_blocked_late_reference_drift_nudge(
                "[Blocked - late reference drift: several planned artifacts already exist.]",
                dod=dod,
            )

            assert queued
            nudge = queued[0]
            assert "03-first-website.html" in nudge
            assert "older reference materials" in nudge
      
        6394
        
        6395
        
        6396
        def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
            temp_dir: Path,
        ) -> None:
            """Check the completed-artifact-scope nudge switches to verify and cites the todo.

            Every path listed in the implementation plan exists on disk and one
            verification todo is pending; the context must end in ``verify`` mode and
            the queued message must mention that todo.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()

            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact.write_text(body)

            # The plan lists both directories and all three files, all of which exist.
            planned_paths = [guide_root, chapters, index_path, chapter_one, chapter_two]
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{planned}`" for planned in planned_paths),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
            )
            # Capture every steering message the runner queues.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file guide from a reference")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]

            pending_todo = {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            }
            sync_todos_to_definition_of_done(
                dod,
                [pending_todo],
                project_root=temp_dir,
            )

            runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
                dod=dod,
            )

            assert queued
            assert context.workflow_mode == "verify"
            nudge = queued[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "Verify all guide files are linked and complete" in nudge
            assert "Do not reopen earlier reference materials." in nudge
            assert "Verification should run next" in nudge
      
        6478
        
        6479
        
        6480
        def test_tool_batch_runner_blocked_post_build_audit_nudge_switches_to_verify(
            temp_dir: Path,
        ) -> None:
            """Check a post-build audit block puts the context in verify mode.

            Every path listed in the implementation plan exists on disk and no todos
            are synced; the queued message must tell the agent to move to verification
            or final confirmation.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()

            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact.write_text(body)

            # The plan lists both directories and all three files, all of which exist.
            planned_paths = [guide_root, chapters, index_path, chapter_one, chapter_two]
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{planned}`" for planned in planned_paths),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
            )
            # Capture every steering message the runner queues.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file guide from a reference")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]

            runner._queue_blocked_completed_artifact_scope_nudge(
                "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]",
                dod=dod,
            )

            assert queued
            assert context.workflow_mode == "verify"
            nudge = queued[0]
            assert "All explicitly planned artifacts already exist." in nudge
            assert "move to verification or final confirmation" in nudge
      
        6549
        
        6550
        
        6551
        @pytest.mark.asyncio
        async def test_tool_batch_runner_does_not_halt_on_repeated_post_build_audit_blocks(
            temp_dir: Path,
        ) -> None:
            """Check three blocked audit calls in one batch neither halt nor count as errors.

            The fake executor returns a BLOCKED outcome for each of three ``bash`` tool
            calls. The batch result must not be halted, must report zero consecutive
            errors, and the context must end in ``verify`` mode with a nudge queued.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guide"
            chapters = guide_root / "chapters"
            guide_root.mkdir(parents=True)
            chapters.mkdir()

            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-getting-started.html"
            chapter_two = chapters / "02-installation.html"
            for artifact, body in (
                (index_path, "index"),
                (chapter_one, "one"),
                (chapter_two, "two"),
            ):
                artifact.write_text(body)

            # The plan lists both directories and all three files, all of which exist.
            planned_paths = [guide_root, chapters, index_path, chapter_one, chapter_two]
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{planned}`" for planned in planned_paths),
                "",
            ]
            implementation_plan = temp_dir / "implementation.md"
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
            )
            # Capture every steering message the runner queues.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            dod = create_definition_of_done("Create a multi-file guide from a reference")
            dod.implementation_plan = str(implementation_plan)
            dod.verification_commands = [f"ls -la {guide_root}"]

            blocked_message = (
                "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]"
            )

            # Three identical audit commands with ids audit-1 .. audit-3.
            audit_command = f"cd {temp_dir} && ls -la guide/chapters/"
            tool_calls = [
                ToolCall(
                    id=f"audit-{index}",
                    name="bash",
                    arguments={"command": audit_command},
                )
                for index in range(1, 4)
            ]
            blocked_outcomes = [
                tool_outcome(
                    tool_call=audit_call,
                    output=blocked_message,
                    is_error=True,
                    state=ToolExecutionState.BLOCKED,
                )
                for audit_call in tool_calls
            ]
            executor = FakeExecutor(blocked_outcomes)

            events: list[AgentEvent] = []

            async def emit(event: AgentEvent) -> None:
                events.append(event)

            result = await runner.execute_batch(
                tool_calls=tool_calls,
                tool_source="native",
                pending_tool_calls_seen=set(),
                emit=emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert result.halted is False
            assert result.consecutive_errors == 0
            assert context.workflow_mode == "verify"
            assert queued
            assert any("move to verification or final confirmation" in nudge for nudge in queued)
      
        6658
        
        6659
        
        6660
        def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
            temp_dir: Path,
        ) -> None:
            """Check the declared-target nudge quotes the closest declared target.

            The blocked output ends with a "Closest declared local targets include"
            clause; the queued message must mention the written file path and that
            closest target in backticks.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
            )
            # Capture every steering message the runner queues.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            chapter_path = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
            blocked_write = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": chapter_path},
            )
            blocked_output = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
                "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
                "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
            )

            runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_output)

            assert queued
            nudge = queued[0]
            assert chapter_path in nudge
            assert "`chapters/02-installation.html`" in nudge
            assert "same file now" in nudge
      
        6708
        
        6709
        
        6710
        def test_tool_batch_runner_blocked_html_declared_target_nudge_without_close_match(
            temp_dir: Path,
        ) -> None:
            """Check the declared-target nudge wording when no closest target is given.

            The blocked output has no "Closest declared local targets" clause; the
            queued message must still mention a declared target but must not contain
            the "closest declared target(s)" wording.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
            )
            # Capture every steering message the runner queues.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            blocked_write = ToolCall(
                id="write-ch1",
                name="write",
                arguments={"file_path": str(temp_dir / "guide" / "chapters" / "introduction.html")},
            )
            blocked_output = (
                "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
                "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
                "introducing new sibling targets that the guide root does not declare; remove or replace "
                "undeclared hrefs like: troubleshooting.html. "
                "Already-declared local targets include: chapters/introduction.html, chapters/installation.html, "
                "chapters/configuration.html."
            )

            runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_output)

            assert queued
            nudge = queued[0]
            assert "Remove the invented hrefs or keep local links within the declared target set" in nudge
            assert "`chapters/installation.html`" in nudge
            assert "closest declared target(s)" not in nudge
      
        6759
        
        6760
        
        6761
        def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_root(
            temp_dir: Path,
        ) -> None:
            """Check the blocked file-creation nudge points at the guide root.

            The blocked output names the resolved guide root path and the undeclared
            page; the queued message must mention both and tell the agent to retry
            the file creation.
            """

            async def fail_on_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Any call to this stub fails the test.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def fail_on_verification(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Any call to this stub fails the test.
                raise AssertionError("Verification should not run in this scenario")

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=fail_on_confidence,
                verify_action=fail_on_verification,
            )
            # Capture every steering message the runner queues.
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
            dod = create_definition_of_done("Create a guide.")

            target = temp_dir / "guide" / "chapters" / "troubleshooting.html"
            # Resolved non-strictly because the root file is never created here.
            guide_root_file = (temp_dir / "guide" / "index.html").resolve(strict=False)

            blocked_write = ToolCall(
                id="write-troubleshooting",
                name="write",
                arguments={"file_path": str(target)},
            )
            blocked_output = (
                "[Blocked - HTML file creation falls outside the current declared artifact set] "
                "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
                f"update the guide root `{guide_root_file}` "
                "before creating undeclared sibling pages, for example: chapters/troubleshooting.html. "
                "Already-declared local targets include: chapters/advanced-topics.html, "
                "chapters/basic-usage.html, chapters/configuration.html"
            )

            runner._queue_blocked_html_declared_file_creation_nudge(
                blocked_write,
                blocked_output,
                dod=dod,
            )

            assert queued
            nudge = queued[0]
            assert "update" in nudge.lower()
            assert str(guide_root_file) in nudge
            assert "`chapters/troubleshooting.html`" in nudge
            assert "retry the file creation" in nudge
      
        6814
        
        6815
        
        6816
        def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
            temp_dir: Path,
        ) -> None:
            """Blocked write of an undeclared chapter should steer toward verification.

            The guide root and both chapters listed in the implementation plan are
            written to disk before the nudge is requested, so the queued steering
            message is expected to say the planned artifacts already exist and to
            omit the "update the guide root" advice.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: this scenario must never reach confidence scoring.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: this scenario must never reach action verification.
                raise AssertionError("Verification should not run in this scenario")

            # Lay out the guide tree: a root index plus the two planned chapters.
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            guide_dir.mkdir()
            chapters_dir.mkdir()

            index_page = guide_dir / "index.html"
            intro_page = chapters_dir / "01-introduction.html"
            install_page = chapters_dir / "02-installation.html"

            index_lines = [
                '<a href="chapters/01-introduction.html">Intro</a>',
                '<a href="chapters/02-installation.html">Install</a>',
                '<a href="../index.html">Back</a>',
                "",
            ]
            index_page.write_text("\n".join(index_lines))
            for chapter_page in (intro_page, install_page):
                chapter_page.write_text("<html></html>\n")

            # The plan declares exactly the three files that now exist on disk.
            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_page}`",
                f"- `{intro_page}`",
                f"- `{install_page}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a guide.")
            definition.implementation_plan = str(plan_path)
            definition.verification_commands = [f"ls -la {guide_dir}"]
            definition.touched_files = [
                str(planned) for planned in (index_page, intro_page, install_page)
            ]

            # An extra chapter that is neither planned nor linked from the root.
            extra_chapter = chapters_dir / "08-advanced-configuration.html"
            blocked_write = ToolCall(
                id="write-extra",
                name="write",
                arguments={"file_path": str(extra_chapter)},
            )
            blocked_message = (
                "[Blocked - HTML file creation falls outside the current declared artifact set] "
                "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
                f"update the guide root `{index_page.resolve(strict=False)}` before creating undeclared sibling pages, "
                "for example: chapters/08-advanced-configuration.html."
            )
            batch_runner._queue_blocked_html_declared_file_creation_nudge(
                blocked_write,
                blocked_message,
                dod=definition,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "All explicitly planned artifacts already exist on disk." in nudge
            assert "Do not expand the output set with `chapters/08-advanced-configuration.html`." in nudge
            assert "Move to verification or final confirmation using the files already on disk." in nudge
            assert "update the guide root" not in nudge
      
        6903
        
        6904
        
        6905
        def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
            temp_dir: Path,
        ) -> None:
            """Blocked edit that adds a missing link should steer toward repair.

            Every file named in the implementation plan is present on disk, so the
            queued steering message is expected to tell the model not to introduce
            new local-link targets and to repair the existing files instead.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: this scenario must never reach confidence scoring.
                raise AssertionError("Confidence scoring should not run in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: this scenario must never reach action verification.
                raise AssertionError("Verification should not run in this scenario")

            # Lay out the guide tree: a root index plus the two planned chapters.
            guide_dir = temp_dir / "guide"
            chapters_dir = guide_dir / "chapters"
            guide_dir.mkdir()
            chapters_dir.mkdir()

            index_page = guide_dir / "index.html"
            intro_page = chapters_dir / "01-introduction.html"
            install_page = chapters_dir / "02-installation.html"

            index_lines = [
                '<a href="chapters/01-introduction.html">Intro</a>',
                '<a href="chapters/02-installation.html">Install</a>',
                '<a href="../index.html">Back</a>',
                "",
            ]
            index_page.write_text("\n".join(index_lines))
            for chapter_page in (intro_page, install_page):
                chapter_page.write_text("<html></html>\n")

            # The plan declares exactly the three files that now exist on disk.
            plan_path = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_page}`",
                f"- `{intro_page}`",
                f"- `{install_page}`",
                "",
            ]
            plan_path.write_text("\n".join(plan_lines))

            runtime_context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
            )
            steering_messages: list[str] = []
            runtime_context.queue_steering_message_callback = steering_messages.append
            batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

            definition = create_definition_of_done("Create a guide.")
            definition.implementation_plan = str(plan_path)
            definition.verification_commands = [f"ls -la {guide_dir}"]
            definition.touched_files = [
                str(planned) for planned in (index_page, intro_page, install_page)
            ]

            # The blocked call is an edit of the guide root itself.
            blocked_edit = ToolCall(
                id="edit-root",
                name="edit",
                arguments={"file_path": str(index_page)},
            )
            blocked_message = (
                "[Blocked - Edited HTML links point to files that do not exist] "
                "Suggestion: Use only existing local targets for href values and avoid introducing missing links, "
                "for example fix: chapters/08-advanced-configuration.html"
            )
            batch_runner._queue_blocked_html_missing_target_nudge(
                blocked_edit,
                blocked_message,
                dod=definition,
            )

            assert steering_messages
            nudge = steering_messages[0]
            assert "All explicitly planned artifacts already exist on disk." in nudge
            assert "Do not introduce new local-link targets beyond the current output set." in nudge
            assert "Repair the existing generated files instead of expanding the guide." in nudge
      
        6989
        
        6990
        
        6991
        @pytest.mark.asyncio
        async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
            temp_dir: Path,
        ) -> None:
            """Blocked write with an empty ``file_path`` should name the next artifact.

            The plan lists an index and two chapters, but only the index and the
            first chapter are written to disk. The queued steering message is
            expected to point at the second chapter, and the blocked message is
            expected to be recorded as the latest recovery attempt error.
            """

            async def assess_confidence(
                tool_name: str,
                tool_args: dict,
                context: str,
            ) -> ConfidenceAssessment:
                # Tripwire: this scenario must never reach confidence scoring.
                raise AssertionError("Confidence scoring should be disabled in this scenario")

            async def verify_action(
                tool_name: str,
                tool_args: dict,
                result: str,
                expected: str = "",
            ) -> ActionVerification:
                # Tripwire: this scenario must never reach action verification.
                raise AssertionError("Verification should not run in this scenario")

            guide_root = temp_dir / "guides" / "nginx"
            chapters = guide_root / "chapters"
            chapters.mkdir(parents=True)

            index_path = guide_root / "index.html"
            chapter_one = chapters / "01-introduction.html"
            chapter_two = chapters / "02-installation.html"
            # chapter_two is deliberately left unwritten; it is the artifact the
            # nudge is expected to name.
            index_path.write_text("<html></html>\n")
            chapter_one.write_text("<h1>Intro</h1>\n")

            implementation_plan = temp_dir / "implementation.md"
            plan_lines = [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
            implementation_plan.write_text("\n".join(plan_lines))

            context = build_context(
                temp_dir=temp_dir,
                messages=[],
                safeguards=FakeSafeguards(),
                assess_confidence=assess_confidence,
                verify_action=verify_action,
                auto_recover=False,
            )
            queued: list[str] = []
            context.queue_steering_message_callback = queued.append
            runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

            tool_call = ToolCall(
                id="write-2",
                name="write",
                arguments={"file_path": "", "content": "<html></html>\n"},
            )
            blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"

            # The fake executor hands back a single pre-built BLOCKED outcome.
            blocked_result = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=blocked_message,
                result_content=blocked_message,
                is_error=True,
            )
            blocked_outcome = ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.BLOCKED,
                message=blocked_result,
                event_content=blocked_message,
                is_error=True,
                result_output=blocked_message,
            )
            executor = FakeExecutor([blocked_outcome])

            dod = create_definition_of_done("Create a multi-file nginx guide.")
            dod.implementation_plan = str(implementation_plan)
            dod.touched_files.extend([str(index_path), str(chapter_one)])
            dod.pending_items.append("Creating Chapter 2: Installation and Setup")

            await runner.execute_batch(
                tool_calls=[tool_call],
                tool_source="assistant",
                pending_tool_calls_seen=set(),
                emit=_noop_emit,
                summary=TurnSummary(final_response=""),
                dod=dod,
                executor=executor,  # type: ignore[arg-type]
                on_confirmation=None,
                on_user_question=None,
                emit_confirmation=None,
                consecutive_errors=0,
            )

            assert queued
            assert "did not provide a valid `file_path`" in queued[0]
            assert "Resume by creating `02-installation.html` now." in queued[0]
            expected_hint = (
                f"Prefer one `write` call for `{display_runtime_path(chapter_two)}` instead of more rereads."
            )
            assert expected_hint in queued[0]
            assert context.recovery_context is not None
            assert context.recovery_context.attempts[-1].error == blocked_message