Python · 173054 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.permissions import (
20 PermissionMode,
21 build_permission_policy,
22 load_permission_rules,
23 )
24 from loader.runtime.reasoning_types import (
25 ActionVerification,
26 ConfidenceAssessment,
27 ConfidenceLevel,
28 )
29 from loader.runtime.recovery import RecoveryContext
30 from loader.runtime.tool_batches import (
31 ToolBatchRunner,
32 )
33 from loader.runtime.tool_batches import (
34 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
35 )
36 from loader.runtime.workflow import sync_todos_to_definition_of_done
37 from loader.tools.base import ToolResult as RegistryToolResult
38 from loader.tools.base import create_default_registry
39 from tests.helpers.runtime_harness import ScriptedBackend
40
41
class FakeSession:
    """In-memory session double that records messages and timeline entries."""

    def __init__(self, messages: list[Message]) -> None:
        """Start with a private copy of *messages* and an empty workflow timeline."""
        self.messages = [*messages]
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Add *message* to the end of the recorded transcript."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Add *entry* to the end of the recorded workflow timeline."""
        self.workflow_timeline.append(entry)
52
53
class FakeCodeFilter:
    """Code-filter double whose only operation is a no-op ``reset``."""

    def reset(self) -> None:
        """Do nothing; there is no filter state to clear."""
57
58
class FakeSafeguards:
    """Safeguards double: content passes through and steering is never requested.

    Only ``detect_loop`` is configurable; it returns the tuple supplied at
    construction time.
    """

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        """Store the canned ``detect_loop`` answer and create placeholder collaborators."""
        self._detect_loop_result = detect_loop_result
        # Opaque placeholders for the tracker and validator attributes.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()

    # -- content filters: identity ------------------------------------------

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    # -- steering: always off -----------------------------------------------

    def should_steer(self) -> bool:
        """Report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Return ``None``; this double never has a steering message."""

    def record_response(self, content: str) -> None:
        """Ignore the response; nothing is recorded."""

    # -- loop detection -----------------------------------------------------

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report that no text loop was found."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return the loop-detection result configured in ``__init__``."""
        return self._detect_loop_result
86
87
class FakeExecutor:
    """Executor double that replays pre-queued outcomes in first-in, first-out order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        """Queue a private copy of *outcomes* and start with an empty call log."""
        self.calls: list[ToolCall] = []
        self._outcomes = [*outcomes]

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Log *tool_call* and hand back the next queued outcome.

        Extra keyword arguments are accepted and ignored.

        Raises:
            AssertionError: if the queue is already empty.
        """
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
98
99
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at *temp_dir* for the tool-batch tests.

    The context uses the default tool registry, a workspace-write permission
    policy built from the rules loaded for *temp_dir*, a ``ScriptedBackend``,
    and a ``FakeSession`` seeded with *messages*. The reasoning services are
    the supplied *assess_confidence* and *verify_action* callables.

    Args:
        temp_dir: Project and workspace root.
        messages: Initial session transcript.
        safeguards: Safeguards double installed on the context.
        assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
        verify_action: Callable exposed as ``reasoning.verify_action``.
        recovery_context: Optional pre-existing recovery state.
        confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
        verification: Value for ``config.reasoning.verification``.
        auto_recover: Value for ``config.auto_recover``.
        min_confidence_for_action: Value for
            ``config.reasoning.min_confidence_for_action``.

    Returns:
        The configured ``RuntimeContext`` in ``"execute"`` workflow mode.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)

    permission_rules = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=permission_rules.rules,
    )

    # Plain namespaces stand in for the real config and reasoning objects.
    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=auto_recover,
        reasoning=reasoning_settings,
    )
    reasoning_services = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=permission_rules,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=reasoning_services,
        recovery_context=recovery_context,
    )
155
156
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a ``ToolExecutionOutcome`` in which every text field equals *output*.

    Args:
        tool_call: The call this outcome answers; its ``id`` tags the result message.
        output: Text used for the display content, result content, event
            content, result output, and registry result output.
        is_error: Error flag copied onto the message, outcome, and registry result.
        state: Execution state recorded on the outcome.
        metadata: Registry-result metadata; a falsy value becomes an empty dict.

    Returns:
        The assembled outcome.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata if metadata else {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
183
184
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """Exercise the confidence gate with an assessor that reports LOW confidence.

    The assertions require that no action is taken and nothing reaches the
    executor, that the assessor received context containing the user's
    request, that the transcript ends with a ``[LOW CONFIDENCE WARNING]`` user
    message, and that a ``confidence`` event was emitted.
    """
    assessor_inputs: dict[str, str] = {}
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Keep the context string so it can be inspected after the batch runs.
        assessor_inputs["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    transcript = [
        Message(role=Role.USER, content="Please inspect the project."),
        Message(role=Role.ASSISTANT, content="I will read the file next."),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=transcript,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor([tool_outcome(tool_call=read_call, output="unused", is_error=False)])

    result = await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in assessor_inputs["context"]

    last_message = context.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert "confidence" in [event.type for event in emitted]
244
245
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """Run one failing ``bash`` call with ``auto_recover`` enabled.

    The assertions require that the context ends up with a recovery context,
    that the summary holds tool-result messages whose last entry is also the
    last session message, and that a ``recovery`` event was emitted.
    """
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    failing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failure = tool_outcome(tool_call=failing_call, output="command failed", is_error=True)
    executor = FakeExecutor([failure])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[failing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is not None
    assert summary.tool_result_messages
    assert context.session.messages[-1] == summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in emitted)
289
290
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """Check that registry-result metadata is carried on the ``tool_result`` event.

    A background ``bash`` call succeeds with job metadata attached; the first
    emitted ``tool_result`` event must expose that same metadata as
    ``tool_metadata``.
    """
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    server_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    metadata = {"job_id": "bash-1", "status": "running", "background": True}
    started = tool_outcome(
        tool_call=server_call,
        output="Started bash job bash-1",
        is_error=False,
        metadata=metadata,
    )
    executor = FakeExecutor([started])

    await runner.execute_batch(
        tool_calls=[server_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Take the first tool_result event, exactly as emitted.
    tool_result = next(filter(lambda event: event.type == "tool_result", emitted))
    assert tool_result.tool_metadata == metadata
349
350
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """Run a successful ``read`` with verification enabled and a failing verifier.

    The assertions require that the verifier was called once with the tool
    output, that the pre-existing recovery context is kept and records the
    read as a successful step, that the transcript ends with the tool result,
    and that a ``verification`` event was emitted.
    """
    verified_results: list[str] = []
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        # Log what was verified, then report a mismatch that needs correction.
        verified_results.append(result)
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    existing_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor([tool_outcome(tool_call=read_call, output="file contents", is_error=False)])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verified_results == ["file contents"]
    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [("read", {"file_path": "README.md"})]

    last_message = context.session.messages[-1]
    assert last_message.role == Role.TOOL
    assert last_message.content == "file contents"
    assert "verification" in [event.type for event in emitted]
414
415
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """Run a successful ``bash`` listing while a recovery context already exists.

    The recovery context starts with one failed ``read`` attempt. After the
    batch, the same recovery object must still be attached to the context and
    must list the ``bash`` call as its only successful step.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    missing_file_args = {"file_path": "chapters/04-data-types.html"}
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt("read", missing_file_args, "File not found")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    listing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "ls chapters"})
    listing = tool_outcome(tool_call=listing_call, output="01-introduction.html", is_error=False)
    executor = FakeExecutor([listing])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[listing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [("bash", {"command": "ls chapters"})]
482
483
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """Run a successful ``patch`` while a recovery context already exists.

    The recovery context starts with one failed ``read`` attempt. After the
    batch, the context must no longer hold a recovery context.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    missing_file_args = {"file_path": "chapters/04-data-types.html"}
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt("read", missing_file_args, "File not found")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    single_line_hunk = {
        "old_start": 1,
        "old_lines": 1,
        "new_start": 1,
        "new_lines": 1,
        "lines": ["-a", "+b"],
    }
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={"file_path": "index.html", "hunks": [single_line_hunk]},
    )
    patched = tool_outcome(tool_call=patch_call, output="Patched index.html", is_error=False)
    executor = FakeExecutor([patched])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[patch_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is None
550
551
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """Replay a duplicate ``read`` of ``index.html`` and inspect the queued nudge.

    Setup: the transcript already holds a glob observation and earlier reads;
    three chapter files and ``index.html`` exist on disk; the implementation
    plan also lists ``04-variables.html``, which does not exist. The executor
    reports the new read as ``DUPLICATE``.

    The assertions require exactly one persistent steering message, containing
    the reuse hint and the missing-artifact guidance for ``04-variables.html``,
    and no ephemeral steering messages.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    index_path = temp_dir / "index.html"
    chapters_dir = temp_dir / "chapters"
    existing_chapters = ("01-introduction.html", "02-setup.html", "03-basics.html")
    missing_chapter = chapters_dir / "04-variables.html"

    # The glob observation joins paths with a literal "/", not with Path joins.
    glob_observation = "Observation [glob]: Result: " + "\n".join(
        f"{temp_dir}/chapters/{name}" for name in existing_chapters
    )
    first_chapter_heading = "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    transcript = [
        Message(role=Role.TOOL, content=glob_observation, tool_results=[]),
        Message(
            role=Role.ASSISTANT,
            content="I already inspected the first chapter title.",
            tool_calls=[
                ToolCall(
                    id="read-ch1",
                    name="read",
                    arguments={"file_path": str(chapters_dir / "01-introduction.html")},
                )
            ],
        ),
        Message.tool_result_message(
            tool_call_id="read-ch1",
            display_content=first_chapter_heading,
            result_content=first_chapter_heading,
        ),
        Message(
            role=Role.ASSISTANT,
            content="I should update the index now.",
            tool_calls=[
                ToolCall(
                    id="read-index",
                    name="read",
                    arguments={"file_path": str(index_path)},
                )
            ],
        ),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=transcript,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Create the on-disk fixture only after the context has been built.
    chapters_dir.mkdir()
    index_path.write_text("<ul></ul>\n")
    for name, heading in zip(existing_chapters, ("Intro", "Setup", "Basics")):
        (chapters_dir / name).write_text(f"<h1>{heading}</h1>\n")

    planned_paths = [index_path]
    planned_paths.extend(chapters_dir / name for name in existing_chapters)
    planned_paths.append(missing_chapter)
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{path}`" for path in planned_paths)
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context.session.current_task = (
        f"Update {index_path} with the right chapter links."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    duplicate_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index_path} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=duplicate_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=duplicate_call.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[duplicate_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    expected_fragments = (
        "Reuse the earlier observation instead of repeating it.",
        "A declared output artifact is still missing.",
        "Resume by creating `04-variables.html` now.",
        f"Prefer one `write` call for `{missing_chapter}` instead of more rereads.",
    )
    for fragment in expected_fragments:
        assert fragment in nudge
    assert ephemeral_messages == []
700
701
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """Run a ``write`` followed by a stale ``TodoWrite`` in one batch.

    Both chapter todos start as pending. The batch writes chapter 3 and then
    submits a todo list that still marks both chapters as pending. The
    assertions require chapter 3 to end up completed and not pending, while
    chapter 4 stays pending.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending_todos() -> list[dict[str, str]]:
        # Build fresh dicts on every call so no todo payload is shared.
        return [
            {
                "content": f"Create {filename}",
                "active_form": f"Creating {filename}",
                "status": "pending",
            }
            for filename in ("03-first-website.html", "04-configuration-basics.html")
        ]

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, pending_todos())

    chapter_path = temp_dir.joinpath("guides", "nginx", "chapters", "03-first-website.html")
    chapter_path.parent.mkdir(parents=True)

    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    queued_outcomes = [
        tool_outcome(
            tool_call=write_call,
            output=f"Successfully wrote {chapter_path}",
            is_error=False,
        ),
        tool_outcome(
            tool_call=stale_todo_call,
            output="Todos updated",
            is_error=False,
            metadata={"new_todos": pending_todos()},
        ),
    ]
    executor = FakeExecutor(queued_outcomes)
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    written_todo = "Create 03-first-website.html"
    assert written_todo in dod.completed_items
    assert written_todo not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
819
820
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """Run a successful ``glob`` over two chapter files and check what is queued.

    As written, the assertions require that no persistent or ephemeral
    steering message is queued, that exactly one tool-result message is
    recorded, and that it does not contain ``Verified chapter inventory:``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # The on-disk fixture is created before the context is built.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    intro_path = chapters / "01-introduction.html"
    setup_path = chapters / "02-setup.html"
    intro_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    setup_path.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
    index_path = temp_dir / "index.html"
    index_path.write_text("<ul></ul>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = (
        f"Update {index_path} so the chapter links match the sibling files."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters), "pattern": "*.html"},
    )
    glob_listing = "\n".join(str(path) for path in (intro_path, setup_path))
    executor = FakeExecutor(
        [tool_outcome(tool_call=glob_call, output=glob_listing, is_error=False)]
    )
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert len(summary.tool_result_messages) == 1
    only_result = summary.tool_result_messages[0]
    assert "Verified chapter inventory:" not in only_result.content
905
906
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """Run a successful ``edit`` of ``index.html`` and check the follow-up output.

    The file on disk already holds the new chapter list, because the fake
    executor does not apply the edit. The assertions require that no
    tool-result message contains ``Semantic verification preview:`` and that
    no persistent or ephemeral steering message is queued.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    task = "Update index.html so every chapter link and title matches the real HTML files in chapters/."

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )

    old_block = "".join(
        [
            '<ul class="chapter-list">\n',
            ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n',
            ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n',
            "</ul>\n",
        ]
    )
    new_block = "".join(
        [
            '<ul class="chapter-list">\n',
            ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
            ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
            "</ul>\n",
        ]
    )
    index_path = temp_dir / "index.html"
    # Write the post-edit content directly; the executor below is a fake.
    index_path.write_text(new_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = task
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": old_block,
            "new_string": new_block,
        },
    )
    edited = tool_outcome(
        tool_call=edit_call,
        output=f"Successfully edited {index_path}",
        is_error=False,
    )
    executor = FakeExecutor([edited])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done(task),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert not any(
        "Semantic verification preview:" in message.content
        for message in summary.tool_result_messages
    )
    assert persistent_messages == []
    assert ephemeral_messages == []
1007
1008
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """A reference-only read of ``index.html`` must not queue any steering.

    The task prompt only asks to study an existing guide. After a successful
    ``read`` of the table-of-contents page, the test expects no persistent or
    ephemeral steering messages and no "Semantic verification preview:" text
    in the recorded tool result messages.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    # Lay out a tiny guide: two chapter pages plus an index that links to them.
    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    chapter_pages = (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    )
    for page_name, page_html in chapter_pages:
        (chapters_dir / page_name).write_text(page_html)

    toc_html = (
        "<h2>Table of Contents</h2>\n"
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(toc_html)

    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    ctx.session.current_task = prompt  # type: ignore[attr-defined]

    # Capture both steering channels so the test can prove they stay silent.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=index_path.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done(prompt)
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert not any(
        "Semantic verification preview:" in recorded.content
        for recorded in summary.tool_result_messages
    )
1101
1102
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """A successful reference read completes the discovery todo and hands off.

    The test expects the "Examine ..." todo to land in ``dod.completed_items``
    and a persistent steering message that names the next pending todo and
    tells the agent to create ``chapters/``, without mentioning the reference
    file that was just read. No ephemeral steering is expected.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference material the agent reads during discovery.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text(reference_html)

    # Planned (not yet created) output locations, directory listed first.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    plan_text = (
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{chapters_dir}/`\n"
        f"- `{nginx_root / 'index.html'}`\n"
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(plan_text)

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    todo_titles = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": "pending",
            }
            for title in todo_titles
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    discovery_todo = todo_titles[0]
    assert discovery_todo in dod.completed_items
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in steering
        for steering in persistent_messages
    )
    assert any(
        "Resume by creating `chapters/` now." in steering
        for steering in persistent_messages
    )
    assert not any("01-introduction.html" in steering for steering in persistent_messages)
    assert ephemeral_messages == []
1221
1222
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
    temp_dir: Path,
) -> None:
    """The directory step is suggested even when the plan lists the file first.

    The implementation plan names ``index.html`` before ``chapters/``. After a
    successful reference read, the test expects persistent steering that names
    the directory-structure todo and says to create ``chapters/``, and that
    never says "Next step: create `index.html`.". No ephemeral steering is
    expected.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference material the agent reads during discovery.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text(reference_html)

    # Planned output locations; note the file entry precedes the directory.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    plan_text = (
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root / 'index.html'}`\n"
        f"- `{chapters_dir}/`\n"
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(plan_text)

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    todo_titles = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    pending_todos = [
        {
            "content": title,
            "active_form": f"Working on: {title}",
            "status": "pending",
        }
        for title in todo_titles
    ]
    sync_todos_to_definition_of_done(dod, pending_todos, project_root=temp_dir)

    read_call = ToolCall(
        id="read-reference-index-first",
        name="read",
        arguments={"file_path": str(reference)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in steering
        for steering in persistent_messages
    )
    assert any(
        "Resume by creating `chapters/` now." in steering
        for steering in persistent_messages
    )
    assert not any(
        "Next step: create `index.html`." in steering
        for steering in persistent_messages
    )
    assert ephemeral_messages == []
1342
1343
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """A duplicate reference read steers toward the next pending todo.

    The executor reports the read as ``DUPLICATE``. The test expects exactly
    one persistent steering message that tells the agent to reuse the earlier
    observation and continue with the directory-structure todo, without any
    "Update `" instruction. No ephemeral steering is expected.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    reference = temp_dir / "fortran" / "index.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")

    # Session history already holds the earlier read of the same file.
    earlier_read = Message(
        role=Role.TOOL,
        content=(
            "Observation [read]: Result: "
            "<h1>Fortran Beginner's Guide</h1>\n"
        ),
    )
    ctx = build_context(
        temp_dir=temp_dir,
        messages=[earlier_read],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )
    ctx.session.current_task = prompt
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done(prompt)
    # Discovery is already completed; the two creation todos remain pending.
    todo_states = (
        (
            "Examine the existing Fortran guide structure to understand the cadence and format",
            "completed",
        ),
        ("Create the nginx directory structure", "pending"),
        ("Create the nginx index.html file", "pending"),
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": status,
            }
            for title, status in todo_states
        ],
    )

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference)},
    )
    skip_note = (
        "[Skipped - duplicate action: Already read "
        f"{reference} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_note,
        result_content=skip_note,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_note,
        is_error=False,
        result_output=skip_note,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    steering = persistent_messages[0]
    assert "Reuse the earlier observation instead of repeating it." in steering
    assert (
        "Continue with the next pending item: `Create the nginx directory structure`"
        in steering
    )
    assert "Update `" not in steering
    assert ephemeral_messages == []
1466
1467
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """A missing planned file is preferred over the next generic todo.

    One planned chapter already exists on disk but ``index.html`` does not.
    After a successful reference read, the test expects persistent steering
    that confirms the discovery todo and says to create ``index.html``, and
    that does not hand off to the "Create each chapter file ..." todo. No
    ephemeral steering is expected.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    # Target guide: the first chapter exists, the index is deliberately absent.
    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    chapter_one = chapters_dir / "01-introduction.html"
    chapter_one.write_text("<html></html>\n")
    index_path = guide_root / "index.html"

    # Reference guide chapter that the agent reads.
    reference = (
        temp_dir / "Loader" / "guides" / "fortran" / "chapters" / "01-introduction.html"
    )
    reference.parent.mkdir(parents=True, exist_ok=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    plan_text = (
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{guide_root}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_path}`\n"
        f"- `{chapter_one}`\n"
        f"- `{chapters_dir / '02-installation.html'}`\n"
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(plan_text)

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.append(str(chapter_one))
    todo_titles = (
        "Examine the existing Fortran guide structure to understand the format and cadence",
        "Create each chapter file with appropriate content",
        "Ensure all files follow the same structure and style as the Fortran guide",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": "pending",
            }
            for title in todo_titles
        ],
    )

    read_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference)},
    )
    read_output = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    read_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=read_output,
        result_content=read_output,
    )
    executed_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.EXECUTED,
        message=read_result,
        event_content=read_output,
        is_error=False,
        result_output=read_output,
    )
    executor = FakeExecutor([executed_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    assert any(
        "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
        in steering
        for steering in persistent_messages
    )
    assert any(
        "Resume by creating `index.html` now." in steering
        for steering in persistent_messages
    )
    assert all(
        "Continue with the next pending item: `Create each chapter file with appropriate content`"
        not in steering
        for steering in persistent_messages
    )
    assert ephemeral_messages == []
1602
1603
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """With every planned file on disk, an unplanned creation todo is skipped.

    All artifacts named in the implementation plan exist. After a duplicate
    read, the test expects exactly one persistent steering message that
    mentions the "Verify all guide files ..." pending item and not the
    "Create 07-performance-tuning.html" item. No ephemeral steering is
    expected.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    # Create every artifact that the implementation plan lists.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    for artifact, body in (
        (index_path, "<html></html>\n"),
        (chapter_one, "<h1>One</h1>\n"),
        (chapter_two, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    plan_text = (
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{guide_root}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_path}`\n"
        f"- `{chapter_one}`\n"
        f"- `{chapter_two}`\n"
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(plan_text)

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # The first pending item names a file that the plan never listed.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Verify all guide files are linked and complete",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    skip_note = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_note,
        result_content=skip_note,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_note,
        is_error=False,
        result_output=skip_note,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    steering = persistent_messages[0]
    assert "Verify all guide files are linked and complete" in steering
    assert "Create 07-performance-tuning.html" not in steering
    assert ephemeral_messages == []
1718
1719
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """With the plan complete, a duplicate read hands off to verification.

    All planned artifacts exist and the definition of done carries a
    verification command. After a duplicate read, the test expects exactly one
    persistent steering message stating that all planned artifacts exist and
    that the agent should move to verification or final confirmation, without
    mentioning the unplanned "Create 07-performance-tuning.html" item. No
    ephemeral steering is expected.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    # Create every artifact that the implementation plan lists.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    for artifact, body in (
        (index_path, "<html></html>\n"),
        (chapter_one, "<h1>One</h1>\n"),
        (chapter_two, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    plan_text = (
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{guide_root}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_path}`\n"
        f"- `{chapter_one}`\n"
        f"- `{chapter_two}`\n"
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(plan_text)

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    # The first pending item names a file that the plan never listed.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    skip_note = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_note,
        result_content=skip_note,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_note,
        is_error=False,
        result_output=skip_note,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    steering = persistent_messages[0]
    assert "All explicitly planned artifacts already exist." in steering
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in steering
    )
    assert "Create 07-performance-tuning.html" not in steering
    assert ephemeral_messages == []
1838
1839
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Creation todos for files that already exist are not surfaced again.

    All planned artifacts exist, yet the pending items still contain
    "Create ..." / "Creating ..." entries for two of them. After a duplicate
    read, the test expects exactly one persistent steering message that
    announces the plan is complete and points to verification, without
    repeating either stale creation item. No ephemeral steering is expected.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    # Create every artifact that the implementation plan lists.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    for artifact, body in (
        (index_path, "<html></html>\n"),
        (chapter_one, "<h1>One</h1>\n"),
        (chapter_two, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    plan_text = (
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{guide_root}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_path}`\n"
        f"- `{chapter_one}`\n"
        f"- `{chapter_two}`\n"
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(plan_text)

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    # Both creation items refer to chapter files that were written above.
    dod.pending_items = [
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    skip_note = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_note,
        result_content=skip_note,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_note,
        is_error=False,
        result_output=skip_note,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    steering = persistent_messages[0]
    assert "All explicitly planned artifacts already exist." in steering
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in steering
    )
    assert "Create 01-getting-started.html" not in steering
    assert "Creating 02-installation.html" not in steering
    assert ephemeral_messages == []
1960
1961
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """After a reference read, steering pushes the agent to make the change.

    With a discovery todo followed by a creation todo, the test expects
    persistent steering that names the creation todo as the next pending item
    and tells the agent to stop gathering reference material and perform the
    change. No ephemeral steering is expected.
    """

    async def fail_on_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means confidence scoring ran unexpectedly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_on_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means action verification ran unexpectedly.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference material the agent reads during discovery.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text(reference_html)

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_on_confidence,
        verify_action=fail_on_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    ctx.queue_steering_message_callback = persistent_messages.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    todo_titles = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx index.html file",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": "pending",
            }
            for title in todo_titles
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Continue with the next pending item: `Create the nginx index.html file`"
        in steering
        for steering in persistent_messages
    )
    assert any(
        "stop gathering more reference material and perform the change now" in steering
        for steering in persistent_messages
    )
    assert ephemeral_messages == []
2054
2055
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """Check that the handoff after a discovery read is queued persistently.

    Scenario: two pending todos (a discovery item, then a directory-creation
    item) and one scripted, successful ``read`` of a reference chapter.
    Expectation: the persistent steering queue names the directory-creation
    item as the next pending step, and nothing is queued as ephemeral.
    """

    async def refuse_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # The scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def refuse_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # The scenario must never reach action verification.
        raise AssertionError("Verification should not run for this scenario")

    # Reference chapter on disk that the scripted ``read`` call points at.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text(reference_html)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output=reference_html, is_error=False)]
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=refuse_confidence,
        verify_action=refuse_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the test can tell them apart.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = (
        ephemeral_messages.append
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    discovery_item = "First, examine the existing fortran guide structure and content"
    creation_item = "Create the nginx directory structure"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": item,
                "active_form": f"Working on: {item}",
                "status": "pending",
            }
            for item in (discovery_item, creation_item)
        ],
    )

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    expected_handoff = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert persistent_messages
    assert any(expected_handoff in queued for queued in persistent_messages)
    assert ephemeral_messages == []
2145
2146
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_stays_quiet_after_setup_mkdir(
    temp_dir: Path,
) -> None:
    """Check that a setup ``mkdir`` queues no steering message at all.

    Scenario: the implementation plan lists a chapters directory and an
    ``index.html``; the only scripted tool call is a successful
    ``bash`` ``mkdir -p`` for the chapters directory.
    Expectation: both the persistent and the ephemeral steering queues stay
    empty after the batch.
    """

    async def refuse_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def refuse_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"

    # Only the plan file is written; the planned paths are not created here.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    mkdir_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=mkdir_call, output="", is_error=False)]
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=refuse_confidence,
        verify_action=refuse_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = (
        ephemeral_messages.append
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    pending_todos = [
        {
            "content": "Create the nginx directory structure",
            "active_form": "Creating the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Develop the main index.html file with proper structure",
            "active_form": "Developing the main index.html file with proper structure",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Neither channel may receive a nudge in this scenario.
    assert persistent_messages == []
    assert ephemeral_messages == []
2245
2246
@pytest.mark.asyncio
async def test_tool_batch_runner_first_file_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """Check the handoff queued after the first planned file is written.

    Scenario: the plan lists the chapters directory, ``index.html`` and one
    chapter file; the session has no prior messages and the scripted tool
    call is a successful ``write`` of ``index.html``.
    Expectation: the last persistent steering message confirms progress,
    names ``01-introduction.html`` as the next step with its resolved path,
    and discourages rereading; nothing is queued as ephemeral.
    """

    async def refuse_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def refuse_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"
    first_chapter = chapters / "01-introduction.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{first_chapter}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
        ]
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=refuse_confidence,
        verify_action=refuse_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = (
        ephemeral_messages.append
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    pending_todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    handoff = persistent_messages[-1]
    # The handoff is expected to quote the resolved path of the next file.
    write_hint = (
        "Prefer one `write(file_path=..., content=...)` call for "
        f"`{first_chapter.resolve(strict=False)}` now."
    )
    assert "Confirmed progress:" in handoff
    assert "Next step: create `01-introduction.html`." in handoff
    assert write_hint in handoff
    assert (
        "Do not reread reference material or spend the next turn on bookkeeping."
        in handoff
    )
    assert ephemeral_messages == []
2360
2361
@pytest.mark.asyncio
async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt(
    temp_dir: Path,
) -> None:
    """Check the first-file handoff when the session holds a recovery prompt.

    Scenario: same plan and ``index.html`` write as the persistent first-file
    case, but the session already contains a user message starting with
    ``[EMPTY ASSISTANT RESPONSE]``.
    Expectation: nothing is queued persistently, and the last ephemeral
    steering message tells the model to resume by creating
    ``01-introduction.html``.
    """

    async def refuse_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def refuse_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapters / '01-introduction.html'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    # Prior recovery prompt already present in the session history.
    recovery_prompt = Message(
        role=Role.USER,
        content=(
            "[EMPTY ASSISTANT RESPONSE]\n"
            "Respond with that concrete mutation tool call now. Do not return an empty response."
        ),
    )

    write_call = ToolCall(
        id="write-index-recovered",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
        ]
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[recovery_prompt],
        safeguards=FakeSafeguards(),
        assess_confidence=refuse_confidence,
        verify_action=refuse_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = (
        ephemeral_messages.append
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    pending_todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages
    resume_nudge = ephemeral_messages[-1]
    assert "Resume by creating `01-introduction.html` now." in resume_nudge
2477
2478
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """Check that the duplicate-read nudge targets the missing planned file.

    Scenario: ``index.html`` and the first chapter exist, the planned
    ``06-ssl-configuration.html`` does not, and the first pending todo is a
    review-style item. The duplicate-observation nudge is triggered directly
    for a ``read`` of ``index.html``.
    Expectation: the last persistent message mentions the missing chapter,
    warns against switching into review mode, and does not tell the model to
    continue with the review item.
    """

    async def refuse_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def refuse_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    missing_chapter = chapters / "06-ssl-configuration.html"
    chapter_one.write_text("<h1>One</h1>\n")
    index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{missing_chapter}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=refuse_confidence,
        verify_action=refuse_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = (
        ephemeral_messages.append
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    review_item = "Ensure all files are properly linked and formatted consistently"
    final_chapter_item = "Create the final chapter (06-ssl-configuration.html)"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": item,
                "active_form": f"Working on: {item}",
                "status": "pending",
            }
            for item in (review_item, final_chapter_item)
        ],
    )

    # Precondition: the helper itself reports that the missing file wins.
    assert tool_batches_should_prioritize_missing_artifact(
        dod=dod,
        next_pending=dod.pending_items[0],
        missing_artifact=(missing_chapter, False),
        project_root=temp_dir,
    )

    duplicate_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    runner._queue_duplicate_observation_nudge(duplicate_read, dod=dod)  # type: ignore[attr-defined]

    assert persistent_messages
    nudge = persistent_messages[-1]
    review_handoff = (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
    )
    assert "06-ssl-configuration.html" in nudge
    assert "Do not switch into review or consistency-check mode" in nudge
    assert review_handoff not in nudge
2573
2574
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Check the handoff queued once every planned file is on disk.

    Scenario: the index and both planned chapters already exist, the creation
    todo is completed, a review-style todo is pending, and the scripted tool
    call is a successful ``write`` of the second chapter.
    Expectation: the persistent queue contains the "all planned artifacts
    exist" statement, the pending review item, and the instruction to move to
    verification.
    """

    async def refuse_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def refuse_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    # Every planned file exists before the batch runs.
    index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    final_write = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(chapter_two),
            "content": "<h1>Two</h1>\n",
        },
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=final_write,
                output=f"Successfully wrote {chapter_two}",
                is_error=False,
            )
        ]
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=refuse_confidence,
        verify_action=refuse_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = (
        ephemeral_messages.append
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    todos = [
        {
            "content": "Create the guide files",
            "active_form": "Working on: Create the guide files",
            "status": "completed",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, todos)

    await runner.execute_batch(
        tool_calls=[final_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment must appear in at least one persistent message.
    expected_fragments = (
        "All explicitly planned artifacts now exist.",
        "Ensure all files are properly linked and formatted consistently",
        "Move to verification once no specific mismatch remains.",
    )
    for fragment in expected_fragments:
        assert any(fragment in queued for queued in persistent_messages)
2695
2696
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Check that the post-write handoff names the next missing planned file.

    Scenario: ``index.html`` exists, two planned chapters do not, three todos
    are pending, and the scripted tool call is a successful ``write`` of
    ``index.html``.
    Expectation: the last persistent message points at
    ``01-getting-started.html`` with its resolved path, does not ask for a
    ``TodoWrite`` refresh, and discourages rereading.
    """

    async def refuse_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def refuse_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")
    # The two chapters are planned but deliberately left uncreated.
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    index_write = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=index_write,
                output=f"Successfully wrote {index_path}",
                is_error=False,
            )
        ]
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=refuse_confidence,
        verify_action=refuse_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = (
        ephemeral_messages.append
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    todo_contents = (
        "Create the main index.html file with proper structure",
        "Create each chapter file in sequence, following the established pattern",
        "Ensure all files are properly linked and formatted consistently",
    )
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": item,
                "active_form": f"Working on: {item}",
                "status": "pending",
            }
            for item in todo_contents
        ],
    )

    await runner.execute_batch(
        tool_calls=[index_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    handoff = persistent_messages[-1]
    write_hint = (
        "Prefer one `write(file_path=..., content=...)` call for "
        f"`{chapter_one.resolve(strict=False)}` now."
    )
    assert "Next step: create `01-getting-started.html`." in handoff
    assert write_hint in handoff
    assert "refresh `TodoWrite`" not in handoff
    assert (
        "Do not reread reference material or spend the next turn on bookkeeping."
        in handoff
    )
2808
2809
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """Check that a partly finished large plan never reports completion.

    Scenario: the plan lists two directories, ``index.html`` and seven
    chapters; the index and chapters 01-05 exist, chapters 06 and 07 do not,
    and the scripted tool call is a successful ``write`` of chapter 05.
    Expectation: an ephemeral message tells the model to resume with
    ``06-performance-tuning.html``, and no steering message on either channel
    claims that all planned artifacts exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    # Chapters 01-05 exist on disk; 06 and 07 stay missing on purpose.
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Resume by creating `06-performance-tuning.html` now." in message
        for message in ephemeral_messages
    )
    # Inspect both channels: the verification-handoff test in this file
    # expects the completion statement on the persistent queue, so looking at
    # the ephemeral queue alone would let a premature claim go unnoticed.
    assert not any(
        "All explicitly planned artifacts now exist." in message
        for message in (*persistent_messages, *ephemeral_messages)
    )
2936
2937
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """Check the wording of the nudge when most planned files already exist.

    Scenario: the index and chapters 01-04 exist and are recorded as touched,
    two todos are already completed, chapter 05 is missing, and the scripted
    tool call is a successful rewrite of chapter 04.
    Expectation: the last ephemeral message tells the model to resume with
    ``05-advanced-features.html``, forbids TodoWrite, verification and rereads
    until then, and does not ask for a ``TodoWrite`` refresh.
    """

    async def refuse_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def refuse_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_paths = [
        chapters / "01-introduction.html",
        chapters / "02-installation.html",
        chapters / "03-configuration.html",
        chapters / "04-basic-usage.html",
        chapters / "05-advanced-features.html",
    ]
    # Everything except the fifth chapter is already on disk.
    existing_files = (index_path, *chapter_paths[:4])
    for existing in existing_files:
        existing.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
    ]
    plan_lines.extend(f"- `{planned}`" for planned in chapter_paths)
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    fourth_chapter = chapter_paths[3]
    rewrite_call = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(fourth_chapter),
            "content": "<html>updated</html>\n",
        },
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=rewrite_call,
                output=f"Successfully wrote {fourth_chapter}",
                is_error=False,
            )
        ]
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=refuse_confidence,
        verify_action=refuse_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = (
        ephemeral_messages.append
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Record the prior progress before the remaining todo is synced.
    dod.touched_files.extend([str(existing) for existing in existing_files])
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    remaining_todo = {
        "content": "Create each chapter file with appropriate content",
        "active_form": "Creating each chapter file with appropriate content",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [remaining_todo])

    await runner.execute_batch(
        tool_calls=[rewrite_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert ephemeral_messages
    nudge = ephemeral_messages[-1]
    assert "Resume by creating `05-advanced-features.html` now." in nudge
    assert (
        "No TodoWrite, no verification, no rereads until that artifact exists."
        in nudge
    )
    assert "refresh `TodoWrite`" not in nudge
3058
3059
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch with a planned chapter still absent queues a persistent nudge.

    The implementation plan lists two chapter files but only the first one is
    written to disk. After the TodoWrite batch runs, the last persistent
    steering message must name ``02-installation.html`` as the resume step and
    nothing may have been queued on the ephemeral channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    def current_todos() -> list[dict[str, str]]:
        # Built fresh on every call so the DoD sync, the tool arguments, and
        # the tool metadata never share mutable objects.
        return [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ]

    # On-disk layout: index and chapter one exist, chapter two does not.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_file.write_text("<html></html>\n")
    existing_chapter = chapters_dir / "01-getting-started.html"
    missing_chapter = chapters_dir / "02-installation.html"
    existing_chapter.write_text("<h1>One</h1>\n")

    declared_targets = [
        f"{nginx_root}/",
        f"{chapters_dir}/",
        str(index_file),
        str(existing_chapter),
        str(missing_chapter),
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_targets]
            + [""]
        )
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(definition, current_todos())
    definition.touched_files.extend([str(path) for path in (index_file, existing_chapter)])

    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": current_todos()},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": current_todos()},
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    message = persistent_messages[-1]
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "refresh `TodoWrite`",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert fragment in message
    assert not ephemeral_messages
3202
3203
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch with every planned file on disk queues a verification handoff.

    Index and both chapters are written before the batch runs. The last queued
    steering message must point at the verification todo and must not mention
    the Fortran-guide discovery todo.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    discovery_step = (
        "First, examine the existing Fortran guide structure to understand "
        "the format and content organization"
    )
    verification_step = "Verify all guide files are linked and complete"

    def current_todos() -> list[dict[str, str]]:
        # Fresh dicts per call; each todo's active form is the content
        # prefixed with "Working on: ".
        return [
            {
                "content": step,
                "active_form": f"Working on: {step}",
                "status": "pending",
            }
            for step in (discovery_step, verification_step)
        ]

    # On-disk layout: every file named in the plan already exists.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<h1>One</h1>\n")
    second_chapter.write_text("<h1>Two</h1>\n")

    declared_targets = [
        f"{nginx_root}/",
        f"{chapters_dir}/",
        str(index_file),
        str(first_chapter),
        str(second_chapter),
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_targets]
            + [""]
        )
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    definition.verification_commands = [f"ls -la {nginx_root}"]
    sync_todos_to_definition_of_done(
        definition,
        current_todos(),
        project_root=temp_dir,
    )

    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": current_todos()},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": current_todos()},
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for fragment in (
        "Todo tracking is updated. All explicitly planned artifacts now exist.",
        "Verify all guide files are linked and complete",
        "Move to verification once no specific mismatch remains.",
        "reopen reference materials",
    ):
        assert fragment in message
    assert "Fortran guide structure" not in message
3346
3347
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch names the chapter that the index links to but is not on disk.

    The plan declares only the guide root, the ``chapters/`` directory and the
    index file. The index links to ``chapters/01-introduction.html``, which is
    never written. The last queued steering message must name that file as the
    resume step alongside the pending "Write the introduction chapter" todo.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    def current_todos() -> list[dict[str, str]]:
        # Fresh dicts per call: two completed setup steps, one pending write.
        return [
            {
                "content": "Examine the existing Fortran guide structure",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "completed",
            },
            {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            },
        ]

    # On-disk layout: directories plus an index that links to a chapter file
    # which does not exist yet.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_lines = [
        "<!DOCTYPE html>",
        "<html>",
        "<body>",
        '<a href="chapters/01-introduction.html">Introduction</a>',
        "</body>",
        "</html>",
        "",
    ]
    index_file.write_text("\n".join(index_lines))

    declared_targets = [f"{nginx_root}/", f"{chapters_dir}/", str(index_file)]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_targets]
            + [""]
        )
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    definition.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(
        definition,
        current_todos(),
        project_root=temp_dir,
    )

    todo_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={"todos": current_todos()},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": current_todos()},
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Write the introduction chapter`.",
        "Resume by creating `01-introduction.html` now.",
        "It is the next missing declared output under `chapters/`.",
        "Prefer one `write` call for `",
        "01-introduction.html` instead of more rereads.",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert fragment in message
3513
3514
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch names the first index-linked chapter that is not on disk.

    The index links to ``chapters/introduction.html`` and
    ``chapters/installation.html``; neither file is written. The last queued
    steering message must name ``introduction.html`` as the resume step.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    pending_step = "Write the introduction chapter"

    # On-disk layout: directories plus an index linking to two absent chapters.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_lines = [
        "<html>",
        '<a href="chapters/introduction.html">Introduction</a>',
        '<a href="chapters/installation.html">Installation</a>',
        "</html>",
    ]
    index_file.write_text("\n".join(index_lines) + "\n")

    declared_targets = [f"{nginx_root}/", f"{chapters_dir}/", str(index_file)]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_targets]
            + [""]
        )
    )

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    definition.pending_items = [pending_step, "Complete the requested work"]
    definition.touched_files.append(str(index_file))

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The call arguments use the camelCase "activeForm" key while the
    # executor metadata uses snake_case "active_form", exactly as scripted.
    todo_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": pending_step,
                    "activeForm": "Writing the introduction chapter",
                    "status": "pending",
                }
            ]
        },
    )
    recorded_todos = [
        {
            "content": pending_step,
            "active_form": "Writing the introduction chapter",
            "status": "pending",
        }
    ]
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": recorded_todos},
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Write the introduction chapter`.",
        "Resume by creating `introduction.html` now.",
        "It is the next missing declared output under `chapters/`.",
        "Prefer one `write` call for `",
        "introduction.html` instead of more rereads.",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert fragment in message
3642
3643
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch resolves a prose todo to the concrete chapter file still absent.

    Chapter one exists on disk; the index additionally links to
    ``chapters/02-installation.html``, which does not. The pending todo is
    phrased as "Creating Chapter 2: Installation and Setup". The last queued
    steering message must name ``02-installation.html`` and include its
    resolved path in the ``write`` hint.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    pending_step = "Creating Chapter 2: Installation and Setup"

    # On-disk layout: index and chapter one exist; chapter two is only linked.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    index_lines = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    ]
    index_file.write_text("\n".join(index_lines) + "\n")
    first_chapter.write_text("<html></html>\n")

    declared_targets = [f"{nginx_root}/", f"{chapters_dir}/", str(index_file)]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_targets]
            + [""]
        )
    )

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    definition.pending_items = [pending_step, "Complete the requested work"]
    definition.touched_files.extend([str(path) for path in (index_file, first_chapter)])

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The call arguments use the camelCase "activeForm" key while the
    # executor metadata uses snake_case "active_form", exactly as scripted.
    todo_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": pending_step,
                    "activeForm": pending_step,
                    "status": "pending",
                }
            ]
        },
    )
    recorded_todos = [
        {
            "content": pending_step,
            "active_form": pending_step,
            "status": "pending",
        }
    ]
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": recorded_todos},
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    expected_target = (chapters_dir / "02-installation.html").resolve(strict=False)
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Creating Chapter 2: Installation and Setup`.",
        "Resume by creating `02-installation.html` now.",
        f"Prefer one `write` call for `{expected_target}` instead of more rereads.",
        "Make your next response the concrete mutation tool call itself",
    ):
        assert fragment in message
3775
3776
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch borrows a filename from a previously read sibling ``chapters/`` dir.

    The session history contains a ``read`` of
    ``fortran/chapters/01-introduction.html``. The nginx ``chapters/``
    directory is empty and the nginx index has no links. The last queued
    steering message must propose ``01-introduction.html`` and say it mirrors
    the observed filename pattern.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    pending_step = "Write the introduction chapter"

    # Reference guide that the scripted history says was already read.
    fortran_chapters = temp_dir / "fortran" / "chapters"
    fortran_chapters.mkdir(parents=True)
    reference_chapter = fortran_chapters / "01-introduction.html"
    reference_chapter.write_text("<h1>Introduction</h1>\n")

    # Target guide: directories and a link-free index only.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_file.write_text("<html></html>\n")

    declared_targets = [f"{nginx_root}/", f"{chapters_dir}/", str(index_file)]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_targets]
            + [""]
        )
    )

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    definition.pending_items = [pending_step, "Complete the requested work"]
    definition.touched_files.append(str(index_file))

    queued_messages: list[str] = []
    prior_read = Message(
        role=Role.ASSISTANT,
        content="",
        tool_calls=[
            ToolCall(
                id="read-ref-1",
                name="read",
                arguments={"file_path": str(reference_chapter)},
            )
        ],
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[prior_read],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The call arguments use the camelCase "activeForm" key while the
    # executor metadata uses snake_case "active_form", exactly as scripted.
    todo_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": pending_step,
                    "activeForm": "Writing the introduction chapter",
                    "status": "pending",
                }
            ]
        },
    )
    recorded_todos = [
        {
            "content": pending_step,
            "active_form": "Writing the introduction chapter",
            "status": "pending",
        }
    ]
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": recorded_todos},
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Write the introduction chapter`.",
        "Resume by creating `01-introduction.html` now.",
        "It mirrors the observed filename pattern from another `chapters/` directory "
        "you already inspected.",
        "01-introduction.html` instead of more rereads.",
    ):
        assert fragment in message
3912
3913
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """A working-note-only batch with a planned chapter still absent queues a resume nudge.

    The plan lists two chapters but only the first is written. The batch is a
    single ``notepad_write_working`` call. The last queued steering message
    must use the bookkeeping-note wording and name ``02-installation.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    # On-disk layout: index and chapter one exist, chapter two does not.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    existing_chapter = chapters_dir / "01-getting-started.html"
    missing_chapter = chapters_dir / "02-installation.html"
    index_file.write_text("<html></html>\n")
    existing_chapter.write_text("<h1>One</h1>\n")

    declared_targets = [
        f"{nginx_root}/",
        f"{chapters_dir}/",
        str(index_file),
        str(existing_chapter),
        str(missing_chapter),
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_targets]
            + [""]
        )
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    seeded_todos = [
        {
            "content": "Create 01-getting-started.html",
            "active_form": "Creating 01-getting-started.html",
            "status": "completed",
        },
        {
            "content": "Create 02-installation.html",
            "active_form": "Creating 02-installation.html",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(
        definition,
        seeded_todos,
        project_root=temp_dir,
    )
    definition.touched_files.extend([str(path) for path in (index_file, existing_chapter)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for fragment in (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "Make your next response the concrete mutation tool call itself",
        "refresh `TodoWrite`",
        "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
    ):
        assert fragment in message
4028
4029
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """A working-note-only batch keeps steering toward a leading discovery todo.

    Nothing has been created on disk and the first pending item is an
    "examine the existing fortran guide" step. The last queued steering
    message must continue with that item, ask for an evidence-gathering tool
    call, and must not tell the model to create ``index.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    discovery_step = (
        "First, examine the existing fortran guide structure and content "
        "to understand the format"
    )

    # The plan names targets under guides/nginx, none of which are created.
    nginx_root = temp_dir / "guides" / "nginx"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{nginx_root / 'chapters'}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    definition.pending_items.extend(
        [
            discovery_step,
            "Create the nginx directory structure",
            "Develop the main index.html file for the nginx guide",
        ]
    )

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert f"Continue with the next pending item: `{discovery_step}`." in message
    assert "one concrete evidence-gathering tool call" in message
    assert "Resume by creating `index.html` now." not in message
4122
4123
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """A working-note-only batch steers to the absent chapter, not a leftover discovery todo.

    The index and first chapter exist and are recorded as touched; the index
    also links to chapters two and three, which are not on disk. A discovery
    step still sits first in the pending items. The last queued steering
    message must name ``02-installation.html`` and must not continue with the
    discovery step.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Trip-wire: the test fails loudly if the runner asks for confidence.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Trip-wire: the test fails loudly if the runner asks for verification.
        raise AssertionError("Verification should not run in this scenario")

    # On-disk layout: index (no trailing newline) and chapter one only.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_location = nginx_root / "chapters"
    chapters_location.mkdir(parents=True)
    index_file = nginx_root / "index.html"
    written_chapter = chapters_location / "01-introduction.html"
    index_links = [
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-configuration.html">Configuration</a>',
    ]
    index_file.write_text("\n".join(index_links))
    written_chapter.write_text("<h1>Introduction</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{chapters_location}/`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_file)
    definition.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create chapter files following the established pattern",
        ]
    )
    definition.touched_files.extend([str(path) for path in (index_file, written_chapter)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for fragment in (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
    ):
        assert fragment in message
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure"
        not in message
    )
4229
4230
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """A glob that only lists directories must not queue a steering message.

    The fortran reference tree contains just an empty ``chapters`` directory,
    and the scripted glob result lists only those two directory paths. After
    the batch runs, the steering queue is expected to stay empty.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    fortran_root = temp_dir / "Loader" / "guides" / "fortran"
    chapters_dir = fortran_root / "chapters"
    chapters_dir.mkdir(parents=True)

    # The plan targets the (not yet created) nginx guide.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{temp_dir / 'Loader' / 'guides' / 'nginx' / 'index.html'}`",
        f"- `{temp_dir / 'Loader' / 'guides' / 'nginx' / 'chapters'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.extend(
        (
            "First, examine the existing fortran guide structure and content",
            "Create the nginx directory structure",
            "Develop the main index.html file for nginx guide",
        )
    )

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(fortran_root)},
    )
    glob_outcome = tool_outcome(
        tool_call=glob_call,
        output=f"{fortran_root}\n{chapters_dir}",
        is_error=False,
    )
    executor = FakeExecutor([glob_outcome])

    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log == []
4320
4321
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """A blocked no-op edit on an already-correct TOC queues no steering message.

    ``index.html`` already links to the two chapter files on disk with their
    real headings. The scripted edit uses identical ``old_string`` and
    ``new_string`` and comes back as BLOCKED; the steering queue must remain
    empty afterwards.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )

    # Chapter files whose headings match the link texts used in the index.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapters / file_name).write_text(heading)

    current_block = (
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(current_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The edit replaces the block with itself, so nothing would change on disk.
    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": current_block,
            "new_string": current_block,
        },
    )
    blocked_outcome = tool_outcome(
        tool_call=noop_edit,
        output=(
            "[Blocked - old_string and new_string are identical - no change "
            "would occur] Suggestion: Provide different old and new strings"
        ),
        is_error=True,
        state=ToolExecutionState.BLOCKED,
    )
    executor = FakeExecutor([blocked_outcome])

    await runner.execute_batch(
        tool_calls=[noop_edit],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log == []
4415
4416
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """The blocked no-op edit nudge should keep pointing at the repair target.

    The session holds an assistant "Repair focus" message naming the chapter
    file to fix. ``_queue_blocked_html_edit_nudge`` is called directly with a
    no-op edit on that same file, and the queued text is checked.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
    repair_brief = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '05-advanced-topics.html'}`; otherwise remove or replace `05-advanced-topics.html`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_brief],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    runner._queue_blocked_html_edit_nudge(
        noop_edit,
        "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings",
    )

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        str(repair_target),
        "no on-disk change",
        "replace the surrounding block",
        "Do not reopen unrelated reference materials",
    ):
        assert expected_fragment in first_nudge
4475
4476
async def _noop_emit(event: AgentEvent) -> None:
    """Discard *event*; used as the ``emit`` callback when events are irrelevant."""
4479
4480
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``write`` should put the definition of done into "planned".

    After the batch runs, the test checks that verification commands exist,
    that "Collect verification evidence" is pending, that attempt 1 is active,
    and that the last workflow-timeline entry records the planned state.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")

    # Emitted events are never inspected here, so the shared no-op emitter is
    # used instead of collecting them into an unused list.
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1

    last_entry = summary.workflow_timeline[-1]
    assert last_entry.reason_code == "verification_planned"
    assert last_entry.policy_outcome == "planned"

    observation = last_entry.verification_observations[0]
    assert observation.status == "planned"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
4552
4553
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """A bare ``mkdir -p`` for a planned directory must not plan verification.

    The implementation plan declares the nginx ``chapters/`` directory and
    ``index.html``. The only tool call is a bash ``mkdir -p`` of that
    directory; afterwards no verification state, pending evidence item, or
    ``verification_planned`` timeline entry may exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
        )
    )

    tool_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(implementation_plan)

    # Emitted events are never inspected here, so the shared no-op emitter is
    # used instead of collecting them into an unused list.
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert not any(
        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
    )
4632
4633
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A new ``write`` after a passed verification should mark it stale.

    The definition of done starts with a passed attempt 1 and one piece of
    evidence. After the write, the test expects the result to be "stale", the
    evidence cleared, the evidence item moved back to pending, attempt 2
    active, and a ``verification_stale`` timeline entry for attempt 1.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")

    # Seed a definition of done whose first verification attempt already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    dod.evidence = [
        VerificationEvidence(
            command="uv run pytest -q",
            passed=True,
            stdout="401 passed",
            kind="test",
        )
    ]
    dod.completed_items.append("Collect verification evidence")

    # Emitted events are never inspected here, so the shared no-op emitter is
    # used instead of collecting them into an unused list.
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2

    last_entry = summary.workflow_timeline[-1]
    assert last_entry.reason_code == "verification_stale"
    assert last_entry.policy_outcome == "stale"

    # The observation describes the old attempt and links it to the new one.
    observation = last_entry.verification_observations[0]
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == "uv run pytest -q"
4728
4729
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """The active-repair nudge should name both paths from the repair focus.

    The session holds a "Repair focus" message naming ``index.html`` as the
    file to edit and a chapter file as the possible creation target. The nudge
    queued by ``_queue_blocked_active_repair_nudge`` must mention both.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "index.html"
    repair_brief = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '01-getting-started.html'}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_brief],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        str(repair_target),
        str(temp_dir / "guide" / "chapters" / "01-getting-started.html"),
        "Do not reopen unrelated reference materials",
    ):
        assert expected_fragment in first_nudge
4776
4777
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """The repair-mutation nudge should list the paths named in the repair focus.

    The "Repair focus" message names a chapter file to edit and a stylesheet
    that may be created. The nudge queued by
    ``_queue_blocked_active_repair_mutation_nudge`` must mention both paths.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html"
    stylesheet = temp_dir / "guide" / "styles.css"
    repair_brief = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_brief],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        str(repair_target),
        str(stylesheet),
        "before widening the change set",
    ):
        assert expected_fragment in first_nudge
4827
4828
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """The late-reference-drift nudge should name the one artifact not on disk.

    The plan lists four guide files; three are written to disk and
    ``03-first-website.html`` is left out. The queued nudge must mention that
    missing file.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")

    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# File Changes\n"
        "- `guide/index.html`\n"
        "- `guide/chapters/01-getting-started.html`\n"
        "- `guide/chapters/02-installation.html`\n"
        "- `guide/chapters/03-first-website.html`\n"
    )
    dod.implementation_plan = str(plan_path)

    # Create every planned artifact except the third chapter.
    guide_dir = temp_dir / "guide"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    for artifact, body in (
        (guide_dir / "index.html", "index"),
        (chapters_dir / "01-getting-started.html", "one"),
        (chapters_dir / "02-installation.html", "two"),
    ):
        artifact.write_text(body)
    runner = ToolBatchRunner(context, store)

    runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    assert "03-first-website.html" in first_nudge
    assert "older reference materials" in first_nudge
4881
4882
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """With every planned artifact on disk, the nudge should name the verify todo.

    The plan lists the guide directory, its ``chapters`` directory and three
    files, all of which exist. One pending todo about verifying the guide is
    synced into the definition of done, and the queued nudge must mention it.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    for artifact, body in (
        (index_path, "index"),
        (chapter_one, "one"),
        (chapter_two, "two"),
    ):
        artifact.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    verify_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(
        dod,
        [verify_todo],
        project_root=temp_dir,
    )

    runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist.",
        "Verify all guide files are linked and complete",
        "Do not reopen earlier reference materials.",
    ):
        assert expected_fragment in first_nudge
4963
4964
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """The declared-target nudge should quote the closest declared target.

    ``_queue_blocked_html_declared_target_nudge`` is called with a ``write``
    for chapter 1 and a blocked message that lists
    ``chapters/02-installation.html`` as the closest declared local target.
    The queued nudge must name the written file and that target.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    chapter_path = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": chapter_path},
    )
    blocked_text = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
        "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
        "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
    )
    runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_text)

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        chapter_path,
        "`chapters/02-installation.html`",
        "same file now",
    ):
        assert expected_fragment in first_nudge
5013
5014
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """A ``write`` blocked for an empty ``file_path`` should be steered to chapter 2.

    The plan declares the index and two chapters; the index and chapter 1 are
    on disk while chapter 2 is not. The queued nudge must name
    ``02-installation.html`` and the blocked message must be recorded as the
    latest recovery attempt error.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # On-disk fixture: index and chapter 1 exist, chapter 2 is never written.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    # The plan declares all three files as outputs.
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # The write call carries an empty file_path, matching the blocked message below.
    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    # The outcome is built by hand here rather than through tool_outcome().
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.BLOCKED,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=blocked_message,
                    result_content=blocked_message,
                    is_error=True,
                ),
                event_content=blocked_message,
                is_error=True,
                result_output=blocked_message,
            )
        ]
    )
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The nudge must name the missing chapter and recommend a single write.
    assert queued
    assert "did not provide a valid `file_path`" in queued[0]
    assert "Resume by creating `02-installation.html` now." in queued[0]
    assert (
        f"Prefer one `write` call for `{chapter_two}` instead of more rereads."
        in queued[0]
    )
    # The blocked message must be the error of the latest recovery attempt.
    assert context.recovery_context is not None
    assert context.recovery_context.attempts[-1].error == blocked_message
5119 assert context.recovery_context.attempts[-1].error == blocked_message